diff options
Diffstat (limited to 'generic/tclUtf.c')
| -rw-r--r-- | generic/tclUtf.c | 1549 | 
1 files changed, 1083 insertions, 466 deletions
| diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 0c46d26..15529c7 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -5,10 +5,8 @@   *   * Copyright (c) 1997-1998 Sun Microsystems, Inc.   * - * See the file "license.terms" for information on usage and redistribution - * of this file, and for a DISCLAIMER OF ALL WARRANTIES. - * - * RCS: @(#) $Id: tclUtf.c,v 1.4 1999/04/30 16:22:24 hershey Exp $ + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES.   */  #include "tclInt.h" @@ -20,34 +18,48 @@  #include "tclUniData.c"  /* - * The following macros are used for fast character category tests.  The - * x_BITS values are shifted right by the category value to determine whether - * the given category is included in the set. - */  + * The following macros are used for fast character category tests. The x_BITS + * values are shifted right by the category value to determine whether the + * given category is included in the set. 
+ */  #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ -    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER)) +	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) + +#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))  #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)  #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ -    | (1 << PARAGRAPH_SEPARATOR)) +	| (1 << PARAGRAPH_SEPARATOR)) + +#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION)) -#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) +#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ +	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ +	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ +	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) + +#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \ +	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ +	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ +	(1 << OTHER_NUMBER) | \ +	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ +	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))  /* - * Unicode characters less than this value are represented by themselves  - * in UTF-8 strings.  + * Unicode characters less than this value are represented by themselves in + * UTF-8 strings.   */  #define UNICODE_SELF	0x80  /* - * The following structures are used when mapping between Unicode (UCS-2) - * and UTF-8. + * The following structures are used when mapping between Unicode (UCS-2) and + * UTF-8.   */ -  -CONST unsigned char totalBytes[256] = { + +static const unsigned char totalBytes[256] = {      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -74,19 +86,17 @@ CONST unsigned char totalBytes[256] = {  };  /* - * Procedures used only in this module. 
+ * Functions used only in this module.   */ -static int UtfCount _ANSI_ARGS_((int ch)); - +static int		UtfCount(int ch);  /*   *---------------------------------------------------------------------------   * - * Tcl_UniCharToUtf -- + * UtfCount --   * - *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the - *	Utf character "ch". + *	Find the number of bytes in the Utf character "ch".   *   * Results:   *	The return values is the number of bytes in the Utf character "ch". @@ -96,10 +106,10 @@ static int UtfCount _ANSI_ARGS_((int ch));   *   *---------------------------------------------------------------------------   */ -  -static int -UtfCount(ch) -    int ch;			/* The Tcl_UniChar whose size is returned. */ + +INLINE static int +UtfCount( +    int ch)			/* The Tcl_UniChar whose size is returned. */  {      if ((ch > 0) && (ch < UNICODE_SELF)) {  	return 1; @@ -130,70 +140,72 @@ UtfCount(ch)   * Tcl_UniCharToUtf --   *   *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the - *	provided buffer.  Equivalent to Plan 9 runetochar(). + *	provided buffer. Equivalent to Plan 9 runetochar().   *   * Results: - *	The return values is the number of bytes in the buffer that - *	were consumed.   + *	The return values is the number of bytes in the buffer that were + *	consumed.   *   * Side effects:   *	None.   *   *---------------------------------------------------------------------------   */ -  +  INLINE int -Tcl_UniCharToUtf(ch, str) -    int ch;			/* The Tcl_UniChar to be stored in the +Tcl_UniCharToUtf( +    int ch,			/* The Tcl_UniChar to be stored in the  				 * buffer. */ -    char *str;			/* Buffer in which the UTF-8 representation -				 * of the Tcl_UniChar is stored.  Buffer must -				 * be large enough to hold the UTF-8 character +    char *buf)			/* Buffer in which the UTF-8 representation of +				 * the Tcl_UniChar is stored. Buffer must be +				 * large enough to hold the UTF-8 character  				 * (at most TCL_UTF_MAX bytes). 
*/  {      if ((ch > 0) && (ch < UNICODE_SELF)) { -	str[0] = (char) ch; +	buf[0] = (char) ch;  	return 1;      } -    if (ch <= 0x7FF) { -	str[1] = (char) ((ch | 0x80) & 0xBF); -	str[0] = (char) ((ch >> 6) | 0xC0); -	return 2; -    } -    if (ch <= 0xFFFF) { +    if (ch >= 0) { +	if (ch <= 0x7FF) { +	    buf[1] = (char) ((ch | 0x80) & 0xBF); +	    buf[0] = (char) ((ch >> 6) | 0xC0); +	    return 2; +	} +	if (ch <= 0xFFFF) {  	three: -	str[2] = (char) ((ch | 0x80) & 0xBF); -	str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); -	str[0] = (char) ((ch >> 12) | 0xE0); -	return 3; -    } +	    buf[2] = (char) ((ch | 0x80) & 0xBF); +	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); +	    buf[0] = (char) ((ch >> 12) | 0xE0); +	    return 3; +	}  #if TCL_UTF_MAX > 3 -    if (ch <= 0x1FFFFF) { -	str[3] = (char) ((ch | 0x80) & 0xBF); -	str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); -	str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); -	str[0] = (char) ((ch >> 18) | 0xF0); -	return 4; -    } -    if (ch <= 0x3FFFFFF) { -	str[4] = (char) ((ch | 0x80) & 0xBF); -	str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); -	str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); -	str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); -	str[0] = (char) ((ch >> 24) | 0xF8); -	return 5; -    } -    if (ch <= 0x7FFFFFFF) { -	str[5] = (char) ((ch | 0x80) & 0xBF); -	str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); -	str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); -	str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); -	str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); -	str[0] = (char) ((ch >> 30) | 0xFC); -	return 6; -    } +	if (ch <= 0x1FFFFF) { +	    buf[3] = (char) ((ch | 0x80) & 0xBF); +	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); +	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); +	    buf[0] = (char) ((ch >> 18) | 0xF0); +	    return 4; +	} +	if (ch <= 0x3FFFFFF) { +	    buf[4] = (char) ((ch | 0x80) & 0xBF); +	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); +	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); +	    buf[1] = (char) 
(((ch >> 18) | 0x80) & 0xBF); +	    buf[0] = (char) ((ch >> 24) | 0xF8); +	    return 5; +	} +	if (ch <= 0x7FFFFFFF) { +	    buf[5] = (char) ((ch | 0x80) & 0xBF); +	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); +	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); +	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); +	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); +	    buf[0] = (char) ((ch >> 30) | 0xFC); +	    return 6; +	}  #endif +    }      ch = 0xFFFD;      goto three; @@ -208,25 +220,24 @@ Tcl_UniCharToUtf(ch, str)   *   * Results:   *	The return value is a pointer to the UTF-8 representation of the - *	Unicode string.  Storage for the return value is appended to the - *	end of dsPtr. + *	Unicode string. Storage for the return value is appended to the end of + *	dsPtr.   *   * Side effects:   *	None.   *   *---------------------------------------------------------------------------   */ -  +  char * -Tcl_UniCharToUtfDString(wString, numChars, dsPtr) -    CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */ -    int numChars;		/* Length of Unicode string in Tcl_UniChars +Tcl_UniCharToUtfDString( +    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */ +    int uniLength,		/* Length of Unicode string in Tcl_UniChars  				 * (must be >= 0). */ -    Tcl_DString *dsPtr;		/* UTF-8 representation of string is -				 * appended to this previously initialized -				 * DString. */ +    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended +				 * to this previously initialized DString. 
*/  { -    CONST Tcl_UniChar *w, *wEnd; +    const Tcl_UniChar *w, *wEnd;      char *p, *string;      int oldLength; @@ -236,12 +247,12 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)       */      oldLength = Tcl_DStringLength(dsPtr); -    Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); +    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);      string = Tcl_DStringValue(dsPtr) + oldLength;      p = string; -    wEnd = wString + numChars; -    for (w = wString; w < wEnd; ) { +    wEnd = uniStr + uniLength; +    for (w = uniStr; w < wEnd; ) {  	p += Tcl_UniCharToUtf(*w, p);  	w++;      } @@ -255,16 +266,16 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)   *   * Tcl_UtfToUniChar --   * - *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad - *	UTF-8 sequences are converted to valid Tcl_UniChars and processing - *	continues.  Equivalent to Plan 9 chartorune(). + *	Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 + *	sequences are converted to valid Tcl_UniChars and processing + *	continues. Equivalent to Plan 9 chartorune().   * - *	The caller must ensure that the source buffer is long enough that - *	this routine does not run off the end and dereference non-existent - *	memory looking for trail bytes.  If the source buffer is known to - *	be '\0' terminated, this cannot happen.  Otherwise, the caller - *	should call Tcl_UtfCharComplete() before calling this routine to - *	ensure that enough bytes remain in the string. + *	The caller must ensure that the source buffer is long enough that this + *	routine does not run off the end and dereference non-existent memory + *	looking for trail bytes. If the source buffer is known to be '\0' + *	terminated, this cannot happen. Otherwise, the caller should call + *	Tcl_UtfCharComplete() before calling this routine to ensure that + *	enough bytes remain in the string.   
*   * Results:   *	*chPtr is filled with the Tcl_UniChar, and the return value is the @@ -275,55 +286,57 @@ Tcl_UniCharToUtfDString(wString, numChars, dsPtr)   *   *---------------------------------------------------------------------------   */ -  +  int -Tcl_UtfToUniChar(str, chPtr) -    register CONST char *str;	 /* The UTF-8 string. */ -    register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented -				  * by the UTF-8 string. */ +Tcl_UtfToUniChar( +    register const char *src,	/* The UTF-8 string. */ +    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by +				 * the UTF-8 string. */  {      register int byte; -     +      /*       * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.       */ -    byte = *((unsigned char *) str); +    byte = *((unsigned char *) src);      if (byte < 0xC0) {  	/*  	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.  	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid  	 * characters representing themselves.  	 */ -	  +  	*chPtr = (Tcl_UniChar) byte;  	return 1;      } else if (byte < 0xE0) { -	if ((str[1] & 0xC0) == 0x80) { +	if ((src[1] & 0xC0) == 0x80) {  	    /*  	     * Two-byte-character lead-byte followed by a trail-byte.  	     */ -	      -	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); + +	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));  	    return 2;  	} +  	/*  	 * A two-byte-character lead-byte not followed by trail-byte  	 * represents itself.  	 */ -	  +  	*chPtr = (Tcl_UniChar) byte;  	return 1;      } else if (byte < 0xF0) { -	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { +	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {  	    /*  	     * Three-byte-character lead byte followed by two trail bytes.  	     
*/ -	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)  -		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); +	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) +		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));  	    return 3;  	} +  	/*  	 * A three-byte-character lead-byte not followed by two trail-bytes  	 * represents itself. @@ -333,7 +346,7 @@ Tcl_UtfToUniChar(str, chPtr)  	return 1;      }  #if TCL_UTF_MAX > 3 -    else { +    {  	int ch, total, trail;  	total = totalBytes[byte]; @@ -341,13 +354,13 @@ Tcl_UtfToUniChar(str, chPtr)  	if (trail > 0) {  	    ch = byte & (0x3F >> trail);  	    do { -		str++; -		if ((*str & 0xC0) != 0x80) { +		src++; +		if ((*src & 0xC0) != 0x80) {  		    *chPtr = byte;  		    return 1;  		}  		ch <<= 6; -		ch |= (*str & 0x3F); +		ch |= (*src & 0x3F);  		trail--;  	    } while (trail > 0);  	    *chPtr = ch; @@ -369,9 +382,8 @@ Tcl_UtfToUniChar(str, chPtr)   *   * Results:   *	The return value is a pointer to the Unicode representation of the - *	UTF-8 string.  Storage for the return value is appended to the - *	end of dsPtr.  The Unicode string is terminated with a Unicode - *	NULL character. + *	UTF-8 string. Storage for the return value is appended to the end of + *	dsPtr. The Unicode string is terminated with a Unicode NULL character.   *   * Side effects:   *	None. @@ -380,36 +392,37 @@ Tcl_UtfToUniChar(str, chPtr)   */  Tcl_UniChar * -Tcl_UtfToUniCharDString(string, length, dsPtr) -    CONST char *string;		/* UTF-8 string to convert to Unicode. */ -    int length;			/* Length of UTF-8 string in bytes, or -1 -				 * for strlen(). */ -    Tcl_DString *dsPtr;		/* Unicode representation of string is +Tcl_UtfToUniCharDString( +    const char *src,		/* UTF-8 string to convert to Unicode. */ +    int length,			/* Length of UTF-8 string in bytes, or -1 for +				 * strlen(). */ +    Tcl_DString *dsPtr)		/* Unicode representation of string is  				 * appended to this previously initialized  				 * DString. 
*/  {      Tcl_UniChar *w, *wString; -    CONST char *p, *end; +    const char *p, *end;      int oldLength;      if (length < 0) { -	length = strlen(string); +	length = strlen(src);      }      /* -     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length -     * in bytes. +     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in +     * bytes.       */      oldLength = Tcl_DStringLength(dsPtr); +/* TODO: fix overreach! */      Tcl_DStringSetLength(dsPtr,  	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));      wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);      w = wString; -    end = string + length; -    for (p = string; p < end; ) { -	p += Tcl_UtfToUniChar(p, w); +    end = src + length; +    for (p = src; p < end; ) { +	p += TclUtfToUniChar(p, w);  	w++;      }      *w = '\0'; @@ -424,9 +437,9 @@ Tcl_UtfToUniCharDString(string, length, dsPtr)   *   * Tcl_UtfCharComplete --   * - *	Determine if the UTF-8 string of the given length is long enough - *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the - *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune(). + *	Determine if the UTF-8 string of the given length is long enough to be + *	decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 + *	string is properly formed. Equivalent to Plan 9 fullrune().   *   * Results:   *	The return value is 0 if the string is not long enough, non-zero @@ -439,15 +452,15 @@ Tcl_UtfToUniCharDString(string, length, dsPtr)   */  int -Tcl_UtfCharComplete(str, len) -    CONST char *str;		/* String to check if first few bytes -				 * contain a complete UTF-8 character. */ -    int len;			/* Length of above string in bytes. */ +Tcl_UtfCharComplete( +    const char *src,		/* String to check if first few bytes contain +				 * a complete UTF-8 character. */ +    int length)			/* Length of above string in bytes. 
*/  {      int ch; -    ch = *((unsigned char *) str); -    return len >= totalBytes[ch]; +    ch = *((unsigned char *) src); +    return length >= totalBytes[ch];  }  /* @@ -455,48 +468,54 @@ Tcl_UtfCharComplete(str, len)   *   * Tcl_NumUtfChars --   * - *	Returns the number of characters (not bytes) in the UTF-8 string, - *	not including the terminating NULL byte.  This is equivalent to - *	Plan 9 utflen() and utfnlen(). + *	Returns the number of characters (not bytes) in the UTF-8 string, not + *	including the terminating NULL byte. This is equivalent to Plan 9 + *	utflen() and utfnlen().   *   * Results: - *	As above.   + *	As above.   *   * Side effects:   *	None.   *   *---------------------------------------------------------------------------   */ -  -int  -Tcl_NumUtfChars(str, len) -    register CONST char *str;	/* The UTF-8 string to measure. */ -    int len;			/* The length of the string in bytes, or -1 + +int +Tcl_NumUtfChars( +    register const char *src,	/* The UTF-8 string to measure. */ +    int length)			/* The length of the string in bytes, or -1  				 * for strlen(string). */  {      Tcl_UniChar ch;      register Tcl_UniChar *chPtr = &ch; -    register int n; -    int i; +    register int i;      /*       * The separate implementations are faster. +     * +     * Since this is a time-sensitive function, we also do the check for the +     * single-byte char case specially.       
*/ -      +      i = 0; -    if (len < 0) { -	while (1) { -	    str += Tcl_UtfToUniChar(str, chPtr); -	    if (ch == '\0') { -		break; -	    } +    if (length < 0) { +	while (*src != '\0') { +	    src += TclUtfToUniChar(src, chPtr);  	    i++;  	}      } else { -	while (len > 0) { -	    n = Tcl_UtfToUniChar(str, chPtr); -	    len -= n; -	    str += n; +	register int n; + +	while (length > 0) { +	    if (UCHAR(*src) < 0xC0) { +		length--; +		src++; +	    } else { +		n = Tcl_UtfToUniChar(src, chPtr); +		length -= n; +		src += n; +	    }  	    i++;  	}      } @@ -508,37 +527,37 @@ Tcl_NumUtfChars(str, len)   *   * Tcl_UtfFindFirst --   * - *	Returns a pointer to the first occurance of the given Tcl_UniChar - *	in the NULL-terminated UTF-8 string.  The NULL terminator is - *	considered part of the UTF-8 string.  Equivalent to Plan 9 - *	utfrune(). + *	Returns a pointer to the first occurance of the given Tcl_UniChar in + *	the NULL-terminated UTF-8 string. The NULL terminator is considered + *	part of the UTF-8 string. Equivalent to Plan 9 utfrune().   *   * Results: - *	As above.  If the Tcl_UniChar does not exist in the given string, - *	the return value is NULL. + *	As above. If the Tcl_UniChar does not exist in the given string, the + *	return value is NULL.   *   * Side effects:   *	None.   *   *---------------------------------------------------------------------------   */ -char * -Tcl_UtfFindFirst(string, ch) -    CONST char *string;		/* The UTF-8 string to be searched. */ -    int ch;			/* The Tcl_UniChar to search for. */ + +const char * +Tcl_UtfFindFirst( +    const char *src,		/* The UTF-8 string to be searched. */ +    int ch)			/* The Tcl_UniChar to search for. 
*/  {      int len;      Tcl_UniChar find; -     +      while (1) { -	len = Tcl_UtfToUniChar(string, &find); +	len = TclUtfToUniChar(src, &find);  	if (find == ch) { -	    return (char *) string; +	    return src;  	} -	if (*string == '\0') { +	if (*src == '\0') {  	    return NULL;  	} -	string += len; +	src += len;      }  } @@ -547,14 +566,13 @@ Tcl_UtfFindFirst(string, ch)   *   * Tcl_UtfFindLast --   * - *	Returns a pointer to the last occurance of the given Tcl_UniChar - *	in the NULL-terminated UTF-8 string.  The NULL terminator is - *	considered part of the UTF-8 string.  Equivalent to Plan 9 - *	utfrrune(). + *	Returns a pointer to the last occurance of the given Tcl_UniChar in + *	the NULL-terminated UTF-8 string. The NULL terminator is considered + *	part of the UTF-8 string. Equivalent to Plan 9 utfrrune().   *   * Results: - *	As above.  If the Tcl_UniChar does not exist in the given string, - *	the return value is NULL. + *	As above. If the Tcl_UniChar does not exist in the given string, the + *	return value is NULL.   *   * Side effects:   *	None. @@ -562,27 +580,27 @@ Tcl_UtfFindFirst(string, ch)   *---------------------------------------------------------------------------   */ -char * -Tcl_UtfFindLast(string, ch) -    CONST char *string;		/* The UTF-8 string to be searched. */ -    int ch;			/* The Tcl_UniChar to search for. */ +const char * +Tcl_UtfFindLast( +    const char *src,		/* The UTF-8 string to be searched. */ +    int ch)			/* The Tcl_UniChar to search for. 
*/  {      int len;      Tcl_UniChar find; -    CONST char *last; -	 +    const char *last; +      last = NULL;      while (1) { -	len = Tcl_UtfToUniChar(string, &find); +	len = TclUtfToUniChar(src, &find);  	if (find == ch) { -	    last = string; +	    last = src;  	} -	if (*string == '\0') { +	if (*src == '\0') {  	    break;  	} -	string += len; +	src += len;      } -    return (char *) last; +    return last;  }  /* @@ -590,28 +608,27 @@ Tcl_UtfFindLast(string, ch)   *   * Tcl_UtfNext --   * - *	Given a pointer to some current location in a UTF-8 string, - *	move forward one character.  The caller must ensure that they - *	are not asking for the next character after the last character - *	in the string. + *	Given a pointer to some current location in a UTF-8 string, move + *	forward one character. The caller must ensure that they are not asking + *	for the next character after the last character in the string.   *   * Results: - *	The return value is the pointer to the next character in - *	the UTF-8 string. + *	The return value is the pointer to the next character in the UTF-8 + *	string.   *   * Side effects:   *	None.   *   *---------------------------------------------------------------------------   */ -  -char * -Tcl_UtfNext(str)  -    CONST char *str;		    /* The current location in the string. */ + +const char * +Tcl_UtfNext( +    const char *src)		/* The current location in the string. */  {      Tcl_UniChar ch; -    return (char *) str + Tcl_UtfToUniChar(str, &ch); +    return src + TclUtfToUniChar(src, &ch);  }  /* @@ -619,14 +636,15 @@ Tcl_UtfNext(str)   *   * Tcl_UtfPrev --   * - *	Given a pointer to some current location in a UTF-8 string, - *	move backwards one character. + *	Given a pointer to some current location in a UTF-8 string, move + *	backwards one character. This works correctly when the pointer is in + *	the middle of a UTF-8 character.   
*   * Results: - *	The return value is a pointer to the previous character in the - *	UTF-8 string.  If the current location was already at the - *	beginning of the string, the return value will also be a - *	pointer to the beginning of the string. + *	The return value is a pointer to the previous character in the UTF-8 + *	string. If the current location was already at the beginning of the + *	string, the return value will also be a pointer to the beginning of + *	the string.   *   * Side effects:   *	None. @@ -634,47 +652,43 @@ Tcl_UtfNext(str)   *---------------------------------------------------------------------------   */ -char * -Tcl_UtfPrev(str, start) -    CONST char *str;		    /* The current location in the string. */ -    CONST char *start;		    /* Pointer to the beginning of the -				     * string, to avoid going backwards too -				     * far. */ +const char * +Tcl_UtfPrev( +    const char *src,		/* The current location in the string. */ +    const char *start)		/* Pointer to the beginning of the string, to +				 * avoid going backwards too far. */  { -    CONST char *look; +    const char *look;      int i, byte; -     -    str--; -    look = str; + +    src--; +    look = src;      for (i = 0; i < TCL_UTF_MAX; i++) {  	if (look < start) { -	    if (str < start) { -		str = start; +	    if (src < start) { +		src = start;  	    }  	    break;  	}  	byte = *((unsigned char *) look);  	if (byte < 0x80) {  	    break; -	}  +	}  	if (byte >= 0xC0) { -	    if (totalBytes[byte] != i + 1) { -		break; -	    } -	    return (char *) look; +	    return look;  	}  	look--;      } -    return (char *) str; +    return src;  } -	 +  /*   *---------------------------------------------------------------------------   *   * Tcl_UniCharAtIndex --   * - *	Returns the Unicode character represented at the specified - *	character (not byte) position in the UTF-8 string. 
+ *	Returns the Unicode character represented at the specified character + *	(not byte) position in the UTF-8 string.   *   * Results:   *	As above. @@ -684,17 +698,17 @@ Tcl_UtfPrev(str, start)   *   *---------------------------------------------------------------------------   */ -  +  Tcl_UniChar -Tcl_UniCharAtIndex(src, index) -    register CONST char *src;	/* The UTF-8 string to dereference. */ -    register int index;		/* The position of the desired character. */ +Tcl_UniCharAtIndex( +    register const char *src,	/* The UTF-8 string to dereference. */ +    register int index)		/* The position of the desired character. */  { -    Tcl_UniChar ch; +    Tcl_UniChar ch = 0;      while (index >= 0) {  	index--; -	src += Tcl_UtfToUniChar(src, &ch); +	src += TclUtfToUniChar(src, &ch);      }      return ch;  } @@ -704,8 +718,8 @@ Tcl_UniCharAtIndex(src, index)   *   * Tcl_UtfAtIndex --   * - *	Returns a pointer to the specified character (not byte) position - *	in the UTF-8 string. + *	Returns a pointer to the specified character (not byte) position in + *	the UTF-8 string.   *   * Results:   *	As above. @@ -716,18 +730,18 @@ Tcl_UniCharAtIndex(src, index)   *---------------------------------------------------------------------------   */ -char * -Tcl_UtfAtIndex(src, index) -    register CONST char *src;	/* The UTF-8 string. */ -    register int index;		/* The position of the desired character. */ +const char * +Tcl_UtfAtIndex( +    register const char *src,	/* The UTF-8 string. */ +    register int index)		/* The position of the desired character. */  {      Tcl_UniChar ch; -     +      while (index > 0) {  	index--; -	src += Tcl_UtfToUniChar(src, &ch); +	src += TclUtfToUniChar(src, &ch);      } -    return (char *) src; +    return src;  }  /* @@ -739,142 +753,48 @@ Tcl_UtfAtIndex(src, index)   *   * Results:   *	Stores the bytes represented by the backslash sequence in dst and - *	returns the number of bytes written to dst.  
At most TCL_UTF_MAX - *	bytes are written to dst; dst must have been large enough to accept - *	those bytes.  If readPtr isn't NULL then it is filled in with a - *	count of the number of bytes in the backslash sequence.   + *	returns the number of bytes written to dst. At most TCL_UTF_MAX bytes + *	are written to dst; dst must have been large enough to accept those + *	bytes. If readPtr isn't NULL then it is filled in with a count of the + *	number of bytes in the backslash sequence.   *   * Side effects: - *	The maximum number of bytes it takes to represent a Unicode - *	character in UTF-8 is guaranteed to be less than the number of - *	bytes used to express the backslash sequence that represents - *	that Unicode character.  If the target buffer into which the - *	caller is going to store the bytes that represent the Unicode - *	character is at least as large as the source buffer from which - *	the backslashed sequence was extracted, no buffer overruns should - *	occur. + *	The maximum number of bytes it takes to represent a Unicode character + *	in UTF-8 is guaranteed to be less than the number of bytes used to + *	express the backslash sequence that represents that Unicode character. + *	If the target buffer into which the caller is going to store the bytes + *	that represent the Unicode character is at least as large as the + *	source buffer from which the backslashed sequence was extracted, no + *	buffer overruns should occur.   *   *---------------------------------------------------------------------------   */  int -Tcl_UtfBackslash(src, readPtr, dst) -    CONST char *src;		/* Points to the backslash character of -				 * a backslash sequence. */ -    int *readPtr;		/* Fill in with number of characters read -				 * from src, unless NULL. */ -    char *dst;			/* Filled with the bytes represented by the +Tcl_UtfBackslash( +    const char *src,		/* Points to the backslash character of a +				 * backslash sequence. 
*/ +    int *readPtr,		/* Fill in with number of characters read from +				 * src, unless NULL. */ +    char *dst)			/* Filled with the bytes represented by the  				 * backslash sequence. */  { -    register CONST char *p = src+1; -    int result, count, n; -    char buf[TCL_UTF_MAX]; +#define LINE_LENGTH 128 +    int numRead; +    int result; -    if (dst == NULL) { -	dst = buf; -    } - -    count = 2; -    switch (*p) { +    result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); +    if (numRead == LINE_LENGTH) {  	/* -         * Note: in the conversions below, use absolute values (e.g., -         * 0xa) rather than symbolic values (e.g. \n) that get converted -         * by the compiler.  It's possible that compilers on some -         * platforms will do the symbolic conversions differently, which -         * could result in non-portable Tcl scripts. -         */ - -        case 'a': -            result = 0x7; -            break; -        case 'b': -            result = 0x8; -            break; -        case 'f': -            result = 0xc; -            break; -        case 'n': -            result = 0xa; -            break; -        case 'r': -            result = 0xd; -            break; -        case 't': -            result = 0x9; -            break; -        case 'v': -            result = 0xb; -            break; -        case 'x': -            if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ -                char *end; - -                result = (unsigned char) strtoul(p+1, &end, 16); -                count = end - src; -            } else { -                count = 2; -                result = 'x'; -            } -            break; -	case 'u': -	    result = 0; -	    for (count = 0; count < 4; count++) { -		p++; -		if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ -		    break; -		} -		n = *p - '0'; -		if (n > 9) { -		    n = n + '0' + 10 - 'A'; -		} -		if (n > 16) { -		    n = n + 'A' - 'a'; -		} -		result = (result << 4) + n; -	    } -	    if (count 
== 0) { -		result = 'u'; -	    } -	    count += 2; -	    break; -		     -        case '\n': -            do { -                p++; -            } while ((*p == ' ') || (*p == '\t')); -            result = ' '; -            count = p - src; -            break; -        case 0: -            result = '\\'; -            count = 1; -            break; -	default: -	    if (isdigit(UCHAR(*p))) { /* INTL: digit */ -		result = (unsigned char)(*p - '0'); -		p++; -		if (!isdigit(UCHAR(*p))) { /* INTL: digit */ -		    break; -		} -		count = 3; -		result = (unsigned char)((result << 3) + (*p - '0')); -		p++; -		if (!isdigit(UCHAR(*p))) { /* INTL: digit */ -		    break; -		} -		count = 4; -		result = (unsigned char)((result << 3) + (*p - '0')); -		break; -	    } -	    result = *p; -	    count = 2; -	    break; -    } +	 * We ate a whole line. Pay the price of a strlen() +	 */ +	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); +    }      if (readPtr != NULL) { -	*readPtr = count; +	*readPtr = numRead;      } -    return Tcl_UniCharToUtf(result, dst); +    return result;  }  /* @@ -882,12 +802,12 @@ Tcl_UtfBackslash(src, readPtr, dst)   *   * Tcl_UtfToUpper --   * - *	Convert lowercase characters to uppercase characters in a UTF - *	string in place.  The conversion may shrink the UTF string. + *	Convert lowercase characters to uppercase characters in a UTF string + *	in place. The conversion may shrink the UTF string.   *   * Results: - *	Returns the number of bytes in the resulting string - *	excluding the trailing null. + *	Returns the number of bytes in the resulting string excluding the + *	trailing null.   *   * Side effects:   *	Writes a terminating null after the last converted character. @@ -896,8 +816,8 @@ Tcl_UtfBackslash(src, readPtr, dst)   */  int -Tcl_UtfToUpper(str) -    char *str;			/* String to convert in place. */ +Tcl_UtfToUpper( +    char *str)			/* String to convert in place. 
*/  {      Tcl_UniChar ch, upChar;      char *src, *dst; @@ -909,15 +829,15 @@ Tcl_UtfToUpper(str)      src = dst = str;      while (*src) { -        bytes = Tcl_UtfToUniChar(src, &ch); +	bytes = TclUtfToUniChar(src, &ch);  	upChar = Tcl_UniCharToUpper(ch);  	/* -	 * To keep badly formed Utf strings from getting inflated by -	 * the conversion (thereby causing a segfault), only copy the -	 * upper case char to dst if its size is <= the original char. +	 * To keep badly formed Utf strings from getting inflated by the +	 * conversion (thereby causing a segfault), only copy the upper case +	 * char to dst if its size is <= the original char.  	 */ -	 +  	if (bytes < UtfCount(upChar)) {  	    memcpy(dst, src, (size_t) bytes);  	    dst += bytes; @@ -935,12 +855,12 @@ Tcl_UtfToUpper(str)   *   * Tcl_UtfToLower --   * - *	Convert uppercase characters to lowercase characters in a UTF - *	string in place.  The conversion may shrink the UTF string. + *	Convert uppercase characters to lowercase characters in a UTF string + *	in place. The conversion may shrink the UTF string.   *   * Results: - *	Returns the number of bytes in the resulting string - *	excluding the trailing null. + *	Returns the number of bytes in the resulting string excluding the + *	trailing null.   *   * Side effects:   *	Writes a terminating null after the last converted character. @@ -949,28 +869,28 @@ Tcl_UtfToUpper(str)   */  int -Tcl_UtfToLower(str) -    char *str;			/* String to convert in place. */ +Tcl_UtfToLower( +    char *str)			/* String to convert in place. */  {      Tcl_UniChar ch, lowChar;      char *src, *dst;      int bytes; -     +      /*       * Iterate over the string until we hit the terminating null.       
*/      src = dst = str;      while (*src) { -	bytes = Tcl_UtfToUniChar(src, &ch); +	bytes = TclUtfToUniChar(src, &ch);  	lowChar = Tcl_UniCharToLower(ch);  	/* -	 * To keep badly formed Utf strings from getting inflated by -	 * the conversion (thereby causing a segfault), only copy the -	 * lower case char to dst if its size is <= the original char. +	 * To keep badly formed Utf strings from getting inflated by the +	 * conversion (thereby causing a segfault), only copy the lower case +	 * char to dst if its size is <= the original char.  	 */ -	 +  	if (bytes < UtfCount(lowChar)) {  	    memcpy(dst, src, (size_t) bytes);  	    dst += bytes; @@ -988,13 +908,13 @@ Tcl_UtfToLower(str)   *   * Tcl_UtfToTitle --   * - *	Changes the first character of a UTF string to title case or - *	uppercase and the rest of the string to lowercase.  The - *	conversion happens in place and may shrink the UTF string. + *	Changes the first character of a UTF string to title case or uppercase + *	and the rest of the string to lowercase. The conversion happens in + *	place and may shrink the UTF string.   *   * Results: - *	Returns the number of bytes in the resulting string - *	excluding the trailing null. + *	Returns the number of bytes in the resulting string excluding the + *	trailing null.   *   * Side effects:   *	Writes a terminating null after the last converted character. @@ -1003,13 +923,13 @@ Tcl_UtfToLower(str)   */  int -Tcl_UtfToTitle(str) -    char *str;			/* String to convert in place. */ +Tcl_UtfToTitle( +    char *str)			/* String to convert in place. */  {      Tcl_UniChar ch, titleChar, lowChar;      char *src, *dst;      int bytes; -     +      /*       * Capitalize the first character and then lowercase the rest of the       * characters until we get to a null. 
@@ -1018,7 +938,7 @@ Tcl_UtfToTitle(str)      src = dst = str;      if (*src) { -	bytes = Tcl_UtfToUniChar(src, &ch); +	bytes = TclUtfToUniChar(src, &ch);  	titleChar = Tcl_UniCharToTitle(ch);  	if (bytes < UtfCount(titleChar)) { @@ -1030,7 +950,7 @@ Tcl_UtfToTitle(str)  	src += bytes;      }      while (*src) { -	bytes = Tcl_UtfToUniChar(src, &ch); +	bytes = TclUtfToUniChar(src, &ch);  	lowChar = Tcl_UniCharToLower(ch);  	if (bytes < UtfCount(lowChar)) { @@ -1048,6 +968,184 @@ Tcl_UtfToTitle(str)  /*   *----------------------------------------------------------------------   * + * TclpUtfNcmp2 -- + * + *	Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and + *	ct are assumed to be at least numBytes bytes long. + * + * Results: + *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +TclpUtfNcmp2( +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct,		/* UTF string cs is compared to. */ +    unsigned long numBytes)	/* Number of *bytes* to compare. */ +{ +    /* +     * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to +     * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes +     * fine in the strcmp manner. +     */ + +    register int result = 0; + +    for ( ; numBytes != 0; numBytes--, cs++, ct++) { +	if (*cs != *ct) { +	    result = UCHAR(*cs) - UCHAR(*ct); +	    break; +	} +    } +    if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { +	unsigned char c1, c2; + +	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); +	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); +	result = (c1 - c2); +    } +    return result; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfNcmp -- + * + *	Compare at most numChars UTF chars of string cs to string ct. 
Both cs + *	and ct are assumed to be at least numChars UTF chars long. + * + * Results: + *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfNcmp( +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct,		/* UTF string cs is compared to. */ +    unsigned long numChars)	/* Number of UTF chars to compare. */ +{ +    Tcl_UniChar ch1, ch2; + +    /* +     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the +     * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 +     * (the byte 0x01.) +     */ + +    while (numChars-- > 0) { +	/* +	 * n must be interpreted as chars, not bytes. This should be called +	 * only when both strings are of at least n chars long (no need for \0 +	 * check) +	 */ + +	cs += TclUtfToUniChar(cs, &ch1); +	ct += TclUtfToUniChar(ct, &ch2); +	if (ch1 != ch2) { +	    return (ch1 - ch2); +	} +    } +    return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfNcasecmp -- + * + *	Compare at most numChars UTF chars of string cs to string ct case + *	insensitive. Both cs and ct are assumed to be at least numChars UTF + *	chars long. + * + * Results: + *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfNcasecmp( +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct,		/* UTF string cs is compared to. */ +    unsigned long numChars)	/* Number of UTF chars to compare. */ +{ +    Tcl_UniChar ch1, ch2; +    while (numChars-- > 0) { +	/* +	 * n must be interpreted as chars, not bytes. 
+	 * This should be called only when both strings are of +	 * at least n chars long (no need for \0 check) +	 */ +	cs += TclUtfToUniChar(cs, &ch1); +	ct += TclUtfToUniChar(ct, &ch2); +	if (ch1 != ch2) { +	    ch1 = Tcl_UniCharToLower(ch1); +	    ch2 = Tcl_UniCharToLower(ch2); +	    if (ch1 != ch2) { +		return (ch1 - ch2); +	    } +	} +    } +    return 0; +} + +/* + *---------------------------------------------------------------------- + * + * TclUtfCasecmp -- + * + *	Compare UTF chars of string cs to string ct case insensitively. + *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should + *	be handled. + * + * Results: + *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +TclUtfCasecmp( +    const char *cs,		/* UTF string to compare to ct. */ +    const char *ct)		/* UTF string cs is compared to. */ +{ +    while (*cs && *ct) { +	Tcl_UniChar ch1, ch2; + +	cs += TclUtfToUniChar(cs, &ch1); +	ct += TclUtfToUniChar(ct, &ch2); +	if (ch1 != ch2) { +	    ch1 = Tcl_UniCharToLower(ch1); +	    ch2 = Tcl_UniCharToLower(ch2); +	    if (ch1 != ch2) { +		return ch1 - ch2; +	    } +	} +    } +    return UCHAR(*cs) - UCHAR(*ct); +} + + +/* + *---------------------------------------------------------------------- + *   * Tcl_UniCharToUpper --   *   *	Compute the uppercase equivalent of the given Unicode character. @@ -1062,16 +1160,15 @@ Tcl_UtfToTitle(str)   */  Tcl_UniChar -Tcl_UniCharToUpper(ch) -    int ch;			/* Unicode character to convert. */ +Tcl_UniCharToUpper( +    int ch)			/* Unicode character to convert. 
*/  {      int info = GetUniCharInfo(ch);      if (GetCaseType(info) & 0x04) { -	return (Tcl_UniChar) (ch - GetDelta(info)); -    } else { -	return ch; +	ch -= GetDelta(info);      } +    return (Tcl_UniChar) ch;  }  /* @@ -1091,16 +1188,15 @@ Tcl_UniCharToUpper(ch)   */  Tcl_UniChar -Tcl_UniCharToLower(ch) -    int ch;			/* Unicode character to convert. */ +Tcl_UniCharToLower( +    int ch)			/* Unicode character to convert. */  {      int info = GetUniCharInfo(ch);      if (GetCaseType(info) & 0x02) { -	return (Tcl_UniChar) (ch + GetDelta(info)); -    } else { -	return ch; +	ch += GetDelta(info);      } +    return (Tcl_UniChar) ch;  }  /* @@ -1120,8 +1216,8 @@ Tcl_UniCharToLower(ch)   */  Tcl_UniChar -Tcl_UniCharToTitle(ch) -    int ch;			/* Unicode character to convert. */ +Tcl_UniCharToTitle( +    int ch)			/* Unicode character to convert. */  {      int info = GetUniCharInfo(ch);      int mode = GetCaseType(info); @@ -1131,12 +1227,11 @@ Tcl_UniCharToTitle(ch)  	 * Subtract or add one depending on the original case.  	 */ -	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); +	ch += ((mode & 0x4) ? -1 : 1);      } else if (mode == 0x4) { -	return (Tcl_UniChar) (ch - GetDelta(info)); -    } else { -	return ch; +	ch -= GetDelta(info);      } +    return (Tcl_UniChar) ch;  }  /* @@ -1144,7 +1239,7 @@ Tcl_UniCharToTitle(ch)   *   * Tcl_UniCharLen --   * - *	Find the length of a UniChar string.  The str input must be null + *	Find the length of a UniChar string. The str input must be null   *	terminated.   *   * Results: @@ -1157,14 +1252,14 @@ Tcl_UniCharToTitle(ch)   */  int -Tcl_UniCharLen(str) -    Tcl_UniChar *str;		/* Unicode string to find length of. */ +Tcl_UniCharLen( +    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. 
*/  {      int len = 0; -     -    while (*str != '\0') { + +    while (*uniStr != '\0') {  	len++; -	str++; +	uniStr++;      }      return len;  } @@ -1174,11 +1269,11 @@ Tcl_UniCharLen(str)   *   * Tcl_UniCharNcmp --   * - *	Compare at most n unichars of string cs to string ct.  Both cs - *	and ct are assumed to be at least n unichars long. + *	Compare at most numChars unichars of string ucs to string uct. + *	Both ucs and uct are assumed to be at least numChars unichars long.   *   * Results: - *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + *	Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.   *   * Side effects:   *	None. @@ -1187,17 +1282,64 @@ Tcl_UniCharLen(str)   */  int -Tcl_UniCharNcmp(cs, ct, n) -    CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */ -    CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */ -    size_t n;				/* Number of unichars to compare. */ +Tcl_UniCharNcmp( +    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */ +    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */ +    unsigned long numChars)	/* Number of unichars to compare. */  { -    for ( ; n != 0; n--, cs++, ct++) { -	if (*cs != *ct) { -	    return *cs - *ct; +#ifdef WORDS_BIGENDIAN +    /* +     * We are definitely on a big-endian machine; memcmp() is safe +     */ + +    return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); + +#else /* !WORDS_BIGENDIAN */ +    /* +     * We can't simply call memcmp() because that is not lexically correct. +     */ + +    for ( ; numChars != 0; ucs++, uct++, numChars--) { +	if (*ucs != *uct) { +	    return (*ucs - *uct);  	} -	if (*cs == '\0') { -	    break; +    } +    return 0; +#endif /* WORDS_BIGENDIAN */ +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcasecmp -- + * + *	Compare at most numChars unichars of string ucs to string uct case + *	insensitive. 
Both ucs and uct are assumed to be at least numChars + *	unichars long. + * + * Results: + *	Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcasecmp( +    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */ +    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */ +    unsigned long numChars)	/* Number of unichars to compare. */ +{ +    for ( ; numChars != 0; numChars--, ucs++, uct++) { +	if (*ucs != *uct) { +	    Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); +	    Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + +	    if (lcs != lct) { +		return (lcs - lct); +	    }  	}      }      return 0; @@ -1220,12 +1362,10 @@ Tcl_UniCharNcmp(cs, ct, n)   */  int -Tcl_UniCharIsAlnum(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsAlnum( +    int ch)			/* Unicode character to test. */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); - -    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); +    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);  }  /* @@ -1245,11 +1385,33 @@ Tcl_UniCharIsAlnum(ch)   */  int -Tcl_UniCharIsAlpha(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsAlpha( +    int ch)			/* Unicode character to test. */ +{ +    return ((ALPHA_BITS >> GetCategory(ch)) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsControl -- + * + *	Test if a character is a Unicode control character. + * + * Results: + *	Returns non-zero if character is a control. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsControl( +    int ch)			/* Unicode character to test. 
*/  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -    return ((ALPHA_BITS >> category) & 1); +    return ((CONTROL_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1269,11 +1431,33 @@ Tcl_UniCharIsAlpha(ch)   */  int -Tcl_UniCharIsDigit(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsDigit( +    int ch)			/* Unicode character to test. */ +{ +    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsGraph -- + * + *	Test if a character is any Unicode print character except space. + * + * Results: + *	Returns non-zero if character is printable, but not space. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsGraph( +    int ch)			/* Unicode character to test. */  { -    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) -	    == DECIMAL_DIGIT_NUMBER); +    return ((GRAPH_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1293,10 +1477,56 @@ Tcl_UniCharIsDigit(ch)   */  int -Tcl_UniCharIsLower(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsLower( +    int ch)			/* Unicode character to test. */ +{ +    return (GetCategory(ch) == LOWERCASE_LETTER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsPrint -- + * + *	Test if a character is a Unicode print character. + * + * Results: + *	Returns non-zero if character is printable. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsPrint( +    int ch)			/* Unicode character to test. */ +{ +    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsPunct -- + * + *	Test if a character is a Unicode punctuation character. 
+ * + * Results: + *	Returns non-zero if character is punct. + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsPunct( +    int ch)			/* Unicode character to test. */  { -    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); +    return ((PUNCT_BITS >> GetCategory(ch)) & 1);  }  /* @@ -1316,21 +1546,22 @@ Tcl_UniCharIsLower(ch)   */  int -Tcl_UniCharIsSpace(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsSpace( +    int ch)			/* Unicode character to test. */  { -    register int category; -      /*       * If the character is within the first 127 characters, just use the       * standard C function, otherwise consult the Unicode table.       */ -    if (ch < 0x80) { -	return isspace(UCHAR(ch)); /* INTL: ISO space */ +    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { +	return TclIsSpaceProc((char) ch); +    } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x180e +	    || (Tcl_UniChar) ch == 0x200b || (Tcl_UniChar) ch == 0x2060 +	    || (Tcl_UniChar) ch == 0x202f || (Tcl_UniChar) ch == 0xfeff) { +	return 1;      } else { -	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); -	return ((SPACE_BITS >> category) & 1); +	return ((SPACE_BITS >> GetCategory(ch)) & 1);      }  } @@ -1351,10 +1582,10 @@ Tcl_UniCharIsSpace(ch)   */  int -Tcl_UniCharIsUpper(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsUpper( +    int ch)			/* Unicode character to test. */  { -    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); +    return (GetCategory(ch) == UPPERCASE_LETTER);  }  /* @@ -1362,8 +1593,7 @@ Tcl_UniCharIsUpper(ch)   *   * Tcl_UniCharIsWordChar --   * - *	Test if a character is alphanumeric or a connector punctuation - *	mark. + *	Test if a character is alphanumeric or a connector punctuation mark.   *   * Results:   *	Returns 1 if character is a word character. 
@@ -1375,10 +1605,397 @@ Tcl_UniCharIsUpper(ch)   */  int -Tcl_UniCharIsWordChar(ch) -    int ch;			/* Unicode character to test. */ +Tcl_UniCharIsWordChar( +    int ch)			/* Unicode character to test. */ +{ +    return ((WORD_BITS >> GetCategory(ch)) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharCaseMatch -- + * + *	See if a particular Unicode string matches a particular pattern. + *	Allows case insensitivity. This is the Unicode equivalent of the char* + *	Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated. + *	This has no provision for counted UniChar strings, thus should not be + *	used where NULLs are expected in the UniChar string. Use + *	TclUniCharMatch where possible. + * + * Results: + *	The return value is 1 if string matches pattern, and 0 otherwise. The + *	matching operation permits the following special characters in the + *	pattern: *?\[] (see the manual entry for details on what these mean). + * + * Side effects: + *	None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharCaseMatch( +    const Tcl_UniChar *uniStr,	/* Unicode String. */ +    const Tcl_UniChar *uniPattern, +				/* Pattern, which may contain special +				 * characters. */ +    int nocase)			/* 0 for case sensitive, 1 for insensitive */  { -    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); +    Tcl_UniChar ch1, p; + +    while (1) { +	p = *uniPattern; + +	/* +	 * See if we're at the end of both the pattern and the string. If so, +	 * we succeeded. If we're at the end of the pattern but not at the end +	 * of the string, we failed. +	 */ + +	if (p == 0) { +	    return (*uniStr == 0); +	} +	if ((*uniStr == 0) && (p != '*')) { +	    return 0; +	} + +	/* +	 * Check for a "*" as the next pattern character. It matches any +	 * substring. 
We handle this by skipping all the characters up to the +	 * next matching one in the pattern, and then calling ourselves +	 * recursively for each postfix of string, until either we match or we +	 * reach the end of the string. +	 */ -    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); +	if (p == '*') { +	    /* +	     * Skip all successive *'s in the pattern +	     */ + +	    while (*(++uniPattern) == '*') { +		/* empty body */ +	    } +	    p = *uniPattern; +	    if (p == 0) { +		return 1; +	    } +	    if (nocase) { +		p = Tcl_UniCharToLower(p); +	    } +	    while (1) { +		/* +		 * Optimization for matching - cruise through the string +		 * quickly if the next char in the pattern isn't a special +		 * character +		 */ + +		if ((p != '[') && (p != '?') && (p != '\\')) { +		    if (nocase) { +			while (*uniStr && (p != *uniStr) +				&& (p != Tcl_UniCharToLower(*uniStr))) { +			    uniStr++; +			} +		    } else { +			while (*uniStr && (p != *uniStr)) { +			    uniStr++; +			} +		    } +		} +		if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { +		    return 1; +		} +		if (*uniStr == 0) { +		    return 0; +		} +		uniStr++; +	    } +	} + +	/* +	 * Check for a "?" as the next pattern character. It matches any +	 * single character. +	 */ + +	if (p == '?') { +	    uniPattern++; +	    uniStr++; +	    continue; +	} + +	/* +	 * Check for a "[" as the next pattern character. It is followed by a +	 * list of characters that are acceptable, or by a range (two +	 * characters separated by "-"). +	 */ + +	if (p == '[') { +	    Tcl_UniChar startChar, endChar; + +	    uniPattern++; +	    ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); +	    uniStr++; +	    while (1) { +		if ((*uniPattern == ']') || (*uniPattern == 0)) { +		    return 0; +		} +		startChar = (nocase ? 
Tcl_UniCharToLower(*uniPattern) +			: *uniPattern); +		uniPattern++; +		if (*uniPattern == '-') { +		    uniPattern++; +		    if (*uniPattern == 0) { +			return 0; +		    } +		    endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) +			    : *uniPattern); +		    uniPattern++; +		    if (((startChar <= ch1) && (ch1 <= endChar)) +			    || ((endChar <= ch1) && (ch1 <= startChar))) { +			/* +			 * Matches ranges of form [a-z] or [z-a]. +			 */ +			break; +		    } +		} else if (startChar == ch1) { +		    break; +		} +	    } +	    while (*uniPattern != ']') { +		if (*uniPattern == 0) { +		    uniPattern--; +		    break; +		} +		uniPattern++; +	    } +	    uniPattern++; +	    continue; +	} + +	/* +	 * If the next pattern character is '\', just strip off the '\' so we +	 * do exact matching on the character that follows. +	 */ + +	if (p == '\\') { +	    if (*(++uniPattern) == '\0') { +		return 0; +	    } +	} + +	/* +	 * There's no special character. Just make sure that the next bytes of +	 * each string match. +	 */ + +	if (nocase) { +	    if (Tcl_UniCharToLower(*uniStr) != +		    Tcl_UniCharToLower(*uniPattern)) { +		return 0; +	    } +	} else if (*uniStr != *uniPattern) { +	    return 0; +	} +	uniStr++; +	uniPattern++; +    }  } + +/* + *---------------------------------------------------------------------- + * + * TclUniCharMatch -- + * + *	See if a particular Unicode string matches a particular pattern. + *	Allows case insensitivity. This is the Unicode equivalent of the char* + *	Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted + *	Strings, so embedded NULLs are allowed. + * + * Results: + *	The return value is 1 if string matches pattern, and 0 otherwise. The + *	matching operation permits the following special characters in the + *	pattern: *?\[] (see the manual entry for details on what these mean). + * + * Side effects: + *	None. 
+ * + *---------------------------------------------------------------------- + */ + +int +TclUniCharMatch( +    const Tcl_UniChar *string,	/* Unicode String. */ +    int strLen,			/* Length of String */ +    const Tcl_UniChar *pattern,	/* Pattern, which may contain special +				 * characters. */ +    int ptnLen,			/* Length of Pattern */ +    int nocase)			/* 0 for case sensitive, 1 for insensitive */ +{ +    const Tcl_UniChar *stringEnd, *patternEnd; +    Tcl_UniChar p; + +    stringEnd = string + strLen; +    patternEnd = pattern + ptnLen; + +    while (1) { +	/* +	 * See if we're at the end of both the pattern and the string. If so, +	 * we succeeded. If we're at the end of the pattern but not at the end +	 * of the string, we failed. +	 */ + +	if (pattern == patternEnd) { +	    return (string == stringEnd); +	} +	p = *pattern; +	if ((string == stringEnd) && (p != '*')) { +	    return 0; +	} + +	/* +	 * Check for a "*" as the next pattern character. It matches any +	 * substring. We handle this by skipping all the characters up to the +	 * next matching one in the pattern, and then calling ourselves +	 * recursively for each postfix of string, until either we match or we +	 * reach the end of the string. +	 */ + +	if (p == '*') { +	    /* +	     * Skip all successive *'s in the pattern. +	     */ + +	    while (*(++pattern) == '*') { +		/* empty body */ +	    } +	    if (pattern == patternEnd) { +		return 1; +	    } +	    p = *pattern; +	    if (nocase) { +		p = Tcl_UniCharToLower(p); +	    } +	    while (1) { +		/* +		 * Optimization for matching - cruise through the string +		 * quickly if the next char in the pattern isn't a special +		 * character. 
+		 */ + +		if ((p != '[') && (p != '?') && (p != '\\')) { +		    if (nocase) { +			while ((string < stringEnd) && (p != *string) +				&& (p != Tcl_UniCharToLower(*string))) { +			    string++; +			} +		    } else { +			while ((string < stringEnd) && (p != *string)) { +			    string++; +			} +		    } +		} +		if (TclUniCharMatch(string, stringEnd - string, +			pattern, patternEnd - pattern, nocase)) { +		    return 1; +		} +		if (string == stringEnd) { +		    return 0; +		} +		string++; +	    } +	} + +	/* +	 * Check for a "?" as the next pattern character. It matches any +	 * single character. +	 */ + +	if (p == '?') { +	    pattern++; +	    string++; +	    continue; +	} + +	/* +	 * Check for a "[" as the next pattern character. It is followed by a +	 * list of characters that are acceptable, or by a range (two +	 * characters separated by "-"). +	 */ + +	if (p == '[') { +	    Tcl_UniChar ch1, startChar, endChar; + +	    pattern++; +	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); +	    string++; +	    while (1) { +		if ((*pattern == ']') || (pattern == patternEnd)) { +		    return 0; +		} +		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); +		pattern++; +		if (*pattern == '-') { +		    pattern++; +		    if (pattern == patternEnd) { +			return 0; +		    } +		    endChar = (nocase ? Tcl_UniCharToLower(*pattern) +			    : *pattern); +		    pattern++; +		    if (((startChar <= ch1) && (ch1 <= endChar)) +			    || ((endChar <= ch1) && (ch1 <= startChar))) { +			/* +			 * Matches ranges of form [a-z] or [z-a]. +			 */ +			break; +		    } +		} else if (startChar == ch1) { +		    break; +		} +	    } +	    while (*pattern != ']') { +		if (pattern == patternEnd) { +		    pattern--; +		    break; +		} +		pattern++; +	    } +	    pattern++; +	    continue; +	} + +	/* +	 * If the next pattern character is '\', just strip off the '\' so we +	 * do exact matching on the character that follows. 
+	 */ + +	if (p == '\\') { +	    if (++pattern == patternEnd) { +		return 0; +	    } +	} + +	/* +	 * There's no special character. Just make sure that the next bytes of +	 * each string match. +	 */ + +	if (nocase) { +	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { +		return 0; +	    } +	} else if (*string != *pattern) { +	    return 0; +	} +	string++; +	pattern++; +    } +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ | 
