1 files changed, 109 insertions, 61 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
index 12756bc..3b2ef91 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -4,18 +4,16 @@
 '\" See the file "license.terms" for information on usage and redistribution
 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 '\" 
-'\" RCS: @(#) $Id: Utf.3,v 1.3 1999/04/30 22:45:01 stanton Exp $
-'\" 
-.so man.macros
 .TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
+.so man.macros
 .BS
 .SH NAME
-Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings.
+Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings
 .SH SYNOPSIS
 .nf
 \fB#include <tcl.h>\fR
 .sp
-typedef ... Tcl_UniChar;
+typedef ... \fBTcl_UniChar\fR;
 .sp
 int
 \fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
@@ -24,67 +22,89 @@ int
 \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
 .sp
 char *
-\fBTcl_UniCharToUtfDString\fR(\fIuniStr, numChars, dstPtr\fR)
+\fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR)
 .sp
 Tcl_UniChar *
-\fBTcl_UtfToUniCharDString\fR(\fIsrc, len, dstPtr\fR)
+\fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR)
 .sp
 int
 \fBTcl_UniCharLen\fR(\fIuniStr\fR)
 .sp
 int
-\fBTcl_UniCharNcmp\fR(\fIuniStr, uniStr, num\fR)
+\fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR)
+.sp
+int
+\fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR)
+.sp
+int
+\fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR)
+.sp
+int
+\fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR)
 .sp
 int
-\fBTcl_UtfCharComplete\fR(\fIsrc, len\fR)
+\fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR)
+.sp
+int
+\fBTcl_UtfCharComplete\fR(\fIsrc, length\fR)
 .sp
 int 
-\fBTcl_NumUtfChars\fR(\fIsrc, len\fR)
+\fBTcl_NumUtfChars\fR(\fIsrc, length\fR)
 .sp
-char *
+const char *
 \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
 .sp
-char *
+const char *
 \fBTcl_UtfFindLast\fR(\fIsrc, ch\fR)
 .sp
-char *
+const char *
 \fBTcl_UtfNext\fR(\fIsrc\fR)
 .sp
-char *
+const char *
 \fBTcl_UtfPrev\fR(\fIsrc, start\fR)
 .sp
 Tcl_UniChar
 \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
 .sp
-char *
+const char *
 \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR)
 .sp
 int
 \fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR)
 .SH ARGUMENTS
-.AS "CONST Tcl_UniChar" numChars in/out
+.AS "const Tcl_UniChar" *uniPattern in/out
 .AP char *buf out
 Buffer in which the UTF-8 representation of the Tcl_UniChar is stored.  At most
-TCL_UTF_MAX bytes are stored in the buffer.
+\fBTCL_UTF_MAX\fR bytes are stored in the buffer.
 .AP int ch in
 The Tcl_UniChar to be converted or examined.
 .AP Tcl_UniChar *chPtr out
 Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
-.AP "CONST char" *src in
+.AP "const char" *src in
+Pointer to a UTF-8 string.
+.AP "const char" *cs in
+Pointer to a UTF-8 string.
+.AP "const char" *ct in
 Pointer to a UTF-8 string.
-.AP "CONST Tcl_UniChar" *uniStr in
-A NULL-terminated Unicode string.
-.AP int len in
+.AP "const Tcl_UniChar" *uniStr in
+A null-terminated Unicode string.
+.AP "const Tcl_UniChar" *ucs in
+A null-terminated Unicode string.
+.AP "const Tcl_UniChar" *uct in
+A null-terminated Unicode string.
+.AP "const Tcl_UniChar" *uniPattern in
+A null-terminated Unicode string.
+.AP int length in
 The length of the UTF-8 string in bytes (not UTF-8 characters).  If
 negative, all bytes up to the first null byte are used.
-.AP int numChars in
+.AP int uniLength in
 The length of the Unicode string in characters.  Must be greater than or
 equal to 0.
-.AP "Tcl_DString" *dstPtr in/out
-A pointer to a previously-initialized \fBTcl_DString\fR.
-.AP size_t n in
-The number of Unicode characters to compare in \fBTcl_UniCharNcmp\fR.
-.AP "CONST char" *start in
+.AP "Tcl_DString" *dsPtr in/out
+A pointer to a previously initialized \fBTcl_DString\fR.
+.AP "unsigned long" numChars in
+The number of characters to compare.
+.AP "const char" *start in
 Pointer to the beginning of a UTF-8 string.
 .AP int index in
 The index of a character (not byte) in the UTF-8 string.
@@ -93,7 +113,10 @@ If non-NULL, filled with the number of bytes in the backslash sequence,
 including the backslash character.
 .AP char *dst out
 Buffer in which the bytes represented by the backslash sequence are stored.
-At most TCL_UTF_MAX bytes are stored in the buffer.
+At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
+.AP int nocase in
+Specifies whether the match is case-sensitive (0) or
+case-insensitive (1).
 .BE
 
 .SH DESCRIPTION
@@ -101,7 +124,7 @@ At most TCL_UTF_MAX bytes are stored in the buffer.
 These routines convert between UTF-8 strings and Tcl_UniChars.  A
 Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
 quantity.  A UTF-8 character is a Unicode character represented as
-a varying-length sequence of up to TCL_UTF_MAX bytes.  A multibyte UTF-8
+a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes.  A multibyte UTF-8
 sequence consists of a lead byte followed by some number of trail bytes.
 .PP
 \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
@@ -113,46 +136,67 @@ in \fIbuf\fR.
 .PP
 \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
 and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
-number of bytes read from \fIsrc\fR..  The caller must ensure that the
+number of bytes read from \fIsrc\fR.  The caller must ensure that the
 source buffer is long enough such that this routine does not run off the
 end and dereference non-existent or random memory; if the source buffer
-is known to be null terminated, this will not happen.  If the input is
+is known to be null-terminated, this will not happen.  If the input is
 not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
 byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
 0x00ff and return 1.  
 .PP
 \fBTcl_UniCharToUtfDString\fR converts the given Unicode string
-to UTF-8, storing the result in a previously-initialized \fBTcl_DString\fR.
-You must specify the length of the given Unicode string.
+to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
+You must specify \fIuniLength\fR, the length of the given Unicode string.
 The return value is a pointer to the UTF-8 representation of the
 Unicode string.  Storage for the return value is appended to the
 end of the \fBTcl_DString\fR.
 .PP
-\fBTcl_UtfToUniCharDString\fR coverts the given UTF-8 string to Unicode,
-storing the result in the previously-initialized \fBTcl_Dstring\fR.
-you may either specify the length of the given UTF-8 string or "-1",
+\fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode,
+storing the result in the previously initialized \fBTcl_DString\fR.
+In the argument \fIlength\fR, you may either specify the length of
+the given UTF-8 string in bytes or
+.QW \-1 ,
 in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to
 calculate the length.  The return value is a pointer to the Unicode
 representation of the UTF-8 string.  Storage for the return value
 is appended to the end of the \fBTcl_DString\fR.  The Unicode string
-is terminated with a Unicode NULL character.
+is terminated with a Unicode null character.
 .PP
 \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode
-characters.  It accepts a NULL-terminated Unicode string and returns
+characters.  It accepts a null-terminated Unicode string and returns
 the number of Unicode characters (not bytes) in that string.
 .PP
-\fBTcl_UniCharNcmp\fR corresponds to \fBstrncmp\fR for Unicode
-characters.  It accepts two NULL-terminated Unicode strings
-and the number of characters to compare.  (Both strings are
-assumed to be at least \fIlen\fR characters long.)
-\fBTcl_UniCharNcmp\fR compares the two strings character-by-character
-according to the Unicode character ordering.  It returns an integer
-greater than, equal to,
-or less than 0 if the first string is greater than, equal to, or
-less than the second string respectively.
+\fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to
+\fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters.
+They accept two null-terminated Unicode strings and the number of characters
+to compare.  Both strings are assumed to be at least \fInumChars\fR characters
+long.  \fBTcl_UniCharNcmp\fR compares the two strings character-by-character
+according to the Unicode character ordering.  It returns an integer greater
+than, equal to, or less than 0 if the first string is greater than, equal
+to, or less than the second string respectively.  \fBTcl_UniCharNcasecmp\fR
+is the Unicode case-insensitive version.
+.PP
+\fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to
+\fBTcl_StringCaseMatch\fR.  It accepts a null-terminated Unicode string,
+a Unicode pattern, and a boolean value specifying whether the match should
+be case-insensitive, and returns whether the string matches the pattern.
+.PP
+\fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings.  It
+accepts two null-terminated UTF-8 strings and the number of characters
+to compare.  (Both strings are assumed to be at least \fInumChars\fR
+characters long.)  \fBTcl_UtfNcmp\fR compares the two strings
+character-by-character according to the Unicode character ordering.
+It returns an integer greater than, equal to, or less than 0 if the
+first string is greater than, equal to, or less than the second string
+respectively.
+.PP
+\fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8
+strings.  It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore
+differences in case when comparing upper, lower or title case
+characters.
 .PP
 \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
-of length \fIlen\fR bytes is long enough to be decoded by
+of \fIlength\fR bytes is long enough to be decoded by
 \fBTcl_UtfToUniChar\fR, or 0 otherwise.  This function does not guarantee
 that the UTF-8 string is properly formed.  This routine is used by
 procedures that are operating on a byte at a time and need to know if a
@@ -160,27 +204,30 @@ full Tcl_UniChar has been seen.
 .PP
 \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings.  It
 returns the number of Tcl_UniChars that are represented by the UTF-8 string
-\fIsrc\fR.  The length of the source string is \fIlen\fR bytes.  If the
-length is negative, all bytes up to the first NULL byte are used.
+\fIsrc\fR.  The length of the source string is \fIlength\fR bytes.  If the
+length is negative, all bytes up to the first null byte are used.
 .PP
 \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings.  It
-returns a pointer to the first occurance of the Tcl_UniChar \fIch\fR
-in the NULL-terminated UTF-8 string \fIsrc\fR.  The NULL terminator is
+returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
+in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
 considered part of the UTF-8 string.  
 .PP
 \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings.  It
-returns a pointer to the last occurance of the Tcl_UniChar \fIch\fR
-in the NULL terminated UTF-8 string \fIsrc\fR.  The NULL terminator is
+returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
+in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
 considered part of the UTF-8 string.  
 .PP
 Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
 \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
 string.  The caller must not ask for the next character after the last
-character in the string.
+character in the string if the string is not terminated by a null
+character.
 .PP
-Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
-\fBTcl_UtfPrev\fR returns a pointer to the previous UTF-8 character in the
-string.  This function will not back up to a position before \fIstart\fR,
+Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a
+null byte immediately following such a string), \fBTcl_UtfPrev\fR
+returns a pointer to the closest preceding byte that starts a UTF-8
+character.
+This function will not back up to a position before \fIstart\fR,
 the start of the UTF-8 string.  If \fIsrc\fR was already at \fIstart\fR, the
 return value will be \fIstart\fR.
 .PP
@@ -188,17 +235,18 @@ return value will be \fIstart\fR.
 Pascal Ord() function.  It returns the Tcl_UniChar represented at the
 specified character (not byte) \fIindex\fR in the UTF-8 string
 \fIsrc\fR.  The source string must contain at least \fIindex\fR
-characters.
+characters.  Behavior is undefined if a negative \fIindex\fR is given.
 .PP
 \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
 byte) \fIindex\fR in the UTF-8 string \fIsrc\fR.  The source string must
 contain at least \fIindex\fR characters.  This is equivalent to calling 
-\fBTcl_UtfNext\fR \fIindex\fR times.
+\fBTcl_UtfNext\fR \fIindex\fR times.  If a negative \fIindex\fR is given,
+the returned pointer points to the first character in the source string.
 .PP
 \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
 commands.  It parses a backslash sequence and stores the properly formed
 UTF-8 character represented by the backslash sequence in the output
-buffer \fIdst\fR.  At most TCL_UTF_MAX bytes are stored in the buffer.
+buffer \fIdst\fR.  At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
 \fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number
 of bytes in the backslash sequence, including the backslash character.
 The return value is the number of bytes stored in the output buffer.