1 files changed, 73 insertions, 148 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
index 069a612..903fac7 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -3,17 +3,17 @@
 '\"
 '\" See the file "license.terms" for information on usage and redistribution
 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
-'\"
+'\" 
 .TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
 .so man.macros
 .BS
 .SH NAME
-Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UtfToChar16, Tcl_UtfToWChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_Char16ToUtfDString, Tcl_UtfToWCharDString, Tcl_UtfToChar16DString, Tcl_WCharToUtfDString, Tcl_WCharLen, Tcl_Char16Len, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UniCharNcasecmp, Tcl_UniCharCaseMatch, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings
+Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UniCharNcasecmp, Tcl_UniCharCaseMatch, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings
 .SH SYNOPSIS
 .nf
 \fB#include <tcl.h>\fR
 .sp
-typedef ... \fBTcl_UniChar\fR;
+typedef ... Tcl_UniChar;
 .sp
 int
 \fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
@@ -21,35 +21,11 @@ int
 int
 \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
 .sp
-int
-\fBTcl_UtfToChar16\fR(\fIsrc, uPtr\fR)
-.sp
-int
-\fBTcl_UtfToWChar\fR(\fIsrc, wPtr\fR)
-.sp
 char *
-\fBTcl_UniCharToUtfDString\fR(\fIuniStr, numUniChars, dsPtr\fR)
-.sp
-char *
-\fBTcl_Char16ToUtfDString\fR(\fIutf16, numUtf16, dsPtr\fR)
-.sp
-char *
-\fBTcl_WCharToUtfDString\fR(\fIwcharStr, numWChars, dsPtr\fR)
+\fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR)
 .sp
 Tcl_UniChar *
-\fBTcl_UtfToUniCharDString\fR(\fIsrc, numBytes, dsPtr\fR)
-.sp
-unsigned short *
-\fBTcl_UtfToChar16DString\fR(\fIsrc, numBytes, dsPtr\fR)
-.sp
-wchar_t *
-\fBTcl_UtfToWCharDString\fR(\fIsrc, numBytes, dsPtr\fR)
-.sp
-int
-\fBTcl_Char16Len\fR(\fIutf16\fR)
-.sp
-int
-\fBTcl_WCharLen\fR(\fIwcharStr\fR)
+\fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR)
 .sp
 int
 \fBTcl_UniCharLen\fR(\fIuniStr\fR)
@@ -70,10 +46,10 @@ int
 \fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR)
 .sp
 int
-\fBTcl_UtfCharComplete\fR(\fIsrc, numBytes\fR)
+\fBTcl_UtfCharComplete\fR(\fIsrc, length\fR)
 .sp
-int
-\fBTcl_NumUtfChars\fR(\fIsrc, numBytes\fR)
+int 
+\fBTcl_NumUtfChars\fR(\fIsrc, length\fR)
 .sp
 const char *
 \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
@@ -87,7 +63,7 @@ const char *
 const char *
 \fBTcl_UtfPrev\fR(\fIsrc, start\fR)
 .sp
-int
+Tcl_UniChar
 \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
 .sp
 const char *
@@ -99,15 +75,11 @@ int
 .AS "const Tcl_UniChar" *uniPattern in/out
 .AP char *buf out
 Buffer in which the UTF-8 representation of the Tcl_UniChar is stored.  At most
-4 bytes are stored in the buffer.
+\fBTCL_UTF_MAX\fR bytes are stored in the buffer.
 .AP int ch in
-The Unicode character to be converted or examined.
+The Tcl_UniChar to be converted or examined.
 .AP Tcl_UniChar *chPtr out
 Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
-.AP unsigned short *uPtr out
-Filled with the utf-16 represented by the head of the UTF-8 string.
-.AP wchar_t *wPtr out
-Filled with the wchar_t represented by the head of the UTF-8 string.
 .AP "const char" *src in
 Pointer to a UTF-8 string.
 .AP "const char" *cs in
@@ -115,32 +87,19 @@ Pointer to a UTF-8 string.
 .AP "const char" *ct in
 Pointer to a UTF-8 string.
 .AP "const Tcl_UniChar" *uniStr in
-A sequence of \fBTcl_UniChar\fR units with null-termination optional
-depending on function.
+A Unicode string; it must be null-terminated unless \fIuniLength\fR is also given.
 .AP "const Tcl_UniChar" *ucs in
-A null-terminated sequence of \fBTcl_UniChar\fR.
+A null-terminated Unicode string.
 .AP "const Tcl_UniChar" *uct in
-A null-terminated sequence of \fBTcl_UniChar\fR.
+A null-terminated Unicode string.
 .AP "const Tcl_UniChar" *uniPattern in
-A null-terminated sequence of \fBTcl_UniChar\fR.
-.AP "const unsigned short" *utf16 in
-A sequence of UTF-16 units with null-termination optional
-depending on function.
-.AP "const wchar_t" *wcharStr in
-A sequence of \fBwchar_t\fR units with null-termination optional
-depending on function.
-.AP int numBytes in
-The length of the UTF-8 input in bytes.  If
-negative, the length includes all bytes until the first null byte.
-.AP int numUtf16 in
-The length of the input in UTF-16 units.
-If negative, the length includes all bytes until the first null.
-.AP int numUniChars in
-The length of the input in Tcl_UniChar units.
-If negative, the length includes all bytes until the first null.
-.AP int numWChars in
-The length of the input in wchar_t units.
-If negative, the length includes all bytes until the first null.
+A null-terminated Unicode string.
+.AP int length in
+The length of the UTF-8 string in bytes (not UTF-8 characters).  If
+negative, all bytes up to the first null byte are used.
+.AP int uniLength in
+The length of the Unicode string in characters.  Must be greater than or
+equal to 0.
 .AP "Tcl_DString" *dsPtr in/out
 A pointer to a previously initialized \fBTcl_DString\fR.
 .AP "unsigned long" numChars in
@@ -150,11 +109,11 @@ Pointer to the beginning of a UTF-8 string.
 .AP int index in
 The index of a character (not byte) in the UTF-8 string.
 .AP int *readPtr out
-If non-NULL, filled with the number of bytes in the backslash sequence,
+If non-NULL, filled with the number of bytes in the backslash sequence, 
 including the backslash character.
 .AP char *dst out
 Buffer in which the bytes represented by the backslash sequence are stored.
-At most 4 bytes are stored in the buffer.
+At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
 .AP int nocase in
 Specifies whether the match should be done case-sensitive (0) or
 case-insensitive (1).
@@ -162,21 +121,18 @@ case-insensitive (1).
 
 .SH DESCRIPTION
 .PP
-These routines convert between UTF-8 strings and Unicode/Utf-16 characters.
-A UTF-8 character is a Unicode character represented as a varying-length
-sequence of up to \fB4\fR bytes.  A multibyte UTF-8 sequence
-consists of a lead byte followed by some number of trail bytes.
+These routines convert between UTF-8 strings and Tcl_UniChars.  A
+Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
+quantity.  A UTF-8 character is a Unicode character represented as
+a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes.  A multibyte UTF-8
+sequence consists of a lead byte followed by some number of trail bytes.
 .PP
-\fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR
-can consume in a single call.
+\fBTCL_UTF_MAX\fR is the maximum number of bytes needed to
+represent one Unicode character in UTF-8.
 .PP
-\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
+\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
 in starting at \fIbuf\fR.  The return value is the number of bytes stored
-in \fIbuf\fR. If ch is a high surrogate (range U+D800 - U+DBFF), then
-the return value will be 1 and a single byte in the range 0xF0 - 0xF4
-will be stored. If you still want to produce UTF-8 output for it (even
-though knowing it's an illegal code-point on its own), just call
-\fBTcl_UniCharToUtf\fR again specifying ch = -1.
+in \fIbuf\fR.
 .PP
 \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
 and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
@@ -184,39 +140,27 @@ number of bytes read from \fIsrc\fR.  The caller must ensure that the
 source buffer is long enough such that this routine does not run off the
 end and dereference non-existent or random memory; if the source buffer
 is known to be null-terminated, this will not happen.  If the input is
-a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the
-cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR
-and returns 1. If the input is otherwise
 not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
-byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x00A0 and
-0x00FF and return 1.
+byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
+0x00ff and return 1.  
 .PP
-\fBTcl_UniCharToUtfDString\fR converts the input in the form of a
-sequence of \fBTcl_UniChar\fR code points to UTF-8, appending the result to the
-previously initialized output \fBTcl_DString\fR. The return value is a pointer
-to the UTF-8 representation of the \fBappended\fR string.
+\fBTcl_UniCharToUtfDString\fR converts the given Unicode string
+to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
+You must specify \fIuniLength\fR, the length of the given Unicode string.
+The return value is a pointer to the UTF-8 representation of the
+Unicode string.  Storage for the return value is appended to the
+end of the \fBTcl_DString\fR.
 .PP
-\fBTcl_UtfToUniCharDString\fR converts the input in the form of
-a UTF-8 encoded string to a \fBTcl_UniChar\fR sequence
-appending the result in the previously initialized \fBTcl_DString\fR.
-The return value is a pointer to the appended result which is also
-terminated with a \fBTcl_UniChar\fR null character.
-.PP
-\fBTcl_WCharToUtfDString\fR and \fBTcl_UtfToWCharDString\fR are similar to
-\fBTcl_UniCharToUtfDString\fR and \fBTcl_UtfToUniCharDString\fR except they
-operate on sequences of \fBwchar_t\fR instead of \fBTcl_UniChar\fR.
-.PP
-\fBTcl_Char16ToUtfDString\fR and \fBTcl_UtfToChar16DString\fR are similar to
-\fBTcl_UniCharToUtfDString\fR and \fBTcl_UtfToUniCharDString\fR except they
-operate on sequences of \fBUTF-16\fR units instead of \fBTcl_UniChar\fR.
-.PP
-\fBTcl_Char16Len\fR corresponds to \fBstrlen\fR for UTF-16
-characters.  It accepts a null-terminated UTF-16 sequence and returns
-the number of UTF-16 units until the null.
-.PP
-\fBTcl_WCharLen\fR corresponds to \fBstrlen\fR for wchar_t
-characters.  It accepts a null-terminated \fBwchar_t\fR sequence and returns
-the number of \fBwchar_t\fR units until the null.
+\fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode,
+storing the result in the previously initialized \fBTcl_DString\fR.
+In the argument \fIlength\fR, you may either specify the length of
+the given UTF-8 string in bytes or
+.QW \-1 ,
+in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to
+calculate the length.  The return value is a pointer to the Unicode
+representation of the UTF-8 string.  Storage for the return value
+is appended to the end of the \fBTcl_DString\fR.  The Unicode string
+is terminated with a Unicode null character.
 .PP
 \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode
 characters.  It accepts a null-terminated Unicode string and returns
@@ -252,11 +196,11 @@ differences in case when comparing upper, lower or title case
 characters.
 .PP
 \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
-of \fInumBytes\fR bytes is long enough to be decoded by
-\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise.  This function
-does not guarantee that the UTF-8 string is properly formed.  This routine
-is used by procedures that are operating on a byte at a time and need to
-know if a full Unicode character has been seen.
+of \fIlength\fR bytes is long enough to be decoded by
+\fBTcl_UtfToUniChar\fR, or 0 otherwise.  This function does not guarantee
+that the UTF-8 string is properly formed.  This routine is used by
+procedures that are operating on a byte at a time and need to know if a
+full Tcl_UniChar has been seen.
 .PP
 \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings.  It
 returns the number of Tcl_UniChars that are represented by the UTF-8 string
@@ -264,64 +208,45 @@ returns the number of Tcl_UniChars that are represented by the UTF-8 string
 length is negative, all bytes up to the first null byte are used.
 .PP
 \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings.  It
-returns a pointer to the first occurrence of the Unicode character \fIch\fR
+returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
 in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
-considered part of the UTF-8 string.
+considered part of the UTF-8 string.  
 .PP
 \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings.  It
-returns a pointer to the last occurrence of the Unicode character \fIch\fR
+returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
 in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
-considered part of the UTF-8 string.
+considered part of the UTF-8 string.  
 .PP
 Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
 \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
 string.  The caller must not ask for the next character after the last
 character in the string if the string is not terminated by a null
-character. \fBTcl_UtfCharComplete\fR can be used in that case to
-make sure enough bytes are available before calling \fBTcl_UtfNext\fR.
-.PP
-\fBTcl_UtfPrev\fR is used to step backward through but not beyond the
-UTF-8 string that begins at \fIstart\fR.  If the UTF-8 string is made
-up entirely of complete and well-formed characters, and \fIsrc\fR points
-to the lead byte of one of those characters (or to the location one byte
-past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will
-return pointers to the lead bytes of each character in the string, one
-character at a time, terminating when it returns \fIstart\fR.
+character.
 .PP
-When the conditions of completeness and well-formedness may not be satisfied,
-a more precise description of the function of \fBTcl_UtfPrev\fR is necessary.
-It always returns a pointer greater than or equal to \fIstart\fR; that is,
-always a pointer to a location in the string. It always returns a pointer to
-a byte that begins a character when scanning for characters beginning
-from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it
-always returns a pointer less than \fIsrc\fR and greater than or
-equal to (\fIsrc\fR - 4).  The character that begins
-at the returned pointer is the first one that either includes the
-byte \fIsrc[-1]\fR, or might include it if the right trail bytes are
-present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the
-byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte
-\fIsrc[-5]\fR.
+Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a
+null byte immediately following such a string), \fBTcl_UtfPrev\fR
+returns a pointer to the closest preceding byte that starts a UTF-8
+character.
+This function will not back up to a position before \fIstart\fR,
+the start of the UTF-8 string.  If \fIsrc\fR is already at \fIstart\fR, the
+return value is \fIstart\fR.
 .PP
 \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
-Pascal Ord() function.  It returns the Unicode character represented at the
+Pascal Ord() function.  It returns the Tcl_UniChar represented at the
 specified character (not byte) \fIindex\fR in the UTF-8 string
 \fIsrc\fR.  The source string must contain at least \fIindex\fR
-characters.  If a negative \fIindex\fR is given or \fIindex\fR points
-to the second half of a surrogate pair, it returns -1.
+characters.  Behavior is undefined if a negative \fIindex\fR is given.
 .PP
 \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
 byte) \fIindex\fR in the UTF-8 string \fIsrc\fR.  The source string must
-contain at least \fIindex\fR characters.  This is equivalent to calling
-\fBTcl_UtfToUniChar\fR \fIindex\fR times, except if that would return
-a pointer to the second byte of a valid 4-byte UTF-8 sequence, in which
-case, \fBTcl_UtfToUniChar\fR will be called once more to find the end
-of the sequence. If a negative \fIindex\fR is given, the returned pointer
-points to the first character in the source string.
+contain at least \fIindex\fR characters.  This is equivalent to calling
+\fBTcl_UtfNext\fR \fIindex\fR times.  If a negative \fIindex\fR is given,
+the returned pointer points to the first character in the source string.
 .PP
 \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
 commands.  It parses a backslash sequence and stores the properly formed
 UTF-8 character represented by the backslash sequence in the output
-buffer \fIdst\fR.  At most 4 bytes are stored in the buffer.
+buffer \fIdst\fR.  At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
 \fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number
 of bytes in the backslash sequence, including the backslash character.
 The return value is the number of bytes stored in the output buffer.