diff options
Diffstat (limited to 'doc/Utf.3')
| -rw-r--r-- | doc/Utf.3 | 221 |
1 files changed, 73 insertions, 148 deletions
@@ -3,17 +3,17 @@ '\" '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. -'\" +'\" .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" .so man.macros .BS .SH NAME -Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UtfToChar16, Tcl_UtfToWChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_Char16ToUtfDString, Tcl_UtfToWCharDString, Tcl_UtfToChar16DString, Tcl_WCharToUtfDString, Tcl_WCharLen, Tcl_Char16Len, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UniCharNcasecmp, Tcl_UniCharCaseMatch, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings +Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UniCharNcasecmp, Tcl_UniCharCaseMatch, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings .SH SYNOPSIS .nf \fB#include <tcl.h>\fR .sp -typedef ... \fBTcl_UniChar\fR; +typedef ... 
Tcl_UniChar; .sp int \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) @@ -21,35 +21,11 @@ int int \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR) .sp -int -\fBTcl_UtfToChar16\fR(\fIsrc, uPtr\fR) -.sp -int -\fBTcl_UtfToWChar\fR(\fIsrc, wPtr\fR) -.sp char * -\fBTcl_UniCharToUtfDString\fR(\fIuniStr, numUniChars, dsPtr\fR) -.sp -char * -\fBTcl_Char16ToUtfDString\fR(\fIutf16, numUtf16, dsPtr\fR) -.sp -char * -\fBTcl_WCharToUtfDString\fR(\fIwcharStr, numWChars, dsPtr\fR) +\fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR) .sp Tcl_UniChar * -\fBTcl_UtfToUniCharDString\fR(\fIsrc, numBytes, dsPtr\fR) -.sp -unsigned short * -\fBTcl_UtfToChar16DString\fR(\fIsrc, numBytes, dsPtr\fR) -.sp -wchar_t * -\fBTcl_UtfToWCharDString\fR(\fIsrc, numBytes, dsPtr\fR) -.sp -int -\fBTcl_Char16Len\fR(\fIutf16\fR) -.sp -int -\fBTcl_WCharLen\fR(\fIwcharStr\fR) +\fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR) .sp int \fBTcl_UniCharLen\fR(\fIuniStr\fR) @@ -70,10 +46,10 @@ int \fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR) .sp int -\fBTcl_UtfCharComplete\fR(\fIsrc, numBytes\fR) +\fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) .sp -int -\fBTcl_NumUtfChars\fR(\fIsrc, numBytes\fR) +int +\fBTcl_NumUtfChars\fR(\fIsrc, length\fR) .sp const char * \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR) @@ -87,7 +63,7 @@ const char * const char * \fBTcl_UtfPrev\fR(\fIsrc, start\fR) .sp -int +Tcl_UniChar \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) .sp const char * @@ -99,15 +75,11 @@ int .AS "const Tcl_UniChar" *uniPattern in/out .AP char *buf out Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most -4 bytes are stored in the buffer. +\fBTCL_UTF_MAX\fR bytes are stored in the buffer. .AP int ch in -The Unicode character to be converted or examined. +The Tcl_UniChar to be converted or examined. .AP Tcl_UniChar *chPtr out Filled with the Tcl_UniChar represented by the head of the UTF-8 string. -.AP unsigned short *uPtr out -Filled with the utf-16 represented by the head of the UTF-8 string. 
-.AP wchar_t *wPtr out -Filled with the wchar_t represented by the head of the UTF-8 string. .AP "const char" *src in Pointer to a UTF-8 string. .AP "const char" *cs in @@ -115,32 +87,19 @@ Pointer to a UTF-8 string. .AP "const char" *ct in Pointer to a UTF-8 string. .AP "const Tcl_UniChar" *uniStr in -A sequence of \fBTcl_UniChar\fR units with null-termination optional -depending on function. +A null-terminated Unicode string. .AP "const Tcl_UniChar" *ucs in -A null-terminated sequence of \fBTcl_UniChar\fR. +A null-terminated Unicode string. .AP "const Tcl_UniChar" *uct in -A null-terminated sequence of \fBTcl_UniChar\fR. +A null-terminated Unicode string. .AP "const Tcl_UniChar" *uniPattern in -A null-terminated sequence of \fBTcl_UniChar\fR. -.AP "const unsigned short" *utf16 in -A sequence of UTF-16 units with null-termination optional -depending on function. -.AP "const wchar_t" *wcharStr in -A sequence of \fBwchar_t\fR units with null-termination optional -depending on function. -.AP int numBytes in -The length of the UTF-8 input in bytes. If -negative, the length includes all bytes until the first null byte. -.AP int numUtf16 in -The length of the input in UTF-16 units. -If negative, the length includes all bytes until the first null. -.AP int numUniChars in -The length of the input in Tcl_UniChar units. -If negative, the length includes all bytes until the first null. -.AP int numWChars in -The length of the input in wchar_t units. -If negative, the length includes all bytes until the first null. +A null-terminated Unicode string. +.AP int length in +The length of the UTF-8 string in bytes (not UTF-8 characters). If +negative, all bytes up to the first null byte are used. +.AP int uniLength in +The length of the Unicode string in characters. Must be greater than or +equal to 0. .AP "Tcl_DString" *dsPtr in/out A pointer to a previously initialized \fBTcl_DString\fR. 
.AP "unsigned long" numChars in @@ -150,11 +109,11 @@ Pointer to the beginning of a UTF-8 string. .AP int index in The index of a character (not byte) in the UTF-8 string. .AP int *readPtr out -If non-NULL, filled with the number of bytes in the backslash sequence, +If non-NULL, filled with the number of bytes in the backslash sequence, including the backslash character. .AP char *dst out Buffer in which the bytes represented by the backslash sequence are stored. -At most 4 bytes are stored in the buffer. +At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. .AP int nocase in Specifies whether the match should be done case-sensitive (0) or case-insensitive (1). @@ -162,21 +121,18 @@ case-insensitive (1). .SH DESCRIPTION .PP -These routines convert between UTF-8 strings and Unicode/Utf-16 characters. -A UTF-8 character is a Unicode character represented as a varying-length -sequence of up to \fB4\fR bytes. A multibyte UTF-8 sequence -consists of a lead byte followed by some number of trail bytes. +These routines convert between UTF-8 strings and Tcl_UniChars. A +Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size +quantity. A UTF-8 character is a Unicode character represented as +a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8 +sequence consists of a lead byte followed by some number of trail bytes. .PP -\fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR -can consume in a single call. +\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to +represent one Unicode character in the UTF-8 representation. .PP -\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string +\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string starting at \fIbuf\fR. The return value is the number of bytes stored -in \fIbuf\fR. 
If ch is a high surrogate (range U+D800 - U+DBFF), then -the return value will be 1 and a single byte in the range 0xF0 - 0xF4 -will be stored. If you still want to produce UTF-8 output for it (even -though knowing it's an illegal code-point on its own), just call -\fBTcl_UniCharToUtf\fR again specifying ch = -1. +in \fIbuf\fR. .PP \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the @@ -184,39 +140,27 @@ number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is -a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the -cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR -and returns 1. If the input is otherwise not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first -byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x00A0 and -0x00FF and return 1. +byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and +0x00ff and return 1. .PP -\fBTcl_UniCharToUtfDString\fR converts the input in the form of a -sequence of \fBTcl_UniChar\fR code points to UTF-8, appending the result to the -previously initialized output \fBTcl_DString\fR. The return value is a pointer -to the UTF-8 representation of the \fBappended\fR string. +\fBTcl_UniCharToUtfDString\fR converts the given Unicode string +to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. +You must specify \fIuniLength\fR, the length of the given Unicode string. +The return value is a pointer to the UTF-8 representation of the +Unicode string. Storage for the return value is appended to the +end of the \fBTcl_DString\fR. 
.PP -\fBTcl_UtfToUniCharDString\fR converts the input in the form of -a UTF-8 encoded string to a \fBTcl_UniChar\fR sequence -appending the result in the previously initialized \fBTcl_DString\fR. -The return value is a pointer to the appended result which is also -terminated with a \fBTcl_UniChar\fR null character. -.PP -\fBTcl_WCharToUtfDString\fR and \fBTcl_UtfToWCharDString\fR are similar to -\fBTcl_UniCharToUtfDString\fR and \fBTcl_UtfToUniCharDString\fR except they -operate on sequences of \fBwchar_t\fR instead of \fBTcl_UniChar\fR. -.PP -\fBTcl_Char16ToUtfDString\fR and \fBTcl_UtfToChar16DString\fR are similar to -\fBTcl_UniCharToUtfDString\fR and \fBTcl_UtfToUniCharDString\fR except they -operate on sequences of \fBUTF-16\fR units instead of \fBTcl_UniChar\fR. -.PP -\fBTcl_Char16Len\fR corresponds to \fBstrlen\fR for UTF-16 -characters. It accepts a null-terminated UTF-16 sequence and returns -the number of UTF-16 units until the null. -.PP -\fBTcl_WCharLen\fR corresponds to \fBstrlen\fR for wchar_t -characters. It accepts a null-terminated \fBwchar_t\fR sequence and returns -the number of \fBwchar_t\fR units until the null. +\fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode, +storing the result in the previously initialized \fBTcl_DString\fR. +In the argument \fIlength\fR, you may either specify the length of +the given UTF-8 string in bytes or +.QW \-1 , +in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to +calculate the length. The return value is a pointer to the Unicode +representation of the UTF-8 string. Storage for the return value +is appended to the end of the \fBTcl_DString\fR. The Unicode string +is terminated with a Unicode null character. .PP \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode characters. It accepts a null-terminated Unicode string and returns @@ -252,11 +196,11 @@ differences in case when comparing upper, lower or title case characters. 
.PP \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR -of \fInumBytes\fR bytes is long enough to be decoded by -\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise. This function -does not guarantee that the UTF-8 string is properly formed. This routine -is used by procedures that are operating on a byte at a time and need to -know if a full Unicode character has been seen. +of \fIlength\fR bytes is long enough to be decoded by +\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee +that the UTF-8 string is properly formed. This routine is used by +procedures that are operating on a byte at a time and need to know if a +full Tcl_UniChar has been seen. .PP \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It returns the number of Tcl_UniChars that are represented by the UTF-8 string @@ -264,64 +208,45 @@ returns the number of Tcl_UniChars that are represented by the UTF-8 string length is negative, all bytes up to the first null byte are used. .PP \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It -returns a pointer to the first occurrence of the Unicode character \fIch\fR +returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It -returns a pointer to the last occurrence of the Unicode character \fIch\fR +returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP Given \fIsrc\fR, a pointer to some location in a UTF-8 string, \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the string. 
The caller must not ask for the next character after the last character in the string if the string is not terminated by a null -character. \fBTcl_UtfCharComplete\fR can be used in that case to -make sure enough bytes are available before calling \fBTcl_UtfNext\fR. -.PP -\fBTcl_UtfPrev\fR is used to step backward through but not beyond the -UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made -up entirely of complete and well-formed characters, and \fIsrc\fR points -to the lead byte of one of those characters (or to the location one byte -past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will -return pointers to the lead bytes of each character in the string, one -character at a time, terminating when it returns \fIstart\fR. +character. .PP -When the conditions of completeness and well-formedness may not be satisfied, -a more precise description of the function of \fBTcl_UtfPrev\fR is necessary. -It always returns a pointer greater than or equal to \fIstart\fR; that is, -always a pointer to a location in the string. It always returns a pointer to -a byte that begins a character when scanning for characters beginning -from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it -always returns a pointer less than \fIsrc\fR and greater than or -equal to (\fIsrc\fR - 4). The character that begins -at the returned pointer is the first one that either includes the -byte \fIsrc[-1]\fR, or might include it if the right trail bytes are -present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the -byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte -\fIsrc[-5]\fR. +Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a +null byte immediately following such a string), \fBTcl_UtfPrev\fR +returns a pointer to the closest preceding byte that starts a UTF-8 +character. +This function will not back up to a position before \fIstart\fR, +the start of the UTF-8 string. 
If \fIsrc\fR was already at \fIstart\fR, the +return value will be \fIstart\fR. .PP \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the -Pascal Ord() function. It returns the Unicode character represented at the +Pascal Ord() function. It returns the Tcl_UniChar represented at the specified character (not byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must contain at least \fIindex\fR -characters. If a negative \fIindex\fR is given or \fIindex\fR points -to the second half of a surrogate pair, it returns -1. +characters. Behavior is undefined if a negative \fIindex\fR is given. .PP \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must -contain at least \fIindex\fR characters. This is equivalent to calling -\fBTcl_UtfToUniChar\fR \fIindex\fR times, except if that would return -a pointer to the second byte of a valid 4-byte UTF-8 sequence, in which -case, \fBTcl_UtfToUniChar\fR will be called once more to find the end -of the sequence. If a negative \fIindex\fR is given, the returned pointer -points to the first character in the source string. +contain at least \fIindex\fR characters. This is equivalent to calling +\fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, +the return pointer points to the first character in the source string. .PP \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl commands. It parses a backslash sequence and stores the properly formed UTF-8 character represented by the backslash sequence in the output -buffer \fIdst\fR. At most 4 bytes are stored in the buffer. +buffer \fIdst\fR. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. \fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number of bytes in the backslash sequence, including the backslash character. The return value is the number of bytes stored in the output buffer. |
