summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorkenj <kenj>1999-04-28 02:36:58 (GMT)
committerkenj <kenj>1999-04-28 02:36:58 (GMT)
commit008308d1ac157b5ff57e1db057c04968f9f45d00 (patch)
tree40234411e467f99a8f46fe011d6bdda781619b00
parenta0d445052c6ea0a965e3af400f7d7bc206926db8 (diff)
downloadtcl-008308d1ac157b5ff57e1db057c04968f9f45d00.zip
tcl-008308d1ac157b5ff57e1db057c04968f9f45d00.tar.gz
tcl-008308d1ac157b5ff57e1db057c04968f9f45d00.tar.bz2
Documented Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UniCharToUtfDString,
and Tcl_UtfToUniCharDString. [Bug 1844]
-rw-r--r--doc/Utf.3217
1 files changed, 217 insertions, 0 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
new file mode 100644
index 0000000..0e94f9c
--- /dev/null
+++ b/doc/Utf.3
@@ -0,0 +1,217 @@
+'\"
+'\" Copyright (c) 1997 Sun Microsystems, Inc.
+'\"
+'\" See the file "license.terms" for information on usage and redistribution
+'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+'\"
+'\" RCS: @(#) $Id: Utf.3,v 1.2.4.1 1999/04/28 02:36:58 kenj Exp $
+'\"
+.so man.macros
+.TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
+.BS
+.SH NAME
+Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar,
+Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString,
+Tcl_UniCharLen, Tcl_UniCharNcmp,
+Tcl_UtfCharComplete,
+Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev,
+Tcl_UniCharAtIndex, Tcl_UtfAtIndex,
+Tcl_UtfBackslash \- routines for manipulating UTF-8 strings.
+.SH SYNOPSIS
+.nf
+\fB#include <tcl.h>\fR
+.sp
+typedef ... Tcl_UniChar;
+.sp
+int
+\fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
+.sp
+int
+\fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
+.sp
+char *
+\fBTcl_UniCharToUtfDString\fR(\fIuniStr, numChars, dstPtr\fR)
+.sp
+Tcl_UniChar *
+\fBTcl_UtfToUniCharDString\fR(\fIsrc, len, dstPtr\fR)
+.sp
+int
+\fBTcl_UniCharLen\fR(\fIuniStr\fR)
+.sp
+int
+\fBTcl_UniCharNcmp\fR(\fIuniStr, uniStr, num\fR)
+.sp
+int
+\fBTcl_UtfCharComplete\fR(\fIsrc, len\fR)
+.sp
+int
+\fBTcl_NumUtfChars\fR(\fIsrc, len\fR)
+.sp
+char *
+\fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
+.sp
+char *
+\fBTcl_UtfFindLast\fR(\fIsrc, ch\fR)
+.sp
+char *
+\fBTcl_UtfNext\fR(\fIsrc\fR)
+.sp
+char *
+\fBTcl_UtfPrev\fR(\fIsrc, start\fR)
+.sp
+Tcl_UniChar
+\fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
+.sp
+char *
+\fBTcl_UtfAtIndex\fR(\fIsrc, index\fR)
+.sp
+int
+\fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR)
+.SH ARGUMENTS
+.AS "CONST Tcl_UniChar" numChars in/out
+.AP char *buf out
+Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most
+TCL_UTF_MAX bytes are stored in the buffer.
+.AP int ch in
+The Tcl_UniChar to be converted or examined.
+.AP Tcl_UniChar *chPtr out
+Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
+.AP "CONST char" *src in
+Pointer to a UTF-8 string.
+.AP "CONST Tcl_UniChar" *uniStr in
+A NULL-terminated Unicode string.
+.AP int len in
+The length of the UTF-8 string in bytes (not UTF-8 characters). If
+negative, all bytes up to the first null byte are used.
+.AP int numChars in
+The length of the Unicode string in characters. Must be greater than or
+equal to 0.
+.AP "Tcl_DString" *dstPtr in/out
+A pointer to a previously-initialized \fBTcl_DString\fR.
+.AP size_t n in
+The number of Unicode characters to compare in \fBTcl_UniCharNcmp\fR.
+.AP "CONST char" *start in
+Pointer to the beginning of a UTF-8 string.
+.AP int index in
+The index of a character (not byte) in the UTF-8 string.
+.AP int *readPtr out
+If non-NULL, filled with the number of bytes in the backslash sequence,
+including the backslash character.
+.AP char *dst out
+Buffer in which the bytes represented by the backslash sequence are stored.
+At most TCL_UTF_MAX bytes are stored in the buffer.
+.BE
+
+.SH DESCRIPTION
+.PP
+These routines convert between UTF-8 strings and Tcl_UniChars. A
+Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
+quantity. A UTF-8 character is a Unicode character represented as
+a varying-length sequence of up to TCL_UTF_MAX bytes. A multibyte UTF-8
+sequence consists of a lead byte followed by some number of trail bytes.
+.PP
+\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
+represent one Unicode character in the UTF-8 representation.
+.PP
+\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
+in starting at \fIbuf\fR. The return value is the number of bytes stored
+in \fIbuf\fR.
+.PP
+\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
+and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the
+number of bytes read from \fIsrc\fR.. The caller must ensure that the
+source buffer is long enough such that this routine does not run off the
+end and dereference non-existent or random memory; if the source buffer
+is known to be null terminated, this will not happen. If the input is
+not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
+byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
+0x00ff and return 1.
+.PP
+\fBTcl_UniCharToUtfDString\fR converts the given Unicode string
+to UTF-8, storing the result in a previously-initialized \fBTcl_DString\fR.
+You must specify the length of the given Unicode string.
+The return value is a pointer to the UTF-8 representation of the
+Unicode string. Storage for the return value is appended to the
+end of the \fBTcl_DString\fR.
+.PP
+\fBTcl_UtfToUniCharDString\fR coverts the given UTF-8 string to Unicode,
+storing the result in the previously-initialized \fBTcl_Dstring\fR.
+you may either specify the length of the given UTF-8 string or "-1",
+in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to
+calculate the length. The return value is a pointer to the Unicode
+representation of the UTF-8 string. Storage for the return value
+is appended to the end of the \fBTcl_DString\fR. The Unicode string
+is terminated with a Unicode NULL character.
+.PP
+\fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode
+characters. It accepts a NULL-terminated Unicode string and returns
+the number of Unicode characters (not bytes) in that string.
+.PP
+\fBTcl_UniCharNcmp\fR corresponds to \fBstrncmp\fR for Unicode
+characters. It accepts two NULL-terminated Unicode strings
+and the number of characters to compare. (Both strings are
+assumed to be at least \fIlen\fR characters long.)
+\fBTcl_UniCharNcmp\fR compares the two strings character-by-character
+according to the Unicode character ordering. It returns an integer
+greater than, equal to,
+or less than 0 if the first string is greater than, equal to, or
+less than the second string respectively.
+.PP
+\fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
+of length \fIlen\fR bytes is long enough to be decoded by
+\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee
+that the UTF-8 string is properly formed. This routine is used by
+procedures that are operating on a byte at a time and need to know if a
+full Tcl_UniChar has been seen.
+.PP
+\fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It
+returns the number of Tcl_UniChars that are represented by the UTF-8 string
+\fIsrc\fR. The length of the source string is \fIlen\fR bytes. If the
+length is negative, all bytes up to the first NULL byte are used.
+.PP
+\fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It
+returns a pointer to the first occurance of the Tcl_UniChar \fIch\fR
+in the NULL-terminated UTF-8 string \fIsrc\fR. The NULL terminator is
+considered part of the UTF-8 string.
+.PP
+\fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It
+returns a pointer to the last occurance of the Tcl_UniChar \fIch\fR
+in the NULL terminated UTF-8 string \fIsrc\fR. The NULL terminator is
+considered part of the UTF-8 string.
+.PP
+Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
+\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
+string. The caller must not ask for the next character after the last
+character in the string.
+.PP
+Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
+\fBTcl_UtfPrev\fR returns a pointer to the previous UTF-8 character in the
+string. This function will not back up to a position before \fIstart\fR,
+the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the
+return value will be \fIstart\fR.
+.PP
+\fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
+Pascal Ord() function. It returns the Tcl_UniChar represented at the
+specified character (not byte) \fIindex\fR in the UTF-8 string
+\fIsrc\fR. The source string must contain at least \fIindex\fR
+characters.
+.PP
+\fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
+byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must
+contain at least \fIindex\fR characters. This is equivalent to calling
+\fBTcl_UtfNext\fR \fIindex\fR times.
+.PP
+\fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
+commands. It parses a backslash sequence and stores the properly formed
+UTF-8 character represented by the backslash sequence in the output
+buffer \fIdst\fR. At most TCL_UTF_MAX bytes are stored in the buffer.
+\fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number
+of bytes in the backslash sequence, including the backslash character.
+The return value is the number of bytes stored in the output buffer.
+.PP
+See the \fBTcl\fR manual entry for information on the valid backslash
+sequences. All of the sequences described in the Tcl manual entry are
+supported by \fBTcl_UtfBackslash\fR.
+
+.SH KEYWORDS
+utf, unicode, backslash