summaryrefslogtreecommitdiffstats
path: root/doc/Encoding.3
diff options
context:
space:
mode:
Diffstat (limited to 'doc/Encoding.3')
-rw-r--r--doc/Encoding.3242
1 files changed, 156 insertions, 86 deletions
diff --git a/doc/Encoding.3 b/doc/Encoding.3
index 12f8b93..1478c35 100644
--- a/doc/Encoding.3
+++ b/doc/Encoding.3
@@ -4,13 +4,11 @@
'\" See the file "license.terms" for information on usage and redistribution
'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
'\"
-'\" RCS: @(#) $Id: Encoding.3,v 1.10 2002/02/08 02:52:54 dgp Exp $
-'\"
-.so man.macros
.TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures"
+.so man.macros
.BS
.SH NAME
-Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings.
+Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_GetEncodingFromObj, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNameFromEnvironment, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetEncodingSearchPath, Tcl_SetEncodingSearchPath, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings
.SH SYNOPSIS
.nf
\fB#include <tcl.h>\fR
@@ -21,19 +19,22 @@ Tcl_Encoding
void
\fBTcl_FreeEncoding\fR(\fIencoding\fR)
.sp
+int
+\fBTcl_GetEncodingFromObj\fR(\fIinterp, objPtr, encodingPtr\fR)
+.sp
char *
\fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
.sp
-int
-\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
- dstCharsPtr\fR)
-.sp
-char *
+char *
\fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
.sp
int
-\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
- dstCharsPtr\fR)
+\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr,
+ dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR)
+.sp
+int
+\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr,
+ dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR)
.sp
char *
\fBTcl_WinTCharToUtf\fR(\fItsrc, srcLen, dstPtr\fR)
@@ -41,41 +42,52 @@ char *
TCHAR *
\fBTcl_WinUtfToTChar\fR(\fIsrc, srcLen, dstPtr\fR)
.sp
-CONST char *
+const char *
\fBTcl_GetEncodingName\fR(\fIencoding\fR)
.sp
int
\fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR)
.sp
+const char *
+\fBTcl_GetEncodingNameFromEnvironment\fR(\fIbufPtr\fR)
+.sp
void
\fBTcl_GetEncodingNames\fR(\fIinterp\fR)
.sp
Tcl_Encoding
\fBTcl_CreateEncoding\fR(\fItypePtr\fR)
.sp
-CONST char *
+Tcl_Obj *
+\fBTcl_GetEncodingSearchPath\fR()
+.sp
+int
+\fBTcl_SetEncodingSearchPath\fR(\fIsearchPath\fR)
+.sp
+const char *
\fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR)
.sp
void
\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR)
-
-
.SH ARGUMENTS
-.AS Tcl_EncodingState *dstWrotePtr
+.AS "const Tcl_EncodingType" *dstWrotePtr in/out
.AP Tcl_Interp *interp in
Interpreter to use for error reporting, or NULL if no error reporting is
desired.
-.AP "CONST char" *name in
+.AP "const char" *name in
Name of encoding to load.
.AP Tcl_Encoding encoding in
The encoding to query, free, or use for converting text. If \fIencoding\fR is
NULL, the current system encoding is used.
-.AP "CONST char" *src in
+.AP Tcl_Obj *objPtr in
+Name of encoding to get token for.
+.AP Tcl_Encoding *encodingPtr out
+Points to storage where encoding token is to be written.
+.AP "const char" *src in
For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the
specified encoding that are to be converted to UTF-8. For the
\fBTcl_UtfToExternal\fR and \fBTcl_WinUtfToTChar\fR functions, an array of
UTF-8 characters to be converted to the specified encoding.
-.AP "CONST TCHAR" *tsrc in
+.AP "const TCHAR" *tsrc in
An array of Windows TCHAR characters to convert to UTF-8.
.AP int srcLen in
Length of \fIsrc\fR or \fItsrc\fR in bytes. If the length is negative, the
@@ -85,21 +97,21 @@ Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted
result will be stored.
.AP int flags in
Various flag bits OR-ed together.
-TCL_ENCODING_START signifies that the
+\fBTCL_ENCODING_START\fR signifies that the
source buffer is the first block in a (potentially multi-block) input
stream, telling the conversion routine to reset to an initial state and
perform any initialization that needs to occur before the first byte is
-converted. TCL_ENCODING_END signifies that the source buffer is the last
+converted. \fBTCL_ENCODING_END\fR signifies that the source buffer is the last
block in a (potentially multi-block) input stream, telling the conversion
routine to perform any finalization that needs to occur after the last
byte is converted and then to reset to an initial state.
-TCL_ENCODING_STOPONERROR signifies that the conversion routine should
-return immediately upon reading a source character that doesn't exist in
+\fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should
+return immediately upon reading a source character that does not exist in
the target encoding; otherwise a default fallback character will
automatically be substituted.
.AP Tcl_EncodingState *statePtr in/out
Used when converting a (generally long or indefinite length) byte stream
-in a piece by piece fashion. The conversion routine stores its current
+in a piece-by-piece fashion. The conversion routine stores its current
state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the
current piece) has been converted; that state information must be passed
back when converting the next piece of the stream so the conversion
@@ -122,9 +134,13 @@ buffer as a result of the conversion. May be NULL.
.AP int *dstCharsPtr out
Filled with the number of characters that correspond to the number of bytes
stored in the output buffer. May be NULL.
-.AP Tcl_EncodingType *typePtr in
+.AP Tcl_DString *bufPtr out
+Storage for the prescribed system encoding name.
+.AP "const Tcl_EncodingType" *typePtr in
Structure that defines a new type of encoding.
-.AP "CONST char" *path in
+.AP Tcl_Obj *searchPath in
+List of filesystem directories in which to search for encoding data files.
+.AP "const char" *path in
A path to the location of the encoding file.
.BE
.SH INTRODUCTION
@@ -151,7 +167,7 @@ platform-independent manner.
.SH DESCRIPTION
.PP
\fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR. The name may
-refer to a builtin Tcl encoding, a user-defined encoding registered by
+refer to a built-in Tcl encoding, a user-defined encoding registered by
calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding
file. The return value is a token that represents the encoding and can be
used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR,
@@ -172,9 +188,21 @@ anywhere (i.e., it has been freed as many times as it has been gotten)
\fBTcl_FreeEncoding\fR will release all storage the encoding was using
and delete it from the database.
.PP
+\fBTcl_GetEncodingFromObj\fR treats the string representation of
+\fIobjPtr\fR as an encoding name, and finds an encoding with that
+name, just as \fBTcl_GetEncoding\fR does. When an encoding is found,
+it is cached within the \fBobjPtr\fR value for future reference, the
+\fBTcl_Encoding\fR token is written to the storage pointed to by
+\fIencodingPtr\fR, and the value \fBTCL_OK\fR is returned. If no such
+encoding is found, the value \fBTCL_ERROR\fR is returned, and no
+writing to \fB*\fR\fIencodingPtr\fR takes place. Just as with
+\fBTcl_GetEncoding\fR, the caller should call \fBTcl_FreeEncoding\fR
+on the resulting encoding token when that token will no longer be
+used.
+.PP
\fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the
specified \fIencoding\fR into UTF-8. The converted bytes are stored in
-\fIdstPtr\fR, which is then NULL terminated. The caller should eventually
+\fIdstPtr\fR, which is then null-terminated. The caller should eventually
call \fBTcl_DStringFree\fR to free any information stored in \fIdstPtr\fR.
When converting, if any of the characters in the source buffer cannot be
represented in the target encoding, a default fallback character will be
@@ -194,7 +222,7 @@ All bytes of \fIsrc\fR were converted.
The destination buffer was not large enough for all of the converted data; as
many characters as could fit were converted though.
.IP \fBTCL_CONVERT_MULTIBYTE\fR 29
-The last fews bytes in the source buffer were the beginning of a multibyte
+The last few bytes in the source buffer were the beginning of a multibyte
sequence, but more bytes were needed to complete this sequence. A
subsequent call to the conversion routine should pass a buffer containing
the unconverted bytes that remained in \fIsrc\fR plus some further bytes
@@ -206,13 +234,13 @@ if the input stream has been damaged or if the input encoding method was
misidentified.
.IP \fBTCL_CONVERT_UNKNOWN\fR 29
The source buffer contained a character that could not be represented in
-the target encoding and TCL_ENCODING_STOPONERROR was specified.
+the target encoding and \fBTCL_ENCODING_STOPONERROR\fR was specified.
.RE
.LP
\fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8
into the specified \fIencoding\fR. The converted bytes are stored in
\fIdstPtr\fR, which is then terminated with the appropriate encoding-specific
-NULL. The caller should eventually call \fBTcl_DStringFree\fR to free any
+null. The caller should eventually call \fBTcl_DStringFree\fR to free any
information stored in \fIdstPtr\fR. When converting, if any of the
characters in the source buffer cannot be represented in the target
encoding, a default fallback character will be used. The return value is
@@ -230,21 +258,29 @@ is filled with the corresponding number of bytes that were stored in
\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR are
Windows-only convenience
functions for converting between UTF-8 and Windows strings. On Windows 95
-(as with the Macintosh and Unix operating systems),
-all strings exchanged between Tcl and the operating system are "char"
+(as with the Unix operating system),
+all strings exchanged between Tcl and the operating system are
+.QW "char"
based. On Windows NT, some strings exchanged between Tcl and the
-operating system are "char" oriented while others are in Unicode. By
+operating system are
+.QW "char"
+oriented while others are in Unicode. By
convention, in Windows a TCHAR is a character in the ANSI code page
on Windows 95 and a Unicode character on Windows NT.
.PP
-If you planned to use the same "char" based interfaces on both Windows
+If you planned to use the same
+.QW "char"
+based interfaces on both Windows
95 and Windows NT, you could use \fBTcl_UtfToExternal\fR and
\fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an
encoding of NULL (the current system encoding). On the other hand,
if you planned to use the Unicode interface when running on Windows NT
-and the "char" interfaces when running on Windows 95, you would have
+and the
+.QW "char"
+interfaces when running on Windows 95, you would have
to perform the following type of test over and over in your program
-(as represented in psuedo-code):
+(as represented in pseudo-code):
+.PP
.CS
if (running NT) {
encoding <- Tcl_GetEncoding("unicode");
@@ -252,7 +288,9 @@ if (running NT) {
Tcl_FreeEncoding(encoding);
} else {
nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
+}
.CE
+.PP
\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically
handle this test and use the proper encoding based on the current
operating system. \fBTcl_WinUtfToTChar\fR returns a pointer to
@@ -271,11 +309,18 @@ was used to create the encoding. The string returned by
whenever the user passes a NULL value for the \fIencoding\fR argument to
any of the other encoding functions. If \fIname\fR is NULL, the system
encoding is reset to the default system encoding, \fBbinary\fR. If the
-name did not refer to any known or loadable encoding, TCL_ERROR is
+name did not refer to any known or loadable encoding, \fBTCL_ERROR\fR is
returned and an error message is left in \fIinterp\fR. Otherwise, this
procedure increments the reference count of the new system encoding,
decrements the reference count of the old system encoding, and returns
-TCL_OK.
+\fBTCL_OK\fR.
+.PP
+\fBTcl_GetEncodingNameFromEnvironment\fR provides a means for the Tcl
+library to report the encoding name it believes to be the correct one
+to use as the system encoding, based on system calls and examination of
+the environment suitable for the platform. It accepts \fIbufPtr\fR,
+a pointer to an uninitialized or freed \fBTcl_DString\fR and writes
+the encoding name to it. The \fBTcl_DStringValue\fR is returned.
.PP
\fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list
consisting of the names of all the encodings that are currently defined
@@ -303,13 +348,13 @@ convert between this encoding and UTF-8. It is defined as follows:
.PP
.CS
typedef struct Tcl_EncodingType {
- CONST char *\fIencodingName\fR;
- Tcl_EncodingConvertProc *\fItoUtfProc\fR;
- Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
- Tcl_EncodingFreeProc *\fIfreeProc\fR;
- ClientData \fIclientData\fR;
- int \fInullSize\fR;
-} Tcl_EncodingType;
+ const char *\fIencodingName\fR;
+ Tcl_EncodingConvertProc *\fItoUtfProc\fR;
+ Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
+ Tcl_EncodingFreeProc *\fIfreeProc\fR;
+ ClientData \fIclientData\fR;
+ int \fInullSize\fR;
+} \fBTcl_EncodingType\fR;
.CE
.PP
The \fIencodingName\fR provides a string name for the encoding, by
@@ -337,17 +382,17 @@ The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the
type \fBTcl_EncodingConvertProc\fR:
.PP
.CS
-typedef int Tcl_EncodingConvertProc(
- ClientData \fIclientData\fR,
- CONST char *\fIsrc\fR,
- int \fIsrcLen\fR,
- int \fIflags\fR,
- Tcl_Encoding *\fIstatePtr\fR,
- char *\fIdst\fR,
- int \fIdstLen\fR,
- int *\fIsrcReadPtr\fR,
- int *\fIdstWrotePtr\fR,
- int *\fIdstCharsPtr\fR);
+typedef int \fBTcl_EncodingConvertProc\fR(
+ ClientData \fIclientData\fR,
+ const char *\fIsrc\fR,
+ int \fIsrcLen\fR,
+ int \fIflags\fR,
+ Tcl_EncodingState *\fIstatePtr\fR,
+ char *\fIdst\fR,
+ int \fIdstLen\fR,
+ int *\fIsrcReadPtr\fR,
+ int *\fIdstWrotePtr\fR,
+ int *\fIdstCharsPtr\fR);
.CE
.PP
The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the
@@ -367,22 +412,43 @@ procedure will be a non-NULL location.
.PP
The callback procedure \fIfreeProc\fR, if non-NULL, should match the type
\fBTcl_EncodingFreeProc\fR:
+.PP
.CS
-typedef void Tcl_EncodingFreeProc(
- ClientData \fIclientData\fR);
+typedef void \fBTcl_EncodingFreeProc\fR(
+ ClientData \fIclientData\fR);
.CE
.PP
This \fIfreeProc\fR function is called when the encoding is deleted. The
\fIclientData\fR parameter is the same as the \fIclientData\fR field
specified to \fBTcl_CreateEncoding\fR when the encoding was created.
.PP
-
+\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR
+are called to access and set the list of filesystem directories searched
+for encoding data files.
+.PP
+The value returned by \fBTcl_GetEncodingSearchPath\fR
+is the value stored by the last successful call to
+\fBTcl_SetEncodingSearchPath\fR. If no calls to
+\fBTcl_SetEncodingSearchPath\fR have occurred, Tcl will compute an initial
+value based on the environment. There is one encoding search path for the
+entire process, shared by all threads in the process.
+.PP
+\fBTcl_SetEncodingSearchPath\fR stores \fIsearchPath\fR and returns
+\fBTCL_OK\fR, unless \fIsearchPath\fR is not a valid Tcl list, which
+causes \fBTCL_ERROR\fR to be returned. The elements of \fIsearchPath\fR
+are not verified as existing readable filesystem directories. When
+searching for encoding data files takes place, and non-existent or
+non-readable filesystem directories on the \fIsearchPath\fR are silently
+ignored.
+.PP
\fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR
-access and set the directory to use when locating the default encoding
-files. If this value is not NULL, the \fBTclpInitLibraryPath\fR routine
-appends the path to the head of the search path, and uses this path as
-the first place to look into when trying to locate the encoding file.
-
+are obsolete interfaces best replaced with calls to
+\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR.
+They are called to access and set the first element of the \fIsearchPath\fR
+list. Since Tcl searches \fIsearchPath\fR for encoding data files in
+list order, these routines establish the
+.QW default
+directory in which to find encoding data files.
.SH "ENCODING FILES"
Space would prohibit precompiling into Tcl every possible encoding
algorithm, so many encodings are stored on disk as dynamically-loadable
@@ -394,23 +460,25 @@ external encoding may consist of single-byte, multi-byte, or double-byte
characters.
.PP
Each dynamically-loadable encoding is represented as a text file. The
-initial line of the file, beginning with a ``#'' symbol, is a comment
+initial line of the file, beginning with a
+.QW #
+symbol, is a comment
that provides a human-readable description of the file. The next line
identifies the type of encoding file. It can be one of the following
letters:
-.IP "[1] \fBS\fR"
+.IP "[1] \fBS\fR"
A single-byte encoding, where one character is always one byte long in the
encoding. An example is \fBiso8859-1\fR, used by many European languages.
-.IP "[2] \fBD\fR"
+.IP "[2] \fBD\fR"
A double-byte encoding, where one character is always two bytes long in the
encoding. An example is \fBbig5\fR, used for Chinese text.
-.IP "[3] \fBM\fR"
+.IP "[3] \fBM\fR"
A multi-byte encoding, where one character may be either one or two bytes long.
-Certain bytes are a lead bytes, indicating that another byte must follow
+Certain bytes are lead bytes, indicating that another byte must follow
and that together the two bytes represent one character. Other bytes are not
lead bytes and represent themselves. An example is \fBshiftjis\fR, used by
many Japanese computers.
-.IP "[4] \fBE\fR"
+.IP "[4] \fBE\fR"
An escape-sequence encoding, specifying that certain sequences of bytes
do not represent characters, but commands that describe how following bytes
should be interpreted.
@@ -421,6 +489,7 @@ Cases [1], [2], and [3] are collectively referred to as table-based encoding
files. The lines in a table-based encoding file are in the same
format as this example taken from the \fBshiftjis\fR encoding (this is not
the complete file):
+.PP
.CS
# Encoding file: shiftjis, multi-byte
M
@@ -480,25 +549,26 @@ and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively.
Following the first page will be all the other pages, each in the same
format as the first: one number identifying the page followed by 256
double-byte Unicode characters. If a character in the encoding maps to the
-Unicode character 0000, it means that the character doesn't actually exist.
+Unicode character 0000, it means that the character does not actually exist.
If all characters on a page would map to 0000, that page can be omitted.
.PP
Case [4] is the escape-sequence encoding file. The lines in an this type of
file are in the same format as this example taken from the \fBiso2022-jp\fR
encoding:
+.PP
.CS
.ta 1.5i
# Encoding file: iso2022-jp, escape-driven
E
init {}
final {}
-iso8859-1 \\x1b(B
-jis0201 \\x1b(J
-jis0208 \\x1b$@
-jis0208 \\x1b$B
-jis0212 \\x1b$(D
-gb2312 \\x1b$A
-ksc5601 \\x1b$(C
+iso8859-1 \ex1b(B
+jis0201 \ex1b(J
+jis0208 \ex1b$@
+jis0208 \ex1b$B
+jis0212 \ex1b$(D
+gb2312 \ex1b$A
+ksc5601 \ex1b$(C
.CE
.PP
In the file, the first column represents an option and the second column
@@ -507,16 +577,16 @@ the first character is converted, while \fBfinal\fR is a string to emit
or expect after the last character. All other options are names of
table-based encodings; the associated value is the escape-sequence that
marks that encoding. Tcl syntax is used for the values; in the above
-example, for instance, ``\fB{}\fR'' represents the empty string and
-``\fB\\x1b\fR'' represents character 27.
+example, for instance,
+.QW \fB{}\fR
+represents the empty string and
+.QW \fB\ex1b\fR
+represents character 27.
.PP
When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not
been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR
-from the \fBencoding\fR subdirectory of each directory specified in the
-library path \fB$tcl_libPath\fR. If the encoding file exists, but is
+from the \fBencoding\fR subdirectory of each directory that Tcl searches
+for its script library. If the encoding file exists, but is
malformed, an error message will be left in \fIinterp\fR.
.SH KEYWORDS
utf, encoding, convert
-
-
-