summary refs log tree commit diff stats
path: root/doc/Encoding.3
diff options
context:
space:
mode:
Diffstat (limited to 'doc/Encoding.3')
-rw-r--r-- doc/Encoding.3 152
1 files changed, 111 insertions, 41 deletions
diff --git a/doc/Encoding.3 b/doc/Encoding.3
index c365aaf..1478c35 100644
--- a/doc/Encoding.3
+++ b/doc/Encoding.3
@@ -4,13 +4,11 @@
'\" See the file "license.terms" for information on usage and redistribution
'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
'\"
-'\" RCS: @(#) $Id: Encoding.3,v 1.20 2004/10/07 15:15:37 dkf Exp $
-'\"
-.so man.macros
.TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures"
+.so man.macros
.BS
.SH NAME
-Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings
+Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_GetEncodingFromObj, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNameFromEnvironment, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetEncodingSearchPath, Tcl_SetEncodingSearchPath, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings
.SH SYNOPSIS
.nf
\fB#include <tcl.h>\fR
@@ -21,6 +19,9 @@ Tcl_Encoding
void
\fBTcl_FreeEncoding\fR(\fIencoding\fR)
.sp
+int
+\fBTcl_GetEncodingFromObj\fR(\fIinterp, objPtr, encodingPtr\fR)
+.sp
char *
\fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
.sp
@@ -47,20 +48,28 @@ const char *
int
\fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR)
.sp
+const char *
+\fBTcl_GetEncodingNameFromEnvironment\fR(\fIbufPtr\fR)
+.sp
void
\fBTcl_GetEncodingNames\fR(\fIinterp\fR)
.sp
Tcl_Encoding
\fBTcl_CreateEncoding\fR(\fItypePtr\fR)
.sp
+Tcl_Obj *
+\fBTcl_GetEncodingSearchPath\fR()
+.sp
+int
+\fBTcl_SetEncodingSearchPath\fR(\fIsearchPath\fR)
+.sp
const char *
\fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR)
.sp
void
\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR)
-
.SH ARGUMENTS
-.AS Tcl_EncodingState *dstWrotePtr in/out
+.AS "const Tcl_EncodingType" *dstWrotePtr in/out
.AP Tcl_Interp *interp in
Interpreter to use for error reporting, or NULL if no error reporting is
desired.
@@ -69,6 +78,10 @@ Name of encoding to load.
.AP Tcl_Encoding encoding in
The encoding to query, free, or use for converting text. If \fIencoding\fR is
NULL, the current system encoding is used.
+.AP Tcl_Obj *objPtr in
+Name of encoding to get token for.
+.AP Tcl_Encoding *encodingPtr out
+Points to storage where encoding token is to be written.
.AP "const char" *src in
For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the
specified encoding that are to be converted to UTF-8. For the
@@ -93,7 +106,7 @@ block in a (potentially multi-block) input stream, telling the conversion
routine to perform any finalization that needs to occur after the last
byte is converted and then to reset to an initial state.
\fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should
-return immediately upon reading a source character that doesn't exist in
+return immediately upon reading a source character that does not exist in
the target encoding; otherwise a default fallback character will
automatically be substituted.
.AP Tcl_EncodingState *statePtr in/out
@@ -121,8 +134,12 @@ buffer as a result of the conversion. May be NULL.
.AP int *dstCharsPtr out
Filled with the number of characters that correspond to the number of bytes
stored in the output buffer. May be NULL.
-.AP Tcl_EncodingType *typePtr in
+.AP Tcl_DString *bufPtr out
+Storage for the prescribed system encoding name.
+.AP "const Tcl_EncodingType" *typePtr in
Structure that defines a new type of encoding.
+.AP Tcl_Obj *searchPath in
+List of filesystem directories in which to search for encoding data files.
.AP "const char" *path in
A path to the location of the encoding file.
.BE
@@ -171,6 +188,18 @@ anywhere (i.e., it has been freed as many times as it has been gotten)
\fBTcl_FreeEncoding\fR will release all storage the encoding was using
and delete it from the database.
.PP
+\fBTcl_GetEncodingFromObj\fR treats the string representation of
+\fIobjPtr\fR as an encoding name, and finds an encoding with that
+name, just as \fBTcl_GetEncoding\fR does. When an encoding is found,
+it is cached within the \fIobjPtr\fR value for future reference, the
+\fBTcl_Encoding\fR token is written to the storage pointed to by
+\fIencodingPtr\fR, and the value \fBTCL_OK\fR is returned. If no such
+encoding is found, the value \fBTCL_ERROR\fR is returned, and no
+writing to \fB*\fR\fIencodingPtr\fR takes place. Just as with
+\fBTcl_GetEncoding\fR, the caller should call \fBTcl_FreeEncoding\fR
+on the resulting encoding token when that token will no longer be
+used.
+.PP
\fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the
specified \fIencoding\fR into UTF-8. The converted bytes are stored in
\fIdstPtr\fR, which is then null-terminated. The caller should eventually
@@ -230,20 +259,28 @@ is filled with the corresponding number of bytes that were stored in
Windows-only convenience
functions for converting between UTF-8 and Windows strings. On Windows 95
(as with the Unix operating system),
-all strings exchanged between Tcl and the operating system are "char"
+all strings exchanged between Tcl and the operating system are
+.QW "char"
based. On Windows NT, some strings exchanged between Tcl and the
-operating system are "char" oriented while others are in Unicode. By
+operating system are
+.QW "char"
+oriented while others are in Unicode. By
convention, in Windows a TCHAR is a character in the ANSI code page
on Windows 95 and a Unicode character on Windows NT.
.PP
-If you planned to use the same "char" based interfaces on both Windows
+If you planned to use the same
+.QW "char"
+based interfaces on both Windows
95 and Windows NT, you could use \fBTcl_UtfToExternal\fR and
\fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an
encoding of NULL (the current system encoding). On the other hand,
if you planned to use the Unicode interface when running on Windows NT
-and the "char" interfaces when running on Windows 95, you would have
+and the
+.QW "char"
+interfaces when running on Windows 95, you would have
to perform the following type of test over and over in your program
(as represented in pseudo-code):
+.PP
.CS
if (running NT) {
encoding <- Tcl_GetEncoding("unicode");
@@ -253,6 +290,7 @@ if (running NT) {
nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
}
.CE
+.PP
\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically
handle this test and use the proper encoding based on the current
operating system. \fBTcl_WinUtfToTChar\fR returns a pointer to
@@ -277,6 +315,13 @@ procedure increments the reference count of the new system encoding,
decrements the reference count of the old system encoding, and returns
\fBTCL_OK\fR.
.PP
+\fBTcl_GetEncodingNameFromEnvironment\fR provides a means for the Tcl
+library to report the encoding name it believes to be the correct one
+to use as the system encoding, based on system calls and examination of
+the environment suitable for the platform. It accepts \fIbufPtr\fR,
+a pointer to an uninitialized or freed \fBTcl_DString\fR, and writes
+the encoding name to it. The \fBTcl_DStringValue\fR is returned.
+.PP
\fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list
consisting of the names of all the encodings that are currently defined
or can be dynamically loaded, searching the encoding path specified by
@@ -303,13 +348,13 @@ convert between this encoding and UTF-8. It is defined as follows:
.PP
.CS
typedef struct Tcl_EncodingType {
- const char *\fIencodingName\fR;
- Tcl_EncodingConvertProc *\fItoUtfProc\fR;
- Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
- Tcl_EncodingFreeProc *\fIfreeProc\fR;
- ClientData \fIclientData\fR;
- int \fInullSize\fR;
-} Tcl_EncodingType;
+ const char *\fIencodingName\fR;
+ Tcl_EncodingConvertProc *\fItoUtfProc\fR;
+ Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
+ Tcl_EncodingFreeProc *\fIfreeProc\fR;
+ ClientData \fIclientData\fR;
+ int \fInullSize\fR;
+} \fBTcl_EncodingType\fR;
.CE
.PP
The \fIencodingName\fR provides a string name for the encoding, by
@@ -337,7 +382,7 @@ The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the
type \fBTcl_EncodingConvertProc\fR:
.PP
.CS
-typedef int Tcl_EncodingConvertProc(
+typedef int \fBTcl_EncodingConvertProc\fR(
ClientData \fIclientData\fR,
const char *\fIsrc\fR,
int \fIsrcLen\fR,
@@ -367,8 +412,9 @@ procedure will be a non-NULL location.
.PP
The callback procedure \fIfreeProc\fR, if non-NULL, should match the type
\fBTcl_EncodingFreeProc\fR:
+.PP
.CS
-typedef void Tcl_EncodingFreeProc(
+typedef void \fBTcl_EncodingFreeProc\fR(
ClientData \fIclientData\fR);
.CE
.PP
@@ -376,13 +422,33 @@ This \fIfreeProc\fR function is called when the encoding is deleted. The
\fIclientData\fR parameter is the same as the \fIclientData\fR field
specified to \fBTcl_CreateEncoding\fR when the encoding was created.
.PP
-
+\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR
+are called to access and set the list of filesystem directories searched
+for encoding data files.
+.PP
+The value returned by \fBTcl_GetEncodingSearchPath\fR
+is the value stored by the last successful call to
+\fBTcl_SetEncodingSearchPath\fR. If no calls to
+\fBTcl_SetEncodingSearchPath\fR have occurred, Tcl will compute an initial
+value based on the environment. There is one encoding search path for the
+entire process, shared by all threads in the process.
+.PP
+\fBTcl_SetEncodingSearchPath\fR stores \fIsearchPath\fR and returns
+\fBTCL_OK\fR, unless \fIsearchPath\fR is not a valid Tcl list, which
+causes \fBTCL_ERROR\fR to be returned. The elements of \fIsearchPath\fR
+are not verified as existing readable filesystem directories. When
+searching for encoding data files takes place, non-existent or
+non-readable filesystem directories on the \fIsearchPath\fR are silently
+ignored.
+.PP
\fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR
-access and set the directory to use when locating the default encoding
-files. If this value is not NULL, the \fBTclpInitLibraryPath\fR routine
-appends the path to the head of the search path, and uses this path as
-the first place to look into when trying to locate the encoding file.
-
+are obsolete interfaces best replaced with calls to
+\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR.
+They are called to access and set the first element of the \fIsearchPath\fR
+list. Since Tcl searches \fIsearchPath\fR for encoding data files in
+list order, these routines establish the
+.QW default
+directory in which to find encoding data files.
.SH "ENCODING FILES"
Space would prohibit precompiling into Tcl every possible encoding
algorithm, so many encodings are stored on disk as dynamically-loadable
@@ -394,7 +460,9 @@ external encoding may consist of single-byte, multi-byte, or double-byte
characters.
.PP
Each dynamically-loadable encoding is represented as a text file. The
-initial line of the file, beginning with a ``#'' symbol, is a comment
+initial line of the file, beginning with a
+.QW #
+symbol, is a comment
that provides a human-readable description of the file. The next line
identifies the type of encoding file. It can be one of the following
letters:
@@ -421,6 +489,7 @@ Cases [1], [2], and [3] are collectively referred to as table-based encoding
files. The lines in a table-based encoding file are in the same
format as this example taken from the \fBshiftjis\fR encoding (this is not
the complete file):
+.PP
.CS
# Encoding file: shiftjis, multi-byte
M
@@ -480,25 +549,26 @@ and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively.
Following the first page will be all the other pages, each in the same
format as the first: one number identifying the page followed by 256
double-byte Unicode characters. If a character in the encoding maps to the
-Unicode character 0000, it means that the character doesn't actually exist.
+Unicode character 0000, it means that the character does not actually exist.
If all characters on a page would map to 0000, that page can be omitted.
.PP
Case [4] is the escape-sequence encoding file. The lines in this type of
file are in the same format as this example taken from the \fBiso2022-jp\fR
encoding:
+.PP
.CS
.ta 1.5i
# Encoding file: iso2022-jp, escape-driven
E
init {}
final {}
-iso8859-1 \\x1b(B
-jis0201 \\x1b(J
-jis0208 \\x1b$@
-jis0208 \\x1b$B
-jis0212 \\x1b$(D
-gb2312 \\x1b$A
-ksc5601 \\x1b$(C
+iso8859-1 \ex1b(B
+jis0201 \ex1b(J
+jis0208 \ex1b$@
+jis0208 \ex1b$B
+jis0212 \ex1b$(D
+gb2312 \ex1b$A
+ksc5601 \ex1b$(C
.CE
.PP
In the file, the first column represents an option and the second column
@@ -507,8 +577,11 @@ the first character is converted, while \fBfinal\fR is a string to emit
or expect after the last character. All other options are names of
table-based encodings; the associated value is the escape-sequence that
marks that encoding. Tcl syntax is used for the values; in the above
-example, for instance, ``\fB{}\fR'' represents the empty string and
-``\fB\\x1b\fR'' represents character 27.
+example, for instance,
+.QW \fB{}\fR
+represents the empty string and
+.QW \fB\ex1b\fR
+represents character 27.
.PP
When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not
been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR
@@ -517,6 +590,3 @@ for its script library. If the encoding file exists, but is
malformed, an error message will be left in \fIinterp\fR.
.SH KEYWORDS
utf, encoding, convert
-
-
-