diff options
Diffstat (limited to 'doc/Encoding.3')
-rw-r--r-- | doc/Encoding.3 | 152 |
1 files changed, 111 insertions, 41 deletions
diff --git a/doc/Encoding.3 b/doc/Encoding.3 index c365aaf..1478c35 100644 --- a/doc/Encoding.3 +++ b/doc/Encoding.3 @@ -4,13 +4,11 @@ '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" -'\" RCS: @(#) $Id: Encoding.3,v 1.20 2004/10/07 15:15:37 dkf Exp $ -'\" -.so man.macros .TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures" +.so man.macros .BS .SH NAME -Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings +Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_GetEncodingFromObj, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNameFromEnvironment, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetEncodingSearchPath, Tcl_SetEncodingSearchPath, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings .SH SYNOPSIS .nf \fB#include <tcl.h>\fR @@ -21,6 +19,9 @@ Tcl_Encoding void \fBTcl_FreeEncoding\fR(\fIencoding\fR) .sp +int +\fBTcl_GetEncodingFromObj\fR(\fIinterp, objPtr, encodingPtr\fR) +.sp char * \fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR) .sp @@ -47,20 +48,28 @@ const char * int \fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR) .sp +const char * +\fBTcl_GetEncodingNameFromEnvironment\fR(\fIbufPtr\fR) +.sp void \fBTcl_GetEncodingNames\fR(\fIinterp\fR) .sp Tcl_Encoding \fBTcl_CreateEncoding\fR(\fItypePtr\fR) .sp +Tcl_Obj * +\fBTcl_GetEncodingSearchPath\fR() +.sp +int +\fBTcl_SetEncodingSearchPath\fR(\fIsearchPath\fR) +.sp const char * \fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR) .sp void 
\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR) - .SH ARGUMENTS -.AS Tcl_EncodingState *dstWrotePtr in/out +.AS "const Tcl_EncodingType" *dstWrotePtr in/out .AP Tcl_Interp *interp in Interpreter to use for error reporting, or NULL if no error reporting is desired. @@ -69,6 +78,10 @@ Name of encoding to load. .AP Tcl_Encoding encoding in The encoding to query, free, or use for converting text. If \fIencoding\fR is NULL, the current system encoding is used. +.AP Tcl_Obj *objPtr in +Name of encoding to get token for. +.AP Tcl_Encoding *encodingPtr out +Points to storage where encoding token is to be written. .AP "const char" *src in For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the specified encoding that are to be converted to UTF-8. For the @@ -93,7 +106,7 @@ block in a (potentially multi-block) input stream, telling the conversion routine to perform any finalization that needs to occur after the last byte is converted and then to reset to an initial state. \fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should -return immediately upon reading a source character that doesn't exist in +return immediately upon reading a source character that does not exist in the target encoding; otherwise a default fallback character will automatically be substituted. .AP Tcl_EncodingState *statePtr in/out @@ -121,8 +134,12 @@ buffer as a result of the conversion. May be NULL. .AP int *dstCharsPtr out Filled with the number of characters that correspond to the number of bytes stored in the output buffer. May be NULL. -.AP Tcl_EncodingType *typePtr in +.AP Tcl_DString *bufPtr out +Storage for the prescribed system encoding name. +.AP "const Tcl_EncodingType" *typePtr in Structure that defines a new type of encoding. +.AP Tcl_Obj *searchPath in +List of filesystem directories in which to search for encoding data files. .AP "const char" *path in A path to the location of the encoding file. 
.BE @@ -171,6 +188,18 @@ anywhere (i.e., it has been freed as many times as it has been gotten) \fBTcl_FreeEncoding\fR will release all storage the encoding was using and delete it from the database. .PP +\fBTcl_GetEncodingFromObj\fR treats the string representation of +\fIobjPtr\fR as an encoding name, and finds an encoding with that +name, just as \fBTcl_GetEncoding\fR does. When an encoding is found, +it is cached within the \fIobjPtr\fR value for future reference, the +\fBTcl_Encoding\fR token is written to the storage pointed to by +\fIencodingPtr\fR, and the value \fBTCL_OK\fR is returned. If no such +encoding is found, the value \fBTCL_ERROR\fR is returned, and no +writing to \fB*\fR\fIencodingPtr\fR takes place. Just as with +\fBTcl_GetEncoding\fR, the caller should call \fBTcl_FreeEncoding\fR +on the resulting encoding token when that token will no longer be +used. +.PP \fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the specified \fIencoding\fR into UTF-8. The converted bytes are stored in \fIdstPtr\fR, which is then null-terminated. The caller should eventually @@ -230,20 +259,28 @@ is filled with the corresponding number of bytes that were stored in Windows-only convenience functions for converting between UTF-8 and Windows strings. On Windows 95 (as with the Unix operating system), -all strings exchanged between Tcl and the operating system are "char" +all strings exchanged between Tcl and the operating system are +.QW "char" based. On Windows NT, some strings exchanged between Tcl and the -operating system are "char" oriented while others are in Unicode. By +operating system are +.QW "char" +oriented while others are in Unicode. By convention, in Windows a TCHAR is a character in the ANSI code page on Windows 95 and a Unicode character on Windows NT. 
.PP -If you planned to use the same "char" based interfaces on both Windows +If you planned to use the same +.QW "char" +based interfaces on both Windows 95 and Windows NT, you could use \fBTcl_UtfToExternal\fR and \fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an encoding of NULL (the current system encoding). On the other hand, if you planned to use the Unicode interface when running on Windows NT -and the "char" interfaces when running on Windows 95, you would have +and the +.QW "char" +interfaces when running on Windows 95, you would have to perform the following type of test over and over in your program (as represented in pseudo-code): +.PP .CS if (running NT) { encoding <- Tcl_GetEncoding("unicode"); @@ -253,6 +290,7 @@ if (running NT) { nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer); } .CE +.PP \fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically handle this test and use the proper encoding based on the current operating system. \fBTcl_WinUtfToTChar\fR returns a pointer to @@ -277,6 +315,13 @@ procedure increments the reference count of the new system encoding, decrements the reference count of the old system encoding, and returns \fBTCL_OK\fR. .PP +\fBTcl_GetEncodingNameFromEnvironment\fR provides a means for the Tcl +library to report the encoding name it believes to be the correct one +to use as the system encoding, based on system calls and examination of +the environment suitable for the platform. It accepts \fIbufPtr\fR, +a pointer to an uninitialized or freed \fBTcl_DString\fR and writes +the encoding name to it. The \fBTcl_DStringValue\fR is returned. +.PP \fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list consisting of the names of all the encodings that are currently defined or can be dynamically loaded, searching the encoding path specified by @@ -303,13 +348,13 @@ convert between this encoding and UTF-8. 
It is defined as follows: .PP .CS typedef struct Tcl_EncodingType { - const char *\fIencodingName\fR; - Tcl_EncodingConvertProc *\fItoUtfProc\fR; - Tcl_EncodingConvertProc *\fIfromUtfProc\fR; - Tcl_EncodingFreeProc *\fIfreeProc\fR; - ClientData \fIclientData\fR; - int \fInullSize\fR; -} Tcl_EncodingType; + const char *\fIencodingName\fR; + Tcl_EncodingConvertProc *\fItoUtfProc\fR; + Tcl_EncodingConvertProc *\fIfromUtfProc\fR; + Tcl_EncodingFreeProc *\fIfreeProc\fR; + ClientData \fIclientData\fR; + int \fInullSize\fR; +} \fBTcl_EncodingType\fR; .CE .PP The \fIencodingName\fR provides a string name for the encoding, by @@ -337,7 +382,7 @@ The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the type \fBTcl_EncodingConvertProc\fR: .PP .CS -typedef int Tcl_EncodingConvertProc( +typedef int \fBTcl_EncodingConvertProc\fR( ClientData \fIclientData\fR, const char *\fIsrc\fR, int \fIsrcLen\fR, @@ -367,8 +412,9 @@ procedure will be a non-NULL location. .PP The callback procedure \fIfreeProc\fR, if non-NULL, should match the type \fBTcl_EncodingFreeProc\fR: +.PP .CS -typedef void Tcl_EncodingFreeProc( +typedef void \fBTcl_EncodingFreeProc\fR( ClientData \fIclientData\fR); .CE .PP @@ -376,13 +422,33 @@ This \fIfreeProc\fR function is called when the encoding is deleted. The \fIclientData\fR parameter is the same as the \fIclientData\fR field specified to \fBTcl_CreateEncoding\fR when the encoding was created. .PP - +\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR +are called to access and set the list of filesystem directories searched +for encoding data files. +.PP +The value returned by \fBTcl_GetEncodingSearchPath\fR +is the value stored by the last successful call to +\fBTcl_SetEncodingSearchPath\fR. If no calls to +\fBTcl_SetEncodingSearchPath\fR have occurred, Tcl will compute an initial +value based on the environment. There is one encoding search path for the +entire process, shared by all threads in the process. 
+.PP +\fBTcl_SetEncodingSearchPath\fR stores \fIsearchPath\fR and returns +\fBTCL_OK\fR, unless \fIsearchPath\fR is not a valid Tcl list, which +causes \fBTCL_ERROR\fR to be returned. The elements of \fIsearchPath\fR +are not verified as existing readable filesystem directories. When +searching for encoding data files takes place, non-existent or +non-readable filesystem directories on the \fIsearchPath\fR are silently +ignored. +.PP \fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR -access and set the directory to use when locating the default encoding -files. If this value is not NULL, the \fBTclpInitLibraryPath\fR routine -appends the path to the head of the search path, and uses this path as -the first place to look into when trying to locate the encoding file. - +are obsolete interfaces best replaced with calls to +\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR. +They are called to access and set the first element of the \fIsearchPath\fR +list. Since Tcl searches \fIsearchPath\fR for encoding data files in +list order, these routines establish the +.QW default +directory in which to find encoding data files. .SH "ENCODING FILES" Space would prohibit precompiling into Tcl every possible encoding algorithm, so many encodings are stored on disk as dynamically-loadable @@ -394,7 +460,9 @@ external encoding may consist of single-byte, multi-byte, or double-byte characters. .PP Each dynamically-loadable encoding is represented as a text file. The -initial line of the file, beginning with a ``#'' symbol, is a comment +initial line of the file, beginning with a +.QW # +symbol, is a comment that provides a human-readable description of the file. The next line identifies the type of encoding file. It can be one of the following letters: @@ -421,6 +489,7 @@ Cases [1], [2], and [3] are collectively referred to as table-based encoding files. 
The lines in a table-based encoding file are in the same format as this example taken from the \fBshiftjis\fR encoding (this is not the complete file): +.PP .CS # Encoding file: shiftjis, multi-byte M @@ -480,25 +549,26 @@ and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively. Following the first page will be all the other pages, each in the same format as the first: one number identifying the page followed by 256 double-byte Unicode characters. If a character in the encoding maps to the -Unicode character 0000, it means that the character doesn't actually exist. +Unicode character 0000, it means that the character does not actually exist. If all characters on a page would map to 0000, that page can be omitted. .PP Case [4] is the escape-sequence encoding file. The lines in this type of file are in the same format as this example taken from the \fBiso2022-jp\fR encoding: +.PP .CS .ta 1.5i # Encoding file: iso2022-jp, escape-driven E init {} final {} -iso8859-1 \\x1b(B -jis0201 \\x1b(J -jis0208 \\x1b$@ -jis0208 \\x1b$B -jis0212 \\x1b$(D -gb2312 \\x1b$A -ksc5601 \\x1b$(C +iso8859-1 \ex1b(B +jis0201 \ex1b(J +jis0208 \ex1b$@ +jis0208 \ex1b$B +jis0212 \ex1b$(D +gb2312 \ex1b$A +ksc5601 \ex1b$(C .CE .PP In the file, the first column represents an option and the second column @@ -507,8 +577,11 @@ the first character is converted, while \fBfinal\fR is a string to emit or expect after the last character. All other options are names of table-based encodings; the associated value is the escape-sequence that marks that encoding. Tcl syntax is used for the values; in the above -example, for instance, ``\fB{}\fR'' represents the empty string and -``\fB\\x1b\fR'' represents character 27. +example, for instance, +.QW \fB{}\fR +represents the empty string and +.QW \fB\ex1b\fR +represents character 27. 
.PP When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR @@ -517,6 +590,3 @@ for its script library. If the encoding file exists, but is malformed, an error message will be left in \fIinterp\fR. .SH KEYWORDS utf, encoding, convert - - - |