diff options
Diffstat (limited to 'doc/Encoding.3')
-rw-r--r-- | doc/Encoding.3 | 241 |
1 files changed, 164 insertions, 77 deletions
diff --git a/doc/Encoding.3 b/doc/Encoding.3 index 913d4fa..a940a5b 100644 --- a/doc/Encoding.3 +++ b/doc/Encoding.3 @@ -8,7 +8,7 @@ .TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures" .BS .SH NAME -Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings. +Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_GetEncodingFromObj, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNameFromEnvironment, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetEncodingSearchPath, Tcl_SetEncodingSearchPath, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings .SH SYNOPSIS .nf \fB#include <tcl.h>\fR @@ -19,19 +19,24 @@ Tcl_Encoding void \fBTcl_FreeEncoding\fR(\fIencoding\fR) .sp +.VS 8.5 +int +\fBTcl_GetEncodingFromObj\fR(\fIinterp, objPtr, encodingPtr\fR) +.VE 8.5 +.sp char * \fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR) .sp -int -\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, - dstCharsPtr\fR) -.sp -char * +char * \fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR) .sp int -\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, - dstCharsPtr\fR) +\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, + dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR) +.sp +int +\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, + dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR) .sp char * \fBTcl_WinTCharToUtf\fR(\fItsrc, 
srcLen, dstPtr\fR) @@ -39,41 +44,60 @@ char * TCHAR * \fBTcl_WinUtfToTChar\fR(\fIsrc, srcLen, dstPtr\fR) .sp -CONST char * +const char * \fBTcl_GetEncodingName\fR(\fIencoding\fR) .sp int \fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR) .sp +.VS 8.5 +const char * +\fBTcl_GetEncodingNameFromEnvironment\fR(\fIbufPtr\fR) +.VE 8.5 +.sp void \fBTcl_GetEncodingNames\fR(\fIinterp\fR) .sp Tcl_Encoding \fBTcl_CreateEncoding\fR(\fItypePtr\fR) .sp -CONST char * +.VS 8.5 +Tcl_Obj * +\fBTcl_GetEncodingSearchPath\fR() +.sp +int +\fBTcl_SetEncodingSearchPath\fR(\fIsearchPath\fR) +.VE 8.5 +.sp +const char * \fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR) .sp void \fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR) - - .SH ARGUMENTS -.AS Tcl_EncodingState *dstWrotePtr +.AS "const Tcl_EncodingType" *dstWrotePtr in/out .AP Tcl_Interp *interp in Interpreter to use for error reporting, or NULL if no error reporting is desired. -.AP "CONST char" *name in +.AP "const char" *name in Name of encoding to load. .AP Tcl_Encoding encoding in The encoding to query, free, or use for converting text. If \fIencoding\fR is NULL, the current system encoding is used. -.AP "CONST char" *src in +.AP Tcl_Obj *objPtr in +.VS 8.5 +Name of encoding to get token for. +.VE 8.5 +.AP Tcl_Encoding *encodingPtr out +.VS 8.5 +Points to storage where encoding token is to be written. +.VE 8.5 +.AP "const char" *src in For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the specified encoding that are to be converted to UTF-8. For the \fBTcl_UtfToExternal\fR and \fBTcl_WinUtfToTChar\fR functions, an array of UTF-8 characters to be converted to the specified encoding. -.AP "CONST TCHAR" *tsrc in +.AP "const TCHAR" *tsrc in An array of Windows TCHAR characters to convert to UTF-8. .AP int srcLen in Length of \fIsrc\fR or \fItsrc\fR in bytes. If the length is negative, the @@ -83,21 +107,21 @@ Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted result will be stored. 
.AP int flags in Various flag bits OR-ed together. -TCL_ENCODING_START signifies that the +\fBTCL_ENCODING_START\fR signifies that the source buffer is the first block in a (potentially multi-block) input stream, telling the conversion routine to reset to an initial state and perform any initialization that needs to occur before the first byte is -converted. TCL_ENCODING_END signifies that the source buffer is the last +converted. \fBTCL_ENCODING_END\fR signifies that the source buffer is the last block in a (potentially multi-block) input stream, telling the conversion routine to perform any finalization that needs to occur after the last byte is converted and then to reset to an initial state. -TCL_ENCODING_STOPONERROR signifies that the conversion routine should -return immediately upon reading a source character that doesn't exist in +\fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should +return immediately upon reading a source character that does not exist in the target encoding; otherwise a default fallback character will automatically be substituted. .AP Tcl_EncodingState *statePtr in/out Used when converting a (generally long or indefinite length) byte stream -in a piece by piece fashion. The conversion routine stores its current +in a piece-by-piece fashion. The conversion routine stores its current state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the current piece) has been converted; that state information must be passed back when converting the next piece of the stream so the conversion @@ -120,9 +144,17 @@ buffer as a result of the conversion. May be NULL. .AP int *dstCharsPtr out Filled with the number of characters that correspond to the number of bytes stored in the output buffer. May be NULL. -.AP Tcl_EncodingType *typePtr in +.AP Tcl_DString *bufPtr out +.VS 8.5 +Storage for the prescribed system encoding name. +.VE 8.5 +.AP "const Tcl_EncodingType" *typePtr in Structure that defines a new type of encoding. 
-.AP "CONST char" *path in +.AP Tcl_Obj *searchPath in +.VS 8.5 +List of filesystem directories in which to search for encoding data files. +.VE 8.5 +.AP "const char" *path in A path to the location of the encoding file. .BE .SH INTRODUCTION @@ -149,7 +181,7 @@ platform-independent manner. .SH DESCRIPTION .PP \fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR. The name may -refer to a builtin Tcl encoding, a user-defined encoding registered by +refer to a built-in Tcl encoding, a user-defined encoding registered by calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding file. The return value is a token that represents the encoding and can be used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR, @@ -170,6 +202,20 @@ anywhere (i.e., it has been freed as many times as it has been gotten) \fBTcl_FreeEncoding\fR will release all storage the encoding was using and delete it from the database. .PP +.VS 8.5 +\fBTcl_GetEncodingFromObj\fR treats the string representation of +\fIobjPtr\fR as an encoding name, and finds an encoding with that +name, just as \fBTcl_GetEncoding\fR does. When an encoding is found, +it is cached within the \fBobjPtr\fR value for future reference, the +\fBTcl_Encoding\fR token is written to the storage pointed to by +\fIencodingPtr\fR, and the value \fBTCL_OK\fR is returned. If no such +encoding is found, the value \fBTCL_ERROR\fR is returned, and no +writing to \fB*\fR\fIencodingPtr\fR takes place. Just as with +\fBTcl_GetEncoding\fR, the caller should call \fBTcl_FreeEncoding\fR +on the resulting encoding token when that token will no longer be +used. +.VE 8.5 +.PP \fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the specified \fIencoding\fR into UTF-8. The converted bytes are stored in \fIdstPtr\fR, which is then null-terminated. The caller should eventually @@ -192,7 +238,7 @@ All bytes of \fIsrc\fR were converted. 
The destination buffer was not large enough for all of the converted data; as many characters as could fit were converted though. .IP \fBTCL_CONVERT_MULTIBYTE\fR 29 -The last fews bytes in the source buffer were the beginning of a multibyte +The last few bytes in the source buffer were the beginning of a multibyte sequence, but more bytes were needed to complete this sequence. A subsequent call to the conversion routine should pass a buffer containing the unconverted bytes that remained in \fIsrc\fR plus some further bytes @@ -204,7 +250,7 @@ if the input stream has been damaged or if the input encoding method was misidentified. .IP \fBTCL_CONVERT_UNKNOWN\fR 29 The source buffer contained a character that could not be represented in -the target encoding and TCL_ENCODING_STOPONERROR was specified. +the target encoding and \fBTCL_ENCODING_STOPONERROR\fR was specified. .RE .LP \fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8 @@ -228,19 +274,26 @@ is filled with the corresponding number of bytes that were stored in \fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR are Windows-only convenience functions for converting between UTF-8 and Windows strings. On Windows 95 -(as with the Macintosh and Unix operating systems), -all strings exchanged between Tcl and the operating system are "char" +(as with the Unix operating system), +all strings exchanged between Tcl and the operating system are +.QW "char" based. On Windows NT, some strings exchanged between Tcl and the -operating system are "char" oriented while others are in Unicode. By +operating system are +.QW "char" +oriented while others are in Unicode. By convention, in Windows a TCHAR is a character in the ANSI code page on Windows 95 and a Unicode character on Windows NT. 
.PP -If you planned to use the same "char" based interfaces on both Windows +If you planned to use the same +.QW "char" +based interfaces on both Windows 95 and Windows NT, you could use \fBTcl_UtfToExternal\fR and \fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an encoding of NULL (the current system encoding). On the other hand, if you planned to use the Unicode interface when running on Windows NT -and the "char" interfaces when running on Windows 95, you would have +and the +.QW "char" +interfaces when running on Windows 95, you would have to perform the following type of test over and over in your program (as represented in pseudo-code): .CS @@ -250,6 +303,7 @@ if (running NT) { Tcl_FreeEncoding(encoding); } else { nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer); +} .CE \fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically handle this test and use the proper encoding based on the current @@ -269,11 +323,20 @@ was used to create the encoding. The string returned by whenever the user passes a NULL value for the \fIencoding\fR argument to any of the other encoding functions. If \fIname\fR is NULL, the system encoding is reset to the default system encoding, \fBbinary\fR. If the -name did not refer to any known or loadable encoding, TCL_ERROR is +name did not refer to any known or loadable encoding, \fBTCL_ERROR\fR is returned and an error message is left in \fIinterp\fR. Otherwise, this procedure increments the reference count of the new system encoding, decrements the reference count of the old system encoding, and returns -TCL_OK. +\fBTCL_OK\fR. +.PP +.VS 8.5 +\fBTcl_GetEncodingNameFromEnvironment\fR provides a means for the Tcl +library to report the encoding name it believes to be the correct one +to use as the system encoding, based on system calls and examination of +the environment suitable for the platform. 
It accepts \fIbufPtr\fR, +a pointer to an uninitialized or freed \fBTcl_DString\fR and writes +the encoding name to it. The \fBTcl_DStringValue\fR is returned. +.VE 8.5 .PP \fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list consisting of the names of all the encodings that are currently defined @@ -301,12 +364,12 @@ convert between this encoding and UTF-8. It is defined as follows: .PP .CS typedef struct Tcl_EncodingType { - CONST char *\fIencodingName\fR; - Tcl_EncodingConvertProc *\fItoUtfProc\fR; - Tcl_EncodingConvertProc *\fIfromUtfProc\fR; - Tcl_EncodingFreeProc *\fIfreeProc\fR; - ClientData \fIclientData\fR; - int \fInullSize\fR; + const char *\fIencodingName\fR; + Tcl_EncodingConvertProc *\fItoUtfProc\fR; + Tcl_EncodingConvertProc *\fIfromUtfProc\fR; + Tcl_EncodingFreeProc *\fIfreeProc\fR; + ClientData \fIclientData\fR; + int \fInullSize\fR; } Tcl_EncodingType; .CE .PP @@ -336,16 +399,16 @@ type \fBTcl_EncodingConvertProc\fR: .PP .CS typedef int Tcl_EncodingConvertProc( - ClientData \fIclientData\fR, - CONST char *\fIsrc\fR, - int \fIsrcLen\fR, - int \fIflags\fR, - Tcl_Encoding *\fIstatePtr\fR, - char *\fIdst\fR, - int \fIdstLen\fR, - int *\fIsrcReadPtr\fR, - int *\fIdstWrotePtr\fR, - int *\fIdstCharsPtr\fR); + ClientData \fIclientData\fR, + const char *\fIsrc\fR, + int \fIsrcLen\fR, + int \fIflags\fR, + Tcl_EncodingState *\fIstatePtr\fR, + char *\fIdst\fR, + int \fIdstLen\fR, + int *\fIsrcReadPtr\fR, + int *\fIdstWrotePtr\fR, + int *\fIdstCharsPtr\fR); .CE .PP The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the @@ -367,20 +430,42 @@ The callback procedure \fIfreeProc\fR, if non-NULL, should match the type \fBTcl_EncodingFreeProc\fR: .CS typedef void Tcl_EncodingFreeProc( - ClientData \fIclientData\fR); + ClientData \fIclientData\fR); .CE .PP This \fIfreeProc\fR function is called when the encoding is deleted. 
The \fIclientData\fR parameter is the same as the \fIclientData\fR field specified to \fBTcl_CreateEncoding\fR when the encoding was created. .PP - +.VS 8.5 +\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR +are called to access and set the list of filesystem directories searched +for encoding data files. +.PP +The value returned by \fBTcl_GetEncodingSearchPath\fR +is the value stored by the last successful call to +\fBTcl_SetEncodingSearchPath\fR. If no calls to +\fBTcl_SetEncodingSearchPath\fR have occurred, Tcl will compute an initial +value based on the environment. There is one encoding search path for the +entire process, shared by all threads in the process. +.PP +\fBTcl_SetEncodingSearchPath\fR stores \fIsearchPath\fR and returns +\fBTCL_OK\fR, unless \fIsearchPath\fR is not a valid Tcl list, which +causes \fBTCL_ERROR\fR to be returned. The elements of \fIsearchPath\fR +are not verified as existing readable filesystem directories. When +searching for encoding data files takes place, non-existent or +non-readable filesystem directories on the \fIsearchPath\fR are silently +ignored. +.PP \fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR -access and set the directory to use when locating the default encoding -files. If this value is not NULL, the \fBTclpInitLibraryPath\fR routine -appends the path to the head of the search path, and uses this path as -the first place to look into when trying to locate the encoding file. - +are obsolete interfaces best replaced with calls to +\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR. +They are called to access and set the first element of the \fIsearchPath\fR +list. Since Tcl searches \fIsearchPath\fR for encoding data files in +list order, these routines establish the +.QW default +directory in which to find encoding data files. 
+.VE 8.5 .SH "ENCODING FILES" Space would prohibit precompiling into Tcl every possible encoding algorithm, so many encodings are stored on disk as dynamically-loadable @@ -392,23 +477,25 @@ external encoding may consist of single-byte, multi-byte, or double-byte characters. .PP Each dynamically-loadable encoding is represented as a text file. The -initial line of the file, beginning with a ``#'' symbol, is a comment +initial line of the file, beginning with a +.QW # +symbol, is a comment that provides a human-readable description of the file. The next line identifies the type of encoding file. It can be one of the following letters: -.IP "[1] \fBS\fR" +.IP "[1] \fBS\fR" A single-byte encoding, where one character is always one byte long in the encoding. An example is \fBiso8859-1\fR, used by many European languages. -.IP "[2] \fBD\fR" +.IP "[2] \fBD\fR" A double-byte encoding, where one character is always two bytes long in the encoding. An example is \fBbig5\fR, used for Chinese text. -.IP "[3] \fBM\fR" +.IP "[3] \fBM\fR" A multi-byte encoding, where one character may be either one or two bytes long. -Certain bytes are a lead bytes, indicating that another byte must follow +Certain bytes are lead bytes, indicating that another byte must follow and that together the two bytes represent one character. Other bytes are not lead bytes and represent themselves. An example is \fBshiftjis\fR, used by many Japanese computers. -.IP "[4] \fBE\fR" +.IP "[4] \fBE\fR" An escape-sequence encoding, specifying that certain sequences of bytes do not represent characters, but commands that describe how following bytes should be interpreted. @@ -478,7 +565,7 @@ and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively. Following the first page will be all the other pages, each in the same format as the first: one number identifying the page followed by 256 double-byte Unicode characters. 
If a character in the encoding maps to the -Unicode character 0000, it means that the character doesn't actually exist. +Unicode character 0000, it means that the character does not actually exist. If all characters on a page would map to 0000, that page can be omitted. .PP Case [4] is the escape-sequence encoding file. The lines in this type of @@ -490,13 +577,13 @@ encoding: E init {} final {} -iso8859-1 \\x1b(B -jis0201 \\x1b(J -jis0208 \\x1b$@ -jis0208 \\x1b$B -jis0212 \\x1b$(D -gb2312 \\x1b$A -ksc5601 \\x1b$(C +iso8859-1 \ex1b(B +jis0201 \ex1b(J +jis0208 \ex1b$@ +jis0208 \ex1b$B +jis0212 \ex1b$(D +gb2312 \ex1b$A +ksc5601 \ex1b$(C .CE .PP In the file, the first column represents an option and the second column @@ -505,16 +592,16 @@ the first character is converted, while \fBfinal\fR is a string to emit or expect after the last character. All other options are names of table-based encodings; the associated value is the escape-sequence that marks that encoding. Tcl syntax is used for the values; in the above -example, for instance, ``\fB{}\fR'' represents the empty string and -``\fB\\x1b\fR'' represents character 27. +example, for instance, +.QW \fB{}\fR +represents the empty string and +.QW \fB\ex1b\fR +represents character 27. .PP When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR -from the \fBencoding\fR subdirectory of each directory specified in the -library path \fB$tcl_libPath\fR. If the encoding file exists, but is +from the \fBencoding\fR subdirectory of each directory that Tcl searches +for its script library. If the encoding file exists, but is malformed, an error message will be left in \fIinterp\fR. .SH KEYWORDS utf, encoding, convert - - - |