diff options
Diffstat (limited to 'doc/Encoding.3')
-rw-r--r-- | doc/Encoding.3 | 484 |
1 files changed, 484 insertions, 0 deletions
diff --git a/doc/Encoding.3 b/doc/Encoding.3 new file mode 100644 index 0000000..e9329dd --- /dev/null +++ b/doc/Encoding.3 @@ -0,0 +1,484 @@ +'\" +'\" Copyright (c) 1997-1998 Sun Microsystems, Inc. +'\" +'\" See the file "license.terms" for information on usage and redistribution +'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. +'\" +'\" RCS: @(#) $Id: Encoding.3,v 1.2 1999/04/16 00:46:31 stanton Exp $ +'\" +.so man.macros +.TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures" +.BS +.SH NAME +Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings. + + + + +.SH SYNOPSIS +.nf +\fB#include <tcl.h>\fR +.sp +Tcl_Encoding +\fBTcl_GetEncoding\fR(\fIinterp, name\fR) +.sp +void +\fBTcl_FreeEncoding\fR(\fIencoding\fR) +.sp +void +\fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR) +.sp +int +\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + dstCharsPtr\fR) +.sp +void +\fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR) +.sp +int +\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, + dstCharsPtr\fR) +.sp +char * +\fBTcl_GetEncodingName\fR(\fIencoding\fR) +.sp +int +\fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR) +.sp +void +\fBTcl_GetEncodingNames\fR(\fIinterp\fR) +.sp +Tcl_Encoding +\fBTcl_CreateEncoding\fR(\fItypePtr\fR) + +.sp +char * +\fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR) +.sp +void +\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR) + + +.SH ARGUMENTS +.AS Tcl_EncodingState *dstWrotePtr +.AP Tcl_Interp *interp in +Interpreter to use for error reporting, or NULL if no error reporting is +desired. +.AP "CONST char" *name in +Name of encoding to load. +.AP Tcl_Encoding encoding in +The encoding to query, free, or use for converting text. If \fIencoding\fR is +NULL, the current system encoding is used. +.AP "CONST char" *src in +For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the +specified encoding that are to be converted to UTF-8. For the +\fBTcl_UtfToExternal\fR functions, an array of UTF-8 characters to be +converted to the specified encoding. +.AP int srcLen in +Length of \fIsrc\fR in bytes. If the length is negative, the +encoding-specific length of the string is used. +.AP Tcl_DString *dstPtr out +Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted +result will be stored. +.AP int flags in +Various flag bits OR-ed together. +TCL_ENCODING_START signifies that the +source buffer is the first block in a (potentially multi-block) input +stream, telling the conversion routine to reset to an initial state and +perform any initialization that needs to occur before the first byte is +converted. TCL_ENCODING_END signifies that the source buffer is the last +block in a (potentially multi-block) input stream, telling the conversion +routine to perform any finalization that needs to occur after the last +byte is converted and then to reset to an initial state. +TCL_ENCODING_STOPONERROR signifies that the conversion routine should +return immediately upon reading a source character that doesn't exist in +the target encoding; otherwise a default fallback character will +automatically be substituted. +.AP Tcl_EncodingState *statePtr in/out +Used when converting a (generally long or indefinite length) byte stream +in a piece by piece fashion. The conversion routine stores its current +state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the +current piece) has been converted; that state information must be passed +back when converting the next piece of the stream so the conversion +routine knows what state it was in when it left off at the end of the +last piece. May be NULL, in which case the value specified for \fIflags\fR +is ignored and the source buffer is assumed to contain the complete string to +convert. +.AP char *dst out +Buffer in which the converted result will be stored. No more than +\fIdstLen\fR bytes will be stored in \fIdst\fR. +.AP int dstLen in +The maximum length of the output buffer \fIdst\fR in bytes. +.AP int *srcReadPtr out +Filled with the number of bytes from \fIsrc\fR that were actually +converted. This may be less than the original source length if there was +a problem converting some source characters. May be NULL. +.AP int *dstWrotePtr out +Filled with the number of bytes that were actually stored in the output +buffer as a result of the conversion. May be NULL. +.AP int *dstCharsPtr out +Filled with the number of characters that correspond to the number of bytes +stored in the output buffer. May be NULL. +.AP Tcl_EncodingType *typePtr in +Structure that defines a new type of encoding. +.AP char *path in +A path to the location of the encoding file. +.BE +.SH INTRODUCTION +.PP +These routines convert between Tcl's internal character representation, +UTF-8, and character representations used by various operating systems or +file systems, such as Unicode, ASCII, or Shift-JIS. When operating on +strings, such as such as obtaining the names of files or displaying +characters using international fonts, the strings must be translated into +one or possibly multiple formats that the various system calls can use. For +instance, on a Japanese Unix workstation, a user might obtain a filename +represented in the EUC-JP file encoding and then translate the characters to +the jisx0208 font encoding in order to display the filename in a Tk widget. +The purpose of the encoding package is to help bridge the translation gap. +UTF-8 provides an intermediate staging ground for all the various +encodings. In the example above, text would be translated into UTF-8 from +whatever file encoding the operating system is using. Then it would be +translated from UTF-8 into whatever font encoding the display routines +require. +.PP +Some basic encodings are compiled into Tcl. Others can be defined by the +user or dynamically loaded from encoding files in a +platform-independent manner. +.SH DESCRIPTION +.PP +\fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR. The name may +refer to a builtin Tcl encoding, a user-defined encoding registered by +calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding +file. The return value is a token that represents the encoding and can be +used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR, +\fBTcl_FreeEncoding\fR, and \fBTcl_UtfToExternal\fR. If the name did not +refer to any known or loadable encoding, NULL is returned and an error +message is returned in \fIinterp\fR. +.PP +The encoding package maintains a database of all encodings currently in use. +The first time \fIname\fR is seen, \fBTcl_GetEncoding\fR returns an +encoding with a reference count of 1. If the same \fIname\fR is requested +further times, then the reference count for that encoding is incremented +without the overhead of allocating a new encoding and all its associated +data structures. +.PP +When an \fIencoding\fR is no longer needed, \fBTcl_FreeEncoding\fR +should be called to release it. When an \fIencoding\fR is no longer in use +anywhere (i.e., it has been freed as many times as it has been gotten) +\fBTcl_FreeEncoding\fR will release all storage the encoding was using +and delete it from the database. +.PP +\fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the +specified \fIencoding\fR into UTF-8. The converted bytes are stored in +\fIdstPtr\fR, which is then NULL terminated. The caller should eventually +call \fBTcl_DStringFree\fR to free any information stored in \fIdstPtr\fR. +When converting, if any of the characters in the source buffer cannot be +represented in the target encoding, a default fallback character will be +used. +.PP +\fBTcl_ExternalToUtf\fR converts a source buffer \fIsrc\fR from the specified +\fIencoding\fR into UTF-8. Up to \fIsrcLen\fR bytes are converted from the +source buffer and up to \fIdstLen\fR converted bytes are stored in \fIdst\fR. +In all cases, \fI*srcReadPtr\fR is filled with the number of bytes that were +successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR is filled with +the corresponding number of bytes that were stored in \fIdst\fR. The return +value is one of the following: +.RS +.IP \fBTCL_OK\fR 29 +All bytes of \fIsrc\fR were converted. +.IP \fBTCL_CONVERT_NOSPACE\fR 29 +The destination buffer was not large enough for all of the converted data; as +many characters as could fit were converted though. +.IP \fBTCL_CONVERT_MULTIBYTE\fR 29 +The last fews bytes in the source buffer were the beginning of a multibyte +sequence, but more bytes were needed to complete this sequence. A +subsequent call to the conversion routine should pass a buffer containing +the unconverted bytes that remained in \fIsrc\fR plus some further bytes +from the source stream to properly convert the formerly split-up multibyte +sequence. +.IP \fBTCL_CONVERT_SYNTAX\fR 29 +The source buffer contained an invalid character sequence. This may occur +if the input stream has been damaged or if the input encoding method was +misidentified. +.IP \fBTCL_CONVERT_UNKNOWN\fR 29 +The source buffer contained a character that could not be represented in +the target encoding and TCL_ENCODING_STOPONERROR was specified. +.RE +.LP +\fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8 +into the specified \fIencoding\fR. The converted bytes are stored in +\fIdstPtr\fR, which is then terminated with the appropriate encoding-specific +NULL. The caller should eventually call \fBTcl_DStringFree\fR to free any +information stored in \fIdstPtr\fR. When converting, if any of the +characters in the source buffer cannot be represented in the target +encoding, a default fallback character will be used. +.PP +\fBTcl_UtfToExternal\fR converts a source buffer \fIsrc\fR from UTF-8 into +the specified \fIencoding\fR. Up to \fIsrcLen\fR bytes are converted from +the source buffer and up to \fIdstLen\fR converted bytes are stored in +\fIdst\fR. In all cases, \fI*srcReadPtr\fR is filled with the number of +bytes that were successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR +is filled with the corresponding number of bytes that were stored in +\fIdst\fR. The return values are the same as the return values for +\fBTcl_ExternalToUtf\fR. +.PP +\fBTcl_GetEncodingName\fR is roughly the inverse of \fBTk_GetEncoding\fR. +Given an \fIencoding\fR, the return value is the \fIname\fR argument that +was used to create the encoding. The string returned by +\fBTcl_GetEncodingName\fR is only guaranteed to persist until the +\fIencoding\fR is deleted. The caller must not modify this string. +.PP +\fBTcl_SetSystemEncoding\fR sets the default encoding that should be used +whenever the user passes a NULL value for the \fIencoding\fR argument to +any of the other encoding functions. If \fIname\fR is NULL, the system +encoding is reset to the default system encoding, \fBbinary\fR. If the +name did not refer to any known or loadable encoding, TCL_ERROR is +returned and an error message is left in \fIinterp\fR. Otherwise, this +procedure increments the reference count of the new system encoding, +decrements the reference count of the old system encoding, and returns +TCL_OK. +.PP +\fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list +consisting of the names of all the encodings that are currently defined +or can be dynamically loaded, searching the encoding path specified by +\fBTcl_SetDefaultEncodingDir\fR. This procedure does not ensure that the +dynamically-loadable encoding files contain valid data, but merely that they +exist. +.PP +\fBTcl_CreateEncoding\fR defines a new encoding and registers the C +procedures that are called back to convert between the encoding and +UTF-8. Encodings created by \fBTcl_CreateEncoding\fR are thereafter +visible in the database used by \fBTcl_GetEncoding\fR. Just as with the +\fBTcl_GetEncoding\fR procedure, the return value is a token that +represents the encoding and can be used in subsequent calls to other +encoding functions. \fBTcl_CreateEncoding\fR returns an encoding with a +reference count of 1. If an encoding with the specified \fIname\fR +already exists, then its entry in the database is replaced with the new +encoding; the token for the old encoding will remain valid and continue +to behave as before, but users of the new token will now call the new +encoding procedures. +.PP +The \fItypePtr\fR argument to \fBTcl_CreateEncoding\fR contains information +about the name of the encoding and the procedures that will be called to +convert between this encoding and UTF-8. It is defined as follows: +.PP +.CS +typedef struct Tcl_EncodingType { + CONST char *\fIencodingName\fR; + Tcl_EncodingConvertProc *\fItoUtfProc\fR; + Tcl_EncodingConvertProc *\fIfromUtfProc\fR; + Tcl_EncodingFreeProc *\fIfreeProc\fR; + ClientData \fIclientData\fR; + int \fInullSize\fR; +} Tcl_EncodingType; +.CE +.PP +The \fIencodingName\fR provides a string name for the encoding, by +which it can be referred in other procedures such as +\fBTcl_GetEncoding\fR. The \fItoUtfProc\fR refers to a callback +procedure to invoke to convert text from this encoding into UTF-8. +The \fIfromUtfProc\fR refers to a callback procedure to invoke to +convert text from UTF-8 into this encoding. The \fIfreeProc\fR refers +to a callback procedure to invoke when this encoding is deleted. The +\fIfreeProc\fR field may be NULL. The \fIclientData\fR contains an +arbitrary one-word value passed to \fItoUtfProc\fR, \fIfromUtfProc\fR, +and \fIfreeProc\fR whenever they are called. Typically, this is a +pointer to a data structure containing encoding-specific information +that can be used by the callback procedures. For instance, two very +similar encodings such as \fBascii\fR and \fBmacRoman\fR may use the +same callback procedure, but use different values of \fIclientData\fR +to control its behavior. The \fInullSize\fR specifies the number of +zero bytes that signify end-of-string in this encoding. It must be +\fB1\fR (for single-byte or multi-byte encodings like ASCII or +Shift-JIS) or \fB2\fR (for double-byte encodings like Unicode). +Constant-sized encodings with 3 or more bytes per character (such as +CNS11643) are not accepted. +.PP +The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the +type \fBTcl_EncodingConvertProc\fR: +.PP +.CS +typedef int Tcl_EncodingConvertProc( + ClientData \fIclientData\fR, + CONST char *\fIsrc\fR, + int \fIsrcLen\fR, + int \fIflags\fR, + Tcl_Encoding *\fIstatePtr\fR, + char *\fIdst\fR, + int \fIdstLen\fR, + int *\fIsrcReadPtr\fR, + int *\fIdstWrotePtr\fR, + int *\fIdstCharsPtr\fR); +.CE +.PP +The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the +\fBTcl_ExternalToUtf\fR or \fBTcl_UtfToExternal\fR family of functions to +perform the actual conversion. The \fIclientData\fR parameter to these +procedures is the same as the \fIclientData\fR field specified to +\fBTcl_CreateEncoding\fR when the encoding was created. The remaining +arguments to the callback procedures are the same as the arguments, +documented at the top, to \fBTcl_ExternalToUtf\fR or +\fBTcl_UtfToExternal\fR, with the following exceptions. If the +\fIsrcLen\fR argument to one of those high-level functions is negative, +the value passed to the callback procedure will be the appropriate +encoding-specific string length of \fIsrc\fR. If any of the \fIsrcReadPtr\fR, +\fIdstWrotePtr\fR, or \fIdstCharsPtr\fR arguments to one of the high-level +functions is NULL, the corresponding value passed to the callback +procedure will be a non-NULL location. +.PP +The callback procedure \fIfreeProc\fR, if non-NULL, should match the type +\fBTcl_EncodingFreeProc\fR: +.CS +typedef void Tcl_EncodingFreeProc( + ClientData \fIclientData\fR); +.CE +.PP +This \fIfreeProc\fR function is called when the encoding is deleted. The +\fIclientData\fR parameter is the same as the \fIclientData\fR field +specified to \fBTcl_CreateEncoding\fR when the encoding was created. +.PP + +\fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR +access and set the directory to use when locating the default encoding +files. If this value is not NULL, the \fBTclpInitLibraryPath\fR routine +appends the path to the head of the search path, and uses this path as +the first place to look into when trying to locate the encoding file. + +.SH ENCODING FILES +Space would prohibit precompiling into Tcl every possible encoding +algorithm, so many encodings are stored on disk as dynamically-loadable +encoding files. This behavior also allows the user to create additional +encoding files that can be loaded using the same mechanism. These +encoding files contain information about the tables and/or escape +sequences used to map between an external encoding and Unicode. The +external encoding may consist of single-byte, multi-byte, or double-byte +characters. +.PP +Each dynamically-loadable encoding is represented as a text file. The +initial line of the file, beginning with a ``#'' symbol, is a comment +that provides a human-readable description of the file. The next line +identifies the type of encoding file. It can be one of the following +letters: +.IP "[1] \fBS\fR" +A single-byte encoding, where one character is always one byte long in the +encoding. An example is \fBiso8859-1\fR, used by many European languages. +.IP "[2] \fBD\fR" +A double-byte encoding, where one character is always two bytes long in the +encoding. An example is \fBbig5\fR, used for Chinese text. +.IP "[3] \fBM\fR" +A multi-byte encoding, where one character may be either one or two bytes long. +Certain bytes are a lead bytes, indicating that another byte must follow +and that together the two bytes represent one character. Other bytes are not +lead bytes and represent themselves. An example is \fBshiftjis\fR, used by +many Japanese computers. +.IP "[4] \fBE\fR" +An escape-sequence encoding, specifying that certain sequences of bytes +do not represent characters, but commands that describe how following bytes +should be interpreted. +.PP +The rest of the lines in the file depend on the type. +.PP +Cases [1], [2], and [3] are collectively referred to as table-based encoding +files. The lines in a table-based encoding file are in the same +format as this example taken from the \fBshiftjis\fR encoding (this is not +the complete file): +.CS +# Encoding file: shiftjis, multi-byte +M +003F 0 40 +00 +0000000100020003000400050006000700080009000A000B000C000D000E000F +0010001100120013001400150016001700180019001A001B001C001D001E001F +0020002100220023002400250026002700280029002A002B002C002D002E002F +0030003100320033003400350036003700380039003A003B003C003D003E003F +0040004100420043004400450046004700480049004A004B004C004D004E004F +0050005100520053005400550056005700580059005A005B005C005D005E005F +0060006100620063006400650066006700680069006A006B006C006D006E006F +0070007100720073007400750076007700780079007A007B007C007D203E007F +0080000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F +FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F +FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F +FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +81 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E +FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C +301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B +FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000 +00F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5 +FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6 +25A125A025B325B225BD25BC203B301221922190219121933013000000000000 +000000000000000000000000000000002208220B2286228722822283222A2229 +000000000000000000000000000000002227222800AC21D221D4220022030000 +0000000000000000000000000000000000000000222022A52312220222072261 +2252226A226B221A223D221D2235222B222C0000000000000000000000000000 +212B2030266F266D266A2020202100B6000000000000000025EF000000000000 +.CE +.PP +The third line of the file is three numbers. The first number is the +fallback character (in base 16) to use when converting from UTF-8 to this +encoding. The second number is a \fB1\fR if this file represents the +encoding for a symbol font, or \fB0\fR otherwise. The last number (in base +10) is how many pages of data follow. +.PP +Subsequent lines in the example above are pages that describe how to map +from the encoding into 2-byte Unicode. The first line in a page identifies +the page number. Following it are 256 double-byte numbers, arranged as 16 +rows of 16 numbers. Given a character in the encoding, the high byte of +that character is used to select which page, and the low byte of that +character is used as an index to select one of the double-byte numbers in +that page \- the value obtained being the corresponding Unicode character. +By examination of the example above, one can see that the characters 0x7E +and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively. +.PP +Following the first page will be all the other pages, each in the same +format as the first: one number identifying the page followed by 256 +double-byte Unicode characters. If a character in the encoding maps to the +Unicode character 0000, it means that the character doesn't actually exist. +If all characters on a page would map to 0000, that page can be omitted. +.PP +Case [4] is the escape-sequence encoding file. The lines in an this type of +file are in the same format as this example taken from the \fBiso2022-jp\fR +encoding: +.CS +.ta 1.5i +# Encoding file: iso2022-jp, escape-driven +E +init {} +final {} +iso8859-1 \\x1b(B +jis0201 \\x1b(J +jis0208 \\x1b$@ +jis0208 \\x1b$B +jis0212 \\x1b$(D +gb2312 \\x1b$A +ksc5601 \\x1b$(C +.CE +.PP +In the file, the first column represents an option and the second column +is the associated value. \fBinit\fR is a string to emit or expect before +the first character is converted, while \fBfinal\fR is a string to emit +or expect after the last character. All other options are names of +table-based encodings; the associated value is the escape-sequence that +marks that encoding. Tcl syntax is used for the values; in the above +example, for instance, ``\fB{}\fR'' represents the empty string and +``\fB\\x1b\fR'' represents character 27. +.PP +When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not +been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR +from the \fBencoding\fR subdirectory of each directory specified in the +library path \fB$tcl_libPath\fR. If the encoding file exists, but is +malformed, an error message will be left in \fIinterp\fR. +.SH KEYWORDS +utf, encoding, convert + + + |