diff options
| author | dgp <dgp@users.sourceforge.net> | 2020-04-21 02:50:23 (GMT) |
|---|---|---|
| committer | dgp <dgp@users.sourceforge.net> | 2020-04-21 02:50:23 (GMT) |
| commit | d8bc590eef94f9e9ec24150cf8208f38638290ef (patch) | |
| tree | a72f09cc1ef8b7f9d867b0b82b4ba396873366b9 /generic/tclEncoding.c | |
| parent | 4c9bc32c393e100ae9caf7f06e57c798f96ada6d (diff) | |
| download | tcl-d8bc590eef94f9e9ec24150cf8208f38638290ef.zip tcl-d8bc590eef94f9e9ec24150cf8208f38638290ef.tar.gz tcl-d8bc590eef94f9e9ec24150cf8208f38638290ef.tar.bz2 | |
Revert the backport to tclEncoding.c that seems to redefine the "unicode"
encoding to mean UTF-16. Don't want that behavior change in 8.5.
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 240 |
1 files changed, 112 insertions, 128 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index da03055..5a9d2d5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -83,7 +83,7 @@ typedef struct TableEncodingData { } TableEncodingData; /* - * Each of the following structures is the clientData for a dynamically-loaded + * The following structures is the clientData for a dynamically-loaded, * escape-driven encoding that is itself comprised of other simpler encodings. * An example is "iso-2022-jp", which uses escape sequences to switch between * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" @@ -117,8 +117,8 @@ typedef struct EscapeEncodingData { * 0. */ int numSubTables; /* Length of following array. */ EscapeSubTable subTables[1];/* Information about each EscapeSubTable used - * by this encoding type. The actual size is - * as large as necessary to hold all + * by this encoding type. The actual size will + * be as large as necessary to hold all * EscapeSubTables. */ } EscapeEncodingData; @@ -156,7 +156,7 @@ static ProcessGlobalValue encodingFileMap = { * A list of directories making up the "library path". Historically this * search path has served many uses, but the only one remaining is a base for * the encodingSearchPath above. If the application does not explicitly set - * the encodingSearchPath, then it is initialized by appending /encoding + * the encodingSearchPath, then it will be initialized by appending /encoding * to each directory in this "libraryPath". */ @@ -177,7 +177,7 @@ TCL_DECLARE_MUTEX(encodingMutex) /* * The following are used to hold the default and current system encodings. * If NULL is passed to one of the conversion routines, the current setting of - * the system encoding is used to perform the conversion. + * the system encoding will be used to perform the conversion. */ static Tcl_Encoding defaultEncoding; @@ -429,8 +429,9 @@ TclGetLibraryPath(void) * Keeps the per-thread copy of the library path current with changes to * the global copy. * - * Since the result of this routine is void, if searchPath is not a valid - * list this routine silently does nothing. + * NOTE: this routine returns void, so there's no way to report the error + * that searchPath is not a valid list. In that case, this routine will + * silently do nothing. * *---------------------------------------------------------------------- */ @@ -452,16 +453,17 @@ TclSetLibraryPath( * * FillEncodingFileMap -- * - * Called to update the encoding file map with the current value - * of the encoding search path. + * Called to bring the encoding file map in sync with the current value + * of the encoding search path. * - * Finds *.end files in the directories on the encoding search path and - * stores the found pathnames in a map associated with the encoding name. + * Scan the directories on the encoding search path, find the *.enc + * files, and store the found pathnames in a map associated with the + * encoding name. * - * If $dir is on the encoding search path and the file $dir/foo.enc is - * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding - * is needed later, the $dir/foo.enc name can be quickly constructed in - * order to read the encoding data. + * In particular, if $dir is on the encoding search path, and the file + * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map. + * Later, any need for the "foo" encoding will quickly * be able to + * construct the $dir/foo.enc pathname for reading the encoding data. * * Results: * None. @@ -542,24 +544,19 @@ void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; - union { - char c; - short s; - } isLe; if (encodingsInitialized) { return; } - isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* - * Create a few initial encodings. UTF-8 to UTF-8 translation is not a - * no-op because it turns a stream of improperly formed UTF-8 into a - * properly formed stream. + * Create a few initial encodings. Note that the UTF-8 to UTF-8 + * translation is not a no-op, because it will turn a stream of improperly + * formed UTF-8 into a properly formed stream. */ type.encodingName = "identity"; @@ -586,7 +583,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUnicodeProc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = INT2PTR(isLe.c); + type.clientData = NULL; Tcl_CreateEncoding(&type); /* @@ -758,7 +755,11 @@ Tcl_SetDefaultEncodingDir( * interp was NULL. * * Side effects: - * LoadEncodingFile is called if necessary. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. * *------------------------------------------------------------------------- */ @@ -796,15 +797,15 @@ Tcl_GetEncoding( * * Tcl_FreeEncoding -- * - * Releases an encoding allocated by Tcl_CreateEncoding() or - * Tcl_GetEncoding(). + * This function is called to release an encoding allocated by + * Tcl_CreateEncoding() or Tcl_GetEncoding(). * * Results: * None. * * Side effects: * The reference count associated with the encoding is decremented and - * the encoding is deleted if nothing is using it anymore. + * the encoding may be deleted if nothing is using it anymore. * *--------------------------------------------------------------------------- */ @@ -823,14 +824,13 @@ Tcl_FreeEncoding( * * FreeEncoding -- * - * Decrements the reference count of an encoding. The caller must hold - * encodingMutes. + * This function is called to release an encoding by functions that + * already have the encodingMutex. * * Results: * None. * * Side effects: - * Releases the resource for an encoding if it is now unused. * The reference count associated with the encoding is decremented and * the encoding may be deleted if nothing is using it anymore. * @@ -850,17 +850,16 @@ FreeEncoding( if (encodingPtr->refCount<=0) { Tcl_Panic("FreeEncoding: refcount problem !!!"); } - if (encodingPtr->refCount-- <= 1) { + encodingPtr->refCount--; + if (encodingPtr->refCount == 0) { if (encodingPtr->freeProc != NULL) { (*encodingPtr->freeProc)(encodingPtr->clientData); } if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - if (encodingPtr->name) { - ckfree((char *)encodingPtr->name); - } - ckfree((char *)encodingPtr); + ckfree((char *) encodingPtr->name); + ckfree((char *) encodingPtr); } } @@ -1021,22 +1020,23 @@ Tcl_SetSystemEncoding( * * Tcl_CreateEncoding -- * - * Defines a new encoding, along with the functions that are used to - * convert to and from Unicode. + * This function is called to define a new encoding and the functions + * that are used to convert between the specified encoding and Unicode. * * Results: * Returns a token that represents the encoding. If an encoding with the * same name already existed, the old encoding token remains valid and - * continues to behave as it used to, and is eventually garbage collected - * when the last reference to it goes away. Any subsequent calls to - * Tcl_GetEncoding with the specified name retrieve the most recent - * encoding token. + * continues to behave as it used to, and will eventually be garbage + * collected when the last reference to it goes away. Any subsequent + * calls to Tcl_GetEncoding with the specified name will retrieve the + * most recent encoding token. * * Side effects: - * A new record having the name of the encoding is entered into a table of - * encodings visible to all interpreters. For each call to this function, - * there should eventually be a call to Tcl_FreeEncoding, which cleans - * deletes the record in the table when an encoding is no longer needed. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. * *--------------------------------------------------------------------------- */ @@ -1258,9 +1258,10 @@ Tcl_ExternalToUtf( * * Tcl_UtfToExternalDString -- * - * Convert a source buffer from UTF-8 to the specified encoding. If any + * Convert a source buffer from UTF-8 into the specified encoding. If any * of the bytes in the source buffer are invalid or cannot be represented - * in the target encoding, a default fallback character is substituted. + * in the target encoding, a default fallback character will be + * substituted. * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1569,13 +1570,13 @@ OpenEncodingFileChannel( * the data. * * Results: - * The return value is the newly loaded Tcl_Encoding or NULL if the file - * didn't exist or could not be processed. If NULL is returned and interp - * is not NULL, an error message is left in interp's result object. + * The return value is the newly loaded Encoding, or NULL if the file + * didn't exist of was in the incorrect format. If NULL was returned, an + * error message is left in interp's result object, unless interp was + * NULL. * * Side effects: - * A corresponding encoding file might be read from persistent storage, in - * which case LoadTableEncoding is called. + * File read from disk. * *--------------------------------------------------------------------------- */ @@ -1583,8 +1584,8 @@ OpenEncodingFileChannel( static Tcl_Encoding LoadEncodingFile( Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ - const char *name) /* The name of both the encoding file - * and the new encoding. */ + const char *name) /* The name of the encoding file on disk and + * also the name for new encoding. */ { Tcl_Channel chan = NULL; Tcl_Encoding encoding = NULL; @@ -1636,27 +1637,27 @@ LoadEncodingFile( * * LoadTableEncoding -- * - * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType - * structure along with its corresponding TableEncodingData structure, and - * passes it to Tcl_Createncoding. + * Helper function for LoadEncodingTable(). Loads a table to that + * converts between Unicode and some other encoding and creates an + * encoding (using a TableEncoding structure) from that information. * - * The file contains binary data but begins with a marker to indicate - * byte-ordering so a single binary file can be read on big or - * little-endian systems. + * File contains binary data, but begins with a marker to indicate + * byte-ordering, so that same binary file can be read on either endian + * platforms. * * Results: - * Returns the new Tcl_Encoding, or NULL if it could could - * not be created because the file contained invalid data. + * The return value is the new encoding, or NULL if the encoding could + * not be created (because the file contained invalid data). * * Side effects: - * See Tcl_CreateEncoding(). + * None. * *------------------------------------------------------------------------- */ static Tcl_Encoding LoadTableEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ int type, /* Type of encoding (ENCODING_?????). */ Tcl_Channel chan) /* File containing new encoding. */ { @@ -1768,10 +1769,10 @@ LoadTableEncoding( } /* - * Invert the toUnicode array to produce the fromUnicode array. Performs a + * Invert toUnicode array to produce the fromUnicode array. Performs a * single malloc to get the memory for the array and all the pages needed - * by the array. While reading in the toUnicode array remember what - * pages are needed for the fromUnicode array. + * by the array. While reading in the toUnicode array, we remembered what + * pages that would be needed for the fromUnicode array. */ if (symbol) { @@ -1813,8 +1814,8 @@ LoadTableEncoding( if (type == ENCODING_MULTIBYTE) { /* * If multibyte encodings don't have a backslash character, define - * one. Otherwise, on Windows, native file names don't work because - * the backslash in the file name maps to the unknown character + * one. Otherwise, on Windows, native file names won't work because + * the backslash in the file name will map to the unknown character * (question mark) when converting from UTF-8 to external encoding. */ @@ -1828,13 +1829,13 @@ LoadTableEncoding( unsigned short *page; /* - * Make a special symbol encoding that maps each symbol character from - * its Unicode code point down into page 0, and also ensure that each - * characters on page 0 maps to itself so that a symbol font can be - * used to display a simple string like "abcd" and have alpha, beta, - * chi, delta show up, rather than have "unknown" chars show up because - * strictly speaking the symbol font doesn't have glyphs for those low - * ASCII chars. + * Make a special symbol encoding that not only maps the symbol + * characters from their Unicode code points down into page 0, but + * also ensure that the characters on page 0 map to themselves. This + * is so that a symbol font can be used to display a simple string + * like "abcd" and have alpha, beta, chi, delta show up, rather than + * have "unknown" chars show up because strictly speaking the symbol + * font doesn't have glyphs for those low ascii chars. */ page = dataPtr->fromUnicode[0]; @@ -1938,7 +1939,7 @@ LoadTableEncoding( static Tcl_Encoding LoadEscapeEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ Tcl_Channel chan) /* File containing new encoding. */ { int i; @@ -2317,7 +2318,7 @@ UtfToUtfProc( * * UnicodeToUtfProc -- * - * Convert from UTF-16 to UTF-8. + * Convert from Unicode to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2330,7 +2331,7 @@ UtfToUtfProc( static int UnicodeToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* Not used. */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2358,19 +2359,13 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; char *dstEnd, *dstStart; int result, numChars; - unsigned short ch; + Tcl_UniChar ch; result = TCL_OK; - - /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ - if ((srcLen % 2) != 0) { - result = TCL_CONVERT_MULTIBYTE; - srcLen--; - } - /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + if ((srcLen % sizeof(Tcl_UniChar)) != 0) { result = TCL_CONVERT_MULTIBYTE; - srcLen-= 2; + srcLen /= sizeof(Tcl_UniChar); + srcLen *= sizeof(Tcl_UniChar); } srcStart = src; @@ -2384,21 +2379,17 @@ UnicodeToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (clientData) { - ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); - } else { - ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); - } /* - * Special case for 1-byte utf chars for speed. Make sure we work with - * unsigned short-size data. + * Special case for 1-byte utf chars for speed. Make sure we + * work with Tcl_UniChar-size data. */ + ch = *(Tcl_UniChar *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned short); + src += sizeof(Tcl_UniChar); } *srcReadPtr = src - srcStart; @@ -2412,7 +2403,7 @@ UnicodeToUtfProc( * * UtfToUnicodeProc -- * - * Convert from UTF-8 to UTF-16. + * Convert from UTF-8 to Unicode. * * Results: * Returns TCL_OK if conversion was successful. @@ -2425,7 +2416,8 @@ UnicodeToUtfProc( static int UtfToUnicodeProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* TableEncodingData that specifies + * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ @@ -2452,7 +2444,7 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch = 0; + Tcl_UniChar ch; srcStart = src; srcEnd = src + srcLen; @@ -2484,37 +2476,27 @@ UtfToUnicodeProc( * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] */ - if (clientData) { +#ifdef WORDS_BIGENDIAN #if TCL_UTF_MAX > 4 - if (ch <= 0xFFFF) { - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); - } else { - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (ch & 0xFF); - *dst++ = ((ch & 0x3) >> 8) | 0xDC; - } + *dst++ = (ch >> 24); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); #else - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); #endif - } else { +#else #if TCL_UTF_MAX > 4 - if (ch <= 0xFFFF) { - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); - } else { - *dst++ = ((ch & 0x3) >> 8) | 0xDC; - *dst++ = (ch & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - } + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = (ch >> 24); #else - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); +#endif #endif - } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -2917,6 +2899,7 @@ Iso88591FromUtfProc( result = TCL_CONVERT_UNKNOWN; break; } + /* * Plunge on, using '?' as a fallback character. */ @@ -3404,13 +3387,14 @@ EscapeFromUtfProc( * * EscapeFreeProc -- * - * Frees resources used by the encoding. + * This function is invoked when an EscapeEncodingData encoding is + * deleted. It deletes the memory used by the encoding. * * Results: * None. * * Side effects: - * Memory is freed. + * Memory freed. * *--------------------------------------------------------------------------- */ |
