From 6ddcd1aac092384d9126dd6e101edcc2a681b5e5 Mon Sep 17 00:00:00 2001 From: Miguel Sofer Date: Tue, 6 Nov 2007 15:23:15 +0000 Subject: * generic/tclEncoding.c: Version of the embedded iso8859-1 encoding handler that is faster (functions to do the encoding know exactly what they're doing instead of pulling it from a table, though the table itself has to be retained for use by shift encodings that depend on iso8859-1.) [Patch 1826906], committing for dkf. --- ChangeLog | 9 ++ generic/tclEncoding.c | 223 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 219 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index 48f8d82..cc05d80 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2007-11-06 Miguel Sofer + + * generic/tclEncoding.c: Version of the embedded iso8859-1 + encoding handler that is faster (functions to do the encoding know + exactly what they're doing instead of pulling it from a table, + though the table itself has to be retained for use by shift + encodings that depend on iso8859-1.) [Patch 1826906], committing + for dkf. + 2007-11-05 Andreas Kupries * generic/tclConfig.c (Tcl_RegisterConfig): Modified to not extend diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index fbaa8e1..e2308a5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -8,7 +8,7 @@ * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclEncoding.c,v 1.56 2007/10/28 00:40:48 dkf Exp $ + * RCS: @(#) $Id: tclEncoding.c,v 1.57 2007/11/06 15:23:15 msofer Exp $ */ #include "tclInt.h" @@ -260,14 +260,24 @@ static int UtfExtToUtfIntProc(ClientData clientData, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); +static int Iso88591FromUtfProc(ClientData clientData, + CONST char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int Iso88591ToUtfProc(ClientData clientData, + CONST char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, + int dstLen, int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); /* - * A Tcl_ObjType for holding a cached Tcl_Encoding as the intrep. This should - * help the lifetime of encodings be more useful. See concerns raised in [Bug - * 1077262]. + * A Tcl_ObjType for holding a cached Tcl_Encoding in the otherValuePtr field + * of the intrep. This should help the lifetime of encodings be more useful. + * See concerns raised in [Bug 1077262]. */ -static Tcl_ObjType EncodingType = { +static Tcl_ObjType encodingType = { "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL }; @@ -297,7 +307,7 @@ Tcl_GetEncodingFromObj( Tcl_Encoding *encodingPtr) { CONST char *name = Tcl_GetString(objPtr); - if (objPtr->typePtr != &EncodingType) { + if (objPtr->typePtr != &encodingType) { Tcl_Encoding encoding = Tcl_GetEncoding(interp, name); if (encoding == NULL) { @@ -305,7 +315,7 @@ Tcl_GetEncodingFromObj( } TclFreeIntRep(objPtr); objPtr->internalRep.otherValuePtr = (VOID *) encoding; - objPtr->typePtr = &EncodingType; + objPtr->typePtr = &encodingType; } *encodingPtr = Tcl_GetEncoding(NULL, name); return TCL_OK; @@ -611,8 +621,8 @@ TclInitEncodingSubsystem(void) } type.encodingName = "iso8859-1"; - type.toUtfProc = TableToUtfProc; - type.fromUtfProc = TableFromUtfProc; + type.toUtfProc = Iso88591ToUtfProc; + type.fromUtfProc = Iso88591FromUtfProc; type.freeProc = TableFreeProc; type.nullSize = 1; type.clientData = dataPtr; @@ -1536,6 +1546,7 @@ OpenEncodingFileChannel( if ((NULL == chan) && (interp != NULL)) { Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL); + Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL); } Tcl_DecrRefCount(fileNameObj); Tcl_DecrRefCount(nameObj); @@ -2132,9 +2143,9 @@ UtfIntToUtfExtProc( * * UtfExtToUtfIntProc -- * - * Convert from UTF-8 to UTF-8 while converting null-bytes from - * the official representation (0x00) to Tcl's internal - * representation (0xc0, 0x80). See UtfToUtfProc for details. + * Convert from UTF-8 to UTF-8 while converting null-bytes from the + * official representation (0x00) to Tcl's internal representation (0xc0, + * 0x80). See UtfToUtfProc for details. * * Results: * Returns TCL_OK if conversion was successful. @@ -2703,6 +2714,190 @@ TableFromUtfProc( } /* + *------------------------------------------------------------------------- + * + * Iso88591ToUtfProc -- + * + * Convert from the "iso8859-1" encoding into UTF-8. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +Iso88591ToUtfProc( + ClientData clientData, /* Ignored. */ + CONST char *src, /* Source string in specified encoding. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + CONST char *srcStart, *srcEnd; + char *dstEnd, *dstStart; + int result, numChars; + + srcStart = src; + srcEnd = src + srcLen; + + dstStart = dst; + dstEnd = dst + dstLen - TCL_UTF_MAX; + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + Tcl_UniChar ch; + + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + ch = (Tcl_UniChar) *((unsigned char *) src); + /* + * Special case for 1-byte utf chars for speed. + */ + if (ch && ch < 0x80) { + *dst++ = (char) ch; + } else { + dst += Tcl_UniCharToUtf(ch, dst); + } + src++; + } + + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * Iso88591FromUtfProc -- + * + * Convert from UTF-8 into the encoding "iso8859-1". + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +Iso88591FromUtfProc( + ClientData clientData, /* Ignored. */ + CONST char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + CONST char *srcStart, *srcEnd, *srcClose; + char *dstStart, *dstEnd; + int result, numChars; + + result = TCL_OK; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - 1; + + for (numChars = 0; src < srcEnd; numChars++) { + Tcl_UniChar ch; + int len; + + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + len = TclUtfToUniChar(src, &ch); + + /* + * Check for illegal characters. + */ + + if (ch > 0xff) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + + /* + * Plunge on, using '?' as a fallback character. + */ + + ch = (Tcl_UniChar) '?'; + } + + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } + *(dst++) = (char) ch; + src += len; + } + + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* *--------------------------------------------------------------------------- * * TableFreeProc -- @@ -3235,7 +3430,8 @@ GetTableEncoding( if (encodingPtr == NULL) { encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name); if ((encodingPtr == NULL) - || (encodingPtr->toUtfProc != TableToUtfProc)) { + || (encodingPtr->toUtfProc != TableToUtfProc + && encodingPtr->toUtfProc != Iso88591ToUtfProc)) { Tcl_Panic("EscapeToUtfProc: invalid sub table"); } subTablePtr->encodingPtr = encodingPtr; @@ -3350,3 +3546,4 @@ InitializeEncodingSearchPath( * fill-column: 78 * End: */ + -- cgit v0.12