From 30f26cff8be85483cb7c90b15ce9acc2f4607583 Mon Sep 17 00:00:00 2001 From: sebres Date: Wed, 20 Mar 2024 17:56:25 +0000 Subject: optimize TclUtfToUCS4 for single code units (non high surrogates), especially for ascii; fixes performance regression [6811a0081940b76c] --- generic/tclInt.h | 8 +++++++- generic/tclUtf.c | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/generic/tclInt.h b/generic/tclInt.h index de92a7d..7efaf80 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3180,7 +3180,7 @@ MODULE_SCOPE int TclTrimLeft(const char *bytes, int numBytes, MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); -MODULE_SCOPE int TclUtfToUCS4(const char *, int *); +MODULE_SCOPE int TclpUtfToUCS4(const char *, int *); MODULE_SCOPE int TclUCS4ToUtf(int, char *); MODULE_SCOPE int TclUCS4ToLower(int ch); #if TCL_UTF_MAX == 4 @@ -3995,6 +3995,7 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, * the result of Tcl_UtfToUniChar. The ANSI C "prototype" for this macro is: * * MODULE_SCOPE int TclUtfToUniChar(const char *string, Tcl_UniChar *ch); + * MODULE_SCOPE int TclpUtfToUCS4(const char *src, int *ucs4Ptr); *---------------------------------------------------------------- */ @@ -4003,6 +4004,11 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, ((*(chPtr) = UCHAR(*(str))), 1) \ : Tcl_UtfToUniChar(str, chPtr)) +#define TclUtfToUCS4(src, ucs4Ptr) \ + (((UCHAR(*(src))) < 0x80) ? \ + ((*(ucs4Ptr) = UCHAR(*(src))), 1) \ + : TclpUtfToUCS4(src, ucs4Ptr)) + /* *---------------------------------------------------------------- * Macro counterpart of the Tcl_NumUtfChars() function. To be used in speed- diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6fbeb36..04f7208 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -2462,18 +2462,18 @@ TclUniCharMatch( */ int -TclUtfToUCS4( +TclpUtfToUCS4( const char *src, /* The UTF-8 string. */ int *ucs4Ptr) /* Filled with the UCS4 codepoint represented * by the UTF-8 string. */ { Tcl_UniChar ch = 0; - int len = Tcl_UtfToUniChar(src, &ch); + int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX <= 4 if ((ch & ~0x3FF) == 0xD800) { Tcl_UniChar low = ch; - int len2 = Tcl_UtfToUniChar(src+len, &low); + int len2 = TclUtfToUniChar(src+len, &low); if ((low & ~0x3FF) == 0xDC00) { *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; return len + len2; -- cgit v0.12