From 846b23ab0a9dfcfb2675ae37c1451ee573609196 Mon Sep 17 00:00:00 2001
From: "jan.nijtmans" <nijtmans@users.sourceforge.net>
Date: Thu, 7 May 2020 09:31:32 +0000
Subject: Tighten optimization in Tcl_UtfToUniCharDString(), just as in
 Tcl_NumUtfChars(). Don't use "-1" in the Tcl_NumUtfChars() calculation, since
 that raises more questions than it solves, but that's easy to be remedied as
 well: Juse use >= in stead of > in the comparation. Great idea, Don! Backport
 more code formatting from Tcl 8.6 (e.g. use of CONST, which makes no sense
 any more in c-files)

---
 generic/tclUtf.c | 160 +++++++++++++++++++++++++++----------------------------
 1 file changed, 79 insertions(+), 81 deletions(-)

diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6a142bc..4147bbc 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -59,7 +59,7 @@
  * UTF-8.
  */
 
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -99,7 +99,7 @@ static int		Invalid(const char *src);
  *---------------------------------------------------------------------------
  */
 
-INLINE static int
+static INLINE int
 UtfCount(
     int ch)			/* The Unicode character whose size is returned. */
 {
@@ -145,7 +145,7 @@ UtfCount(
  *---------------------------------------------------------------------------
  */
 
-static CONST unsigned char bounds[28] = {
+static const unsigned char bounds[28] = {
     0x80, 0x80,		/* \xC0 accepts \x80 only */
     0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
     0x80, 0xBF,		/* (\xC4 - \xDC) -- all sequences valid */
@@ -160,7 +160,7 @@ static CONST unsigned char bounds[28] = {
 #endif
 };
 
-INLINE static int
+static INLINE int
 Invalid(
     const char *src)	/* Points to lead byte of a UTF-8 byte sequence */
 {
@@ -259,13 +259,13 @@ three:
 
 char *
 Tcl_UniCharToUtfDString(
-    CONST Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
+    const Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
     int uniLength,		/* Length of Unicode string in Tcl_UniChars
 				 * (must be >= 0). */
     Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
 				 * to this previously initialized DString. */
 {
-    CONST Tcl_UniChar *w, *wEnd;
+    const Tcl_UniChar *w, *wEnd;
     char *p, *string;
     int oldLength;
 
@@ -317,17 +317,17 @@ Tcl_UniCharToUtfDString(
 
 int
 Tcl_UtfToUniChar(
-    register CONST char *src,	/* The UTF-8 string. */
-    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+    const char *src,	/* The UTF-8 string. */
+    Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
-    register int byte;
+    Tcl_UniChar byte;
 
     /*
-     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
      */
 
-    byte = *((unsigned char *) src);
+    byte = UCHAR(*src);
     if (byte < 0xC0) {
 	/*
 	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
@@ -335,7 +335,7 @@ Tcl_UtfToUniChar(
 	 * characters representing themselves.
 	 */
 
-	*chPtr = (Tcl_UniChar) byte;
+	*chPtr = byte;
 	return 1;
     } else if (byte < 0xE0) {
 	if ((src[1] & 0xC0) == 0x80) {
@@ -415,7 +415,7 @@ Tcl_UtfToUniChar(
 
 Tcl_UniChar *
 Tcl_UtfToUniCharDString(
-    CONST char *src,		/* UTF-8 string to convert to Unicode. */
+    const char *src,		/* UTF-8 string to convert to Unicode. */
     int length,			/* Length of UTF-8 string in bytes, or -1 for
 				 * strlen(). */
     Tcl_DString *dsPtr)		/* Unicode representation of string is
@@ -423,8 +423,12 @@ Tcl_UtfToUniCharDString(
 				 * DString. */
 {
     Tcl_UniChar *w, *wString;
-    CONST char *p, *end;
+    const char *p;
     int oldLength;
+	/* Pointer to the end of string. Never read endPtr[0] */
+	const char *endPtr = src + length;
+	/* Pointer to breakpoint in scan where optimization is lost */
+	const char *optPtr = endPtr - TCL_UTF_MAX;
 
     if (length < 0) {
 	length = strlen(src);
@@ -438,24 +442,22 @@ Tcl_UtfToUniCharDString(
     oldLength = Tcl_DStringLength(dsPtr);
 
     Tcl_DStringSetLength(dsPtr,
-	    oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
+	    oldLength + ((length + 1) * sizeof(Tcl_UniChar)));
     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
 
     w = wString;
     p = src;
-    end = src + length - TCL_UTF_MAX;
-    while (p < end) {
+    endPtr = src + length;
+    optPtr = endPtr - TCL_UTF_MAX;
+    while (p <= optPtr) {
 	p += TclUtfToUniChar(p, w);
 	w++;
     }
-    end += TCL_UTF_MAX;
-    while (p < end) {
-	if (Tcl_UtfCharComplete(p, end-p)) {
-	    p += TclUtfToUniChar(p, w);
-	} else {
-	    *w = UCHAR(*p++);
-	}
-	w++;
+    while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
+	p += TclUtfToUniChar(p, w++);
+    }
+    while (p < endPtr) {
+	*w++ = UCHAR(*p++);
     }
     *w = '\0';
     Tcl_DStringSetLength(dsPtr,
@@ -485,11 +487,11 @@ Tcl_UtfToUniCharDString(
 
 int
 Tcl_UtfCharComplete(
-    CONST char *src,		/* String to check if first few bytes contain
+    const char *src,		/* String to check if first few bytes contain
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    return length >= totalBytes[(unsigned char)*src];
+    return length >= totalBytes[UCHAR(*src)];
 }
 
 /*
@@ -512,7 +514,7 @@ Tcl_UtfCharComplete(
 
 int
 Tcl_NumUtfChars(
-    CONST char *src,	/* The UTF-8 string to measure. */
+    const char *src,	/* The UTF-8 string to measure. */
     int length)		/* The length of the string in bytes, or -1
 			 * for strlen(string). */
 {
@@ -530,17 +532,17 @@ Tcl_NumUtfChars(
 
 	/* Pointer to the end of string. Never read endPtr[0] */
 	const char *endPtr = src + length;
-	/* Pointer to breakpoint in scan where optimization is lost */
-	const char *optPtr = endPtr - TCL_UTF_MAX + 1;
+	/* Pointer to last byte where optimization still can be used */
+	const char *optPtr = endPtr - TCL_UTF_MAX;
 
 	/*
 	 * Optimize away the call in this loop. Justified because...
-	 *	when (src < optPtr), (endPtr - src) > (endPtr - optPtr)
-	 * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - 1
+	 * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
+	 * By initialization above (endPtr - optPtr) = TCL_UTF_MAX
 	 * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
 	 * Tcl_UtfCharComplete we know will cause return of 1.
 	 */
-	while ((src < optPtr)
+	while (src <= optPtr
 		/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
@@ -580,16 +582,14 @@ Tcl_NumUtfChars(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfFindFirst(
-    CONST char *src,		/* The UTF-8 string to be searched. */
+    const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Tcl_UniChar to search for. */
 {
-    int len;
-    Tcl_UniChar find;
-
     while (1) {
-	len = TclUtfToUniChar(src, &find);
+	Tcl_UniChar find;
+	int len = TclUtfToUniChar(src, &find);
 	if (find == ch) {
 	    return src;
 	}
@@ -619,18 +619,17 @@ Tcl_UtfFindFirst(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfFindLast(
-    CONST char *src,		/* The UTF-8 string to be searched. */
+    const char *src,		/* The UTF-8 string to be searched. */
     int ch)			/* The Unicode character to search for. */
 {
-    int len;
-    Tcl_UniChar find;
-    CONST char *last;
+    const char *last = NULL;
 
-    last = NULL;
     while (1) {
-	len = TclUtfToUniChar(src, &find);
+	Tcl_UniChar find;
+	int len = TclUtfToUniChar(src, &find);
+
 	if (find == ch) {
 	    last = src;
 	}
@@ -663,9 +662,9 @@ Tcl_UtfFindLast(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfNext(
-    CONST char *src)		/* The current location in the string. */
+    const char *src)		/* The current location in the string. */
 {
     int left;
     const char *next;
@@ -717,13 +716,13 @@ Tcl_UtfNext(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfPrev(
-    CONST char *src,		/* A location in a UTF-8 string. */
-    CONST char *start)		/* Pointer to the beginning of the string */
+    const char *src,		/* A location in a UTF-8 string. */
+    const char *start)		/* Pointer to the beginning of the string */
 {
     int trailBytesSeen = 0;	/* How many trail bytes have been verified? */
-    CONST char *fallback = src - 1;
+    const char *fallback = src - 1;
 				/* If we cannot find a lead byte that might
 				 * start a prefix of a valid UTF byte sequence,
 				 * we will fallback to a one-byte back step */
@@ -779,13 +778,13 @@ Tcl_UtfPrev(
 		/* Reject */
 		return fallback;
 	    }
-	    return (CONST char *)look;
+	    return (const char *)look;
 	}
 
 	/* We saw a trail byte. */
 	trailBytesSeen++;
 
-	if ((CONST char *)look == start) {
+	if ((const char *)look == start) {
 	    /*
 	     * Do not read before the start of the string
 	     *
@@ -829,8 +828,8 @@ Tcl_UtfPrev(
 
 Tcl_UniChar
 Tcl_UniCharAtIndex(
-    register CONST char *src,	/* The UTF-8 string to dereference. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string to dereference. */
+    int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
 
@@ -855,15 +854,14 @@ Tcl_UniCharAtIndex(
  *---------------------------------------------------------------------------
  */
 
-CONST char *
+const char *
 Tcl_UtfAtIndex(
-    register CONST char *src,	/* The UTF-8 string. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string. */
+    int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch;
 
-    while (index > 0) {
-	index--;
+    while (index-- > 0) {
 	src += TclUtfToUniChar(src, &ch);
     }
     return src;
@@ -897,7 +895,7 @@ Tcl_UtfAtIndex(
 
 int
 Tcl_UtfBackslash(
-    CONST char *src,		/* Points to the backslash character of a
+    const char *src,		/* Points to the backslash character of a
 				 * backslash sequence. */
     int *readPtr,		/* Fill in with number of characters read from
 				 * src, unless NULL. */
@@ -1113,8 +1111,8 @@ Tcl_UtfToTitle(
 
 int
 TclpUtfNcmp2(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numBytes)	/* Number of *bytes* to compare. */
 {
     /*
@@ -1123,7 +1121,7 @@ TclpUtfNcmp2(
      * fine in the strcmp manner.
      */
 
-    register int result = 0;
+    int result = 0;
 
     for ( ; numBytes != 0; numBytes--, cs++, ct++) {
 	if (*cs != *ct) {
@@ -1160,8 +1158,8 @@ TclpUtfNcmp2(
 
 int
 Tcl_UtfNcmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
@@ -1208,8 +1206,8 @@ Tcl_UtfNcmp(
 
 int
 Tcl_UtfNcasecmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct,		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct,		/* UTF string cs is compared to. */
     unsigned long numChars)	/* Number of UTF chars to compare. */
 {
     Tcl_UniChar ch1, ch2;
@@ -1252,8 +1250,8 @@ Tcl_UtfNcasecmp(
 
 int
 TclUtfCasecmp(
-    CONST char *cs,		/* UTF string to compare to ct. */
-    CONST char *ct)		/* UTF string cs is compared to. */
+    const char *cs,		/* UTF string to compare to ct. */
+    const char *ct)		/* UTF string cs is compared to. */
 {
     Tcl_UniChar ch1, ch2;
 
@@ -1409,7 +1407,7 @@ Tcl_UniCharToTitle(
 
 int
 Tcl_UniCharLen(
-    CONST Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
+    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
 {
     int len = 0;
 
@@ -1439,8 +1437,8 @@ Tcl_UniCharLen(
 
 int
 Tcl_UniCharNcmp(
-    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
     unsigned long numChars)	/* Number of unichars to compare. */
 {
 #ifdef WORDS_BIGENDIAN
@@ -1484,8 +1482,8 @@ Tcl_UniCharNcmp(
 
 int
 Tcl_UniCharNcasecmp(
-    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
-    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
+    const Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
+    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
     unsigned long numChars)	/* Number of unichars to compare. */
 {
     for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1860,8 +1858,8 @@ Tcl_UniCharIsWordChar(
 
 int
 Tcl_UniCharCaseMatch(
-    CONST Tcl_UniChar *uniStr,	/* Unicode String. */
-    CONST Tcl_UniChar *uniPattern,
+    const Tcl_UniChar *uniStr,	/* Unicode String. */
+    const Tcl_UniChar *uniPattern,
 				/* Pattern, which may contain special
 				 * characters. */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
@@ -2048,14 +2046,14 @@ Tcl_UniCharCaseMatch(
 
 int
 TclUniCharMatch(
-    CONST Tcl_UniChar *string,	/* Unicode String. */
+    const Tcl_UniChar *string,	/* Unicode String. */
     int strLen,			/* Length of String */
-    CONST Tcl_UniChar *pattern,	/* Pattern, which may contain special
+    const Tcl_UniChar *pattern,	/* Pattern, which may contain special
 				 * characters. */
     int ptnLen,			/* Length of Pattern */
     int nocase)			/* 0 for case sensitive, 1 for insensitive */
 {
-    CONST Tcl_UniChar *stringEnd, *patternEnd;
+    const Tcl_UniChar *stringEnd, *patternEnd;
     Tcl_UniChar p;
 
     stringEnd = string + strLen;
-- 
cgit v0.12