summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-07 09:31:32 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-07 09:31:32 (GMT)
commit846b23ab0a9dfcfb2675ae37c1451ee573609196 (patch)
tree6cde4ad53a04e27bc0f7609afb05ffd731baf7ea
parentc2fa3a3fa94ad6516014f5376cb9d97e8b5550bb (diff)
downloadtcl-846b23ab0a9dfcfb2675ae37c1451ee573609196.zip
tcl-846b23ab0a9dfcfb2675ae37c1451ee573609196.tar.gz
tcl-846b23ab0a9dfcfb2675ae37c1451ee573609196.tar.bz2
Tighten optimization in Tcl_UtfToUniCharDString(), just as in Tcl_NumUtfChars(). Don't use "-1" in the Tcl_NumUtfChars() calculation, since that raises more questions than it solves, but that's easy to be remedied as well: Juse use >= in stead of > in the comparation. Great idea, Don!
Backport more code formatting from Tcl 8.6 (e.g. use of CONST, which makes no sense any more in c-files)
-rw-r--r--generic/tclUtf.c160
1 files changed, 79 insertions, 81 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6a142bc..4147bbc 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -59,7 +59,7 @@
* UTF-8.
*/
-static CONST unsigned char totalBytes[256] = {
+static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -99,7 +99,7 @@ static int Invalid(const char *src);
*---------------------------------------------------------------------------
*/
-INLINE static int
+static INLINE int
UtfCount(
int ch) /* The Unicode character whose size is returned. */
{
@@ -145,7 +145,7 @@ UtfCount(
*---------------------------------------------------------------------------
*/
-static CONST unsigned char bounds[28] = {
+static const unsigned char bounds[28] = {
0x80, 0x80, /* \xC0 accepts \x80 only */
0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */
@@ -160,7 +160,7 @@ static CONST unsigned char bounds[28] = {
#endif
};
-INLINE static int
+static INLINE int
Invalid(
const char *src) /* Points to lead byte of a UTF-8 byte sequence */
{
@@ -259,13 +259,13 @@ three:
char *
Tcl_UniCharToUtfDString(
- CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
+ const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */
int uniLength, /* Length of Unicode string in Tcl_UniChars
* (must be >= 0). */
Tcl_DString *dsPtr) /* UTF-8 representation of string is appended
* to this previously initialized DString. */
{
- CONST Tcl_UniChar *w, *wEnd;
+ const Tcl_UniChar *w, *wEnd;
char *p, *string;
int oldLength;
@@ -317,17 +317,17 @@ Tcl_UniCharToUtfDString(
int
Tcl_UtfToUniChar(
- register CONST char *src, /* The UTF-8 string. */
- register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+ const char *src, /* The UTF-8 string. */
+ Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
* the UTF-8 string. */
{
- register int byte;
+ Tcl_UniChar byte;
/*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+ * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
*/
- byte = *((unsigned char *) src);
+ byte = UCHAR(*src);
if (byte < 0xC0) {
/*
* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
@@ -335,7 +335,7 @@ Tcl_UtfToUniChar(
* characters representing themselves.
*/
- *chPtr = (Tcl_UniChar) byte;
+ *chPtr = byte;
return 1;
} else if (byte < 0xE0) {
if ((src[1] & 0xC0) == 0x80) {
@@ -415,7 +415,7 @@ Tcl_UtfToUniChar(
Tcl_UniChar *
Tcl_UtfToUniCharDString(
- CONST char *src, /* UTF-8 string to convert to Unicode. */
+ const char *src, /* UTF-8 string to convert to Unicode. */
int length, /* Length of UTF-8 string in bytes, or -1 for
* strlen(). */
Tcl_DString *dsPtr) /* Unicode representation of string is
@@ -423,8 +423,12 @@ Tcl_UtfToUniCharDString(
* DString. */
{
Tcl_UniChar *w, *wString;
- CONST char *p, *end;
+ const char *p;
int oldLength;
+ /* Pointer to the end of string. Never read endPtr[0] */
+ const char *endPtr = src + length;
+ /* Pointer to breakpoint in scan where optimization is lost */
+ const char *optPtr = endPtr - TCL_UTF_MAX;
if (length < 0) {
length = strlen(src);
@@ -438,24 +442,22 @@ Tcl_UtfToUniCharDString(
oldLength = Tcl_DStringLength(dsPtr);
Tcl_DStringSetLength(dsPtr,
- oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
+ oldLength + ((length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
p = src;
- end = src + length - TCL_UTF_MAX;
- while (p < end) {
+ endPtr = src + length;
+ optPtr = endPtr - TCL_UTF_MAX;
+ while (p <= optPtr) {
p += TclUtfToUniChar(p, w);
w++;
}
- end += TCL_UTF_MAX;
- while (p < end) {
- if (Tcl_UtfCharComplete(p, end-p)) {
- p += TclUtfToUniChar(p, w);
- } else {
- *w = UCHAR(*p++);
- }
- w++;
+ while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
+ p += TclUtfToUniChar(p, w++);
+ }
+ while (p < endPtr) {
+ *w++ = UCHAR(*p++);
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
@@ -485,11 +487,11 @@ Tcl_UtfToUniCharDString(
int
Tcl_UtfCharComplete(
- CONST char *src, /* String to check if first few bytes contain
+ const char *src, /* String to check if first few bytes contain
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- return length >= totalBytes[(unsigned char)*src];
+ return length >= totalBytes[UCHAR(*src)];
}
/*
@@ -512,7 +514,7 @@ Tcl_UtfCharComplete(
int
Tcl_NumUtfChars(
- CONST char *src, /* The UTF-8 string to measure. */
+ const char *src, /* The UTF-8 string to measure. */
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
@@ -530,17 +532,17 @@ Tcl_NumUtfChars(
/* Pointer to the end of string. Never read endPtr[0] */
const char *endPtr = src + length;
- /* Pointer to breakpoint in scan where optimization is lost */
- const char *optPtr = endPtr - TCL_UTF_MAX + 1;
+ /* Pointer to last byte where optimization still can be used */
+ const char *optPtr = endPtr - TCL_UTF_MAX;
/*
* Optimize away the call in this loop. Justified because...
- * when (src < optPtr), (endPtr - src) > (endPtr - optPtr)
- * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - 1
+ * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
+ * By initialization above (endPtr - optPtr) = TCL_UTF_MAX
* So (endPtr - src) >= TCL_UTF_MAX, and passing that to
* Tcl_UtfCharComplete we know will cause return of 1.
*/
- while ((src < optPtr)
+ while (src <= optPtr
/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
src += TclUtfToUniChar(src, &ch);
i++;
@@ -580,16 +582,14 @@ Tcl_NumUtfChars(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfFindFirst(
- CONST char *src, /* The UTF-8 string to be searched. */
+ const char *src, /* The UTF-8 string to be searched. */
int ch) /* The Tcl_UniChar to search for. */
{
- int len;
- Tcl_UniChar find;
-
while (1) {
- len = TclUtfToUniChar(src, &find);
+ Tcl_UniChar find;
+ int len = TclUtfToUniChar(src, &find);
if (find == ch) {
return src;
}
@@ -619,18 +619,17 @@ Tcl_UtfFindFirst(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfFindLast(
- CONST char *src, /* The UTF-8 string to be searched. */
+ const char *src, /* The UTF-8 string to be searched. */
int ch) /* The Unicode character to search for. */
{
- int len;
- Tcl_UniChar find;
- CONST char *last;
+ const char *last = NULL;
- last = NULL;
while (1) {
- len = TclUtfToUniChar(src, &find);
+ Tcl_UniChar find;
+ int len = TclUtfToUniChar(src, &find);
+
if (find == ch) {
last = src;
}
@@ -663,9 +662,9 @@ Tcl_UtfFindLast(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfNext(
- CONST char *src) /* The current location in the string. */
+ const char *src) /* The current location in the string. */
{
int left;
const char *next;
@@ -717,13 +716,13 @@ Tcl_UtfNext(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfPrev(
- CONST char *src, /* A location in a UTF-8 string. */
- CONST char *start) /* Pointer to the beginning of the string */
+ const char *src, /* A location in a UTF-8 string. */
+ const char *start) /* Pointer to the beginning of the string */
{
int trailBytesSeen = 0; /* How many trail bytes have been verified? */
- CONST char *fallback = src - 1;
+ const char *fallback = src - 1;
/* If we cannot find a lead byte that might
* start a prefix of a valid UTF byte sequence,
* we will fallback to a one-byte back step */
@@ -779,13 +778,13 @@ Tcl_UtfPrev(
/* Reject */
return fallback;
}
- return (CONST char *)look;
+ return (const char *)look;
}
/* We saw a trail byte. */
trailBytesSeen++;
- if ((CONST char *)look == start) {
+ if ((const char *)look == start) {
/*
* Do not read before the start of the string
*
@@ -829,8 +828,8 @@ Tcl_UtfPrev(
Tcl_UniChar
Tcl_UniCharAtIndex(
- register CONST char *src, /* The UTF-8 string to dereference. */
- register int index) /* The position of the desired character. */
+ const char *src, /* The UTF-8 string to dereference. */
+ int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
@@ -855,15 +854,14 @@ Tcl_UniCharAtIndex(
*---------------------------------------------------------------------------
*/
-CONST char *
+const char *
Tcl_UtfAtIndex(
- register CONST char *src, /* The UTF-8 string. */
- register int index) /* The position of the desired character. */
+ const char *src, /* The UTF-8 string. */
+ int index) /* The position of the desired character. */
{
Tcl_UniChar ch;
- while (index > 0) {
- index--;
+ while (index-- > 0) {
src += TclUtfToUniChar(src, &ch);
}
return src;
@@ -897,7 +895,7 @@ Tcl_UtfAtIndex(
int
Tcl_UtfBackslash(
- CONST char *src, /* Points to the backslash character of a
+ const char *src, /* Points to the backslash character of a
* backslash sequence. */
int *readPtr, /* Fill in with number of characters read from
* src, unless NULL. */
@@ -1113,8 +1111,8 @@ Tcl_UtfToTitle(
int
TclpUtfNcmp2(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numBytes) /* Number of *bytes* to compare. */
{
/*
@@ -1123,7 +1121,7 @@ TclpUtfNcmp2(
* fine in the strcmp manner.
*/
- register int result = 0;
+ int result = 0;
for ( ; numBytes != 0; numBytes--, cs++, ct++) {
if (*cs != *ct) {
@@ -1160,8 +1158,8 @@ TclpUtfNcmp2(
int
Tcl_UtfNcmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1208,8 +1206,8 @@ Tcl_UtfNcmp(
int
Tcl_UtfNcasecmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct, /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
Tcl_UniChar ch1, ch2;
@@ -1252,8 +1250,8 @@ Tcl_UtfNcasecmp(
int
TclUtfCasecmp(
- CONST char *cs, /* UTF string to compare to ct. */
- CONST char *ct) /* UTF string cs is compared to. */
+ const char *cs, /* UTF string to compare to ct. */
+ const char *ct) /* UTF string cs is compared to. */
{
Tcl_UniChar ch1, ch2;
@@ -1409,7 +1407,7 @@ Tcl_UniCharToTitle(
int
Tcl_UniCharLen(
- CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */
+ const Tcl_UniChar *uniStr) /* Unicode string to find length of. */
{
int len = 0;
@@ -1439,8 +1437,8 @@ Tcl_UniCharLen(
int
Tcl_UniCharNcmp(
- CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
#ifdef WORDS_BIGENDIAN
@@ -1484,8 +1482,8 @@ Tcl_UniCharNcmp(
int
Tcl_UniCharNcasecmp(
- CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
- CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
+ const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */
+ const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
for ( ; numChars != 0; numChars--, ucs++, uct++) {
@@ -1860,8 +1858,8 @@ Tcl_UniCharIsWordChar(
int
Tcl_UniCharCaseMatch(
- CONST Tcl_UniChar *uniStr, /* Unicode String. */
- CONST Tcl_UniChar *uniPattern,
+ const Tcl_UniChar *uniStr, /* Unicode String. */
+ const Tcl_UniChar *uniPattern,
/* Pattern, which may contain special
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
@@ -2048,14 +2046,14 @@ Tcl_UniCharCaseMatch(
int
TclUniCharMatch(
- CONST Tcl_UniChar *string, /* Unicode String. */
+ const Tcl_UniChar *string, /* Unicode String. */
int strLen, /* Length of String */
- CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
+ const Tcl_UniChar *pattern, /* Pattern, which may contain special
* characters. */
int ptnLen, /* Length of Pattern */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- CONST Tcl_UniChar *stringEnd, *patternEnd;
+ const Tcl_UniChar *stringEnd, *patternEnd;
Tcl_UniChar p;
stringEnd = string + strLen;