diff options
-rw-r--r-- | generic/tclCmdMZ.c | 2 | ||||
-rw-r--r-- | generic/tclInt.h | 8 | ||||
-rw-r--r-- | generic/tclStringObj.c | 6 | ||||
-rw-r--r-- | generic/tclTest.c | 6 | ||||
-rw-r--r-- | generic/tclUtf.c | 179 | ||||
-rw-r--r-- | generic/tclUtil.c | 4 | ||||
-rw-r--r-- | tests/utf.test | 20 |
7 files changed, 179 insertions, 46 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 935d42f..d7394fb 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -2529,7 +2529,7 @@ StringStartCmd( break; } - next = Tcl_UtfPrev(p, string); + next = TclUtfPrev(p, string); do { next += delta; delta = TclUtfToUCS4(next, &ch); diff --git a/generic/tclInt.h b/generic/tclInt.h index 8d2e68a..2ff644e 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -4689,6 +4689,14 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; (numChars) = _count; \ } while (0); +#define TclUtfPrev(src, start) \ + (((src) < (start) + 2) ? (start) : \ + ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \ + Tcl_UtfPrev(src, start)) + +#define TclUtfNext(src) \ + ((((unsigned char) *(src)) < 0x80) ? (src) + 1 : Tcl_UtfNext(src)) + /* *---------------------------------------------------------------- * Macro that encapsulates the logic that determines when it is safe to diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c index 2025674..78e49f9 100644 --- a/generic/tclStringObj.c +++ b/generic/tclStringObj.c @@ -1171,10 +1171,10 @@ Tcl_AppendLimitedToObj( } eLen = strlen(ellipsis); while (eLen > limit) { - eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis; + eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis; } - toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes; + toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes; } /* @@ -2614,7 +2614,7 @@ AppendPrintfToObjVA( * multi-byte characters. 
*/ - q = Tcl_UtfPrev(end, bytes); + q = TclUtfPrev(end, bytes); if (!Tcl_UtfCharComplete(q, (int)(end - q))) { end = q; } diff --git a/generic/tclTest.c b/generic/tclTest.c index 008fdb7..19de8ee 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -6849,10 +6849,10 @@ TestUtfNextCmd( memcpy(buffer + 1, bytes, numBytes); buffer[0] = buffer[numBytes + 1] = '\x00'; - first = Tcl_UtfNext(buffer + 1); + first = TclUtfNext(buffer + 1); while ((buffer[0] = *p++) != '\0') { /* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */ - result = Tcl_UtfNext(buffer + 1); + result = TclUtfNext(buffer + 1); if (first != result) { Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL); return TCL_ERROR; @@ -6908,7 +6908,7 @@ TestUtfPrevCmd( bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); bytes[numBytes] = '\0'; - result = Tcl_UtfPrev(bytes + offset, bytes); + result = TclUtfPrev(bytes + offset, bytes); Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); Tcl_DecrRefCount(copy); diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 6adfba9..20b0ebe 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -90,6 +90,12 @@ static const unsigned char complete[256] = { #endif 1,1,1,1,1,1,1,1,1,1,1 }; + +/* + * Functions used only in this module. + */ + +static int Invalid(unsigned char *src); /* *--------------------------------------------------------------------------- @@ -122,7 +128,56 @@ TclUtfCount( } return 3; } + +/* + *--------------------------------------------------------------------------- + * + * Invalid -- + * + * Utility routine to report whether /src/ points to the start of an + * invalid byte sequence that should be rejected. This might be because + * it is an overlong encoding, or because it encodes something out of + * the proper range. 
Caller guarantees that src[0] and src[1] are + * readable, and + * + * (src[0] >= 0xC0) && (src[0] != 0xC1) + * (src[1] >= 0x80) && (src[1] < 0xC0) + * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0)) + * + * Results: + * A boolean. + *--------------------------------------------------------------------------- + */ + +static const unsigned char bounds[28] = { + 0x80, 0x80, /* \xC0 accepts \x80 only */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, + 0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */ + 0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */ + 0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */ +}; +INLINE static int +Invalid( + unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */ +{ + unsigned char byte = *src; + int index; + + if (byte % 0x04) { + /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ + return 0; + } + index = (byte - 0xC0) >> 1; + if (src[1] < bounds[index] || src[1] > bounds[index+1]) { + /* Out of bounds - report invalid. */ + return 1; + } + return 0; +} + /* *--------------------------------------------------------------------------- * @@ -860,7 +915,7 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext + * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext * returns a pointer to the next UTF-8 character in the string. * The caller must not ask for the next character after the last * character in the string if the string is not terminated by a null @@ -880,8 +935,8 @@ const char * Tcl_UtfNext( const char *src) /* The current location in the string. 
*/ { - Tcl_UniChar ch = 0; - int len; + int left = totalBytes[UCHAR(*src)]; + const char *next = src + 1; if (((*src) & 0xC0) == 0x80) { if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) { @@ -889,14 +944,22 @@ Tcl_UtfNext( } return src; } - len = TclUtfToUniChar(src, &ch); -#if TCL_UTF_MAX <= 3 - if ((ch >= 0xD800) && (len < 3)) { - len += TclUtfToUniChar(src + len, &ch); + while (--left) { + if ((*next & 0xC0) != 0x80) { + /* + * src points to non-trail byte; We ran out of trail bytes + * before the needs of the lead byte were satisfied. + * Let the (malformed) lead byte alone be a character + */ + return src + 1; + } + next++; } -#endif - return src + len; + if (Invalid((unsigned char *)src)) { + return src + 1; + } + return next; } /* @@ -953,30 +1016,92 @@ Tcl_UtfPrev( const char *src, /* A location in a UTF-8 string. */ const char *start) /* Pointer to the beginning of the string */ { - const char *look; - int i, byte; - - look = --src; - for (i = 0; i < 4; i++) { - if (look < start) { - if (src < start) { - src = start; - } - break; - } - byte = *((unsigned char *) look); + int trailBytesSeen = 0; /* How many trail bytes have been verified? */ + CONST char *fallback = src - 1; + /* If we cannot find a lead byte that might + * start a prefix of a valid UTF byte sequence, + * we will fallback to a one-byte back step */ + unsigned char *look = (unsigned char *)fallback; + /* Start search at the fallback position */ + + /* Quick boundary case exit. */ + if (fallback <= start) { + return start; + } + + do { + unsigned char byte = look[0]; + if (byte < 0x80) { - break; + /* + * Single byte character. Either this is a correct previous + * character, or it is followed by at least one trail byte + * which indicates a malformed sequence. In either case the + * correct result is to return the fallback. + */ + return fallback; } if (byte >= 0xC0) { - if (totalBytes[byte] <= i) { - break; + /* Non-trail byte; May be multibyte lead. 
*/ + + if ((trailBytesSeen == 0) + /* + * We've seen no trailing context to use to check + * anything. From what we know, this non-trail byte + * is a prefix of a previous character, and accepting + * it (the fallback) is correct. + */ + + || (trailBytesSeen >= totalBytes[byte])) { + /* + * That is, (1 + trailBytesSeen > needed). + * We've examined more bytes than needed to complete + * this lead byte. No matter about well-formedness or + * validity, the sequence starting with this lead byte + * will never include the fallback location, so we must + * return the fallback location. See test utf-7.17 + */ + return fallback; } - return look; + + /* + * trailBytesSeen > 0, so we can examine look[1] safely. + * Use that capability to screen out overlong sequences. + */ + + if (Invalid(look)) { + /* Reject */ + return fallback; + } + return (CONST char *)look; + } + + /* We saw a trail byte. */ + trailBytesSeen++; + + if ((CONST char *)look == start) { + /* + * Do not read before the start of the string + * + * If we get here, we've examined bytes at every location + * >= start and < src and all of them are trail bytes, + * including (*start). We need to return our fallback + * and exit this loop before we run past the start of the string. + */ + return fallback; } + + /* Continue the search backwards... */ look--; - } - return src; + } while (trailBytesSeen < 4); + + /* + * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a + * properly formed byte sequence to find, and we can stop looking, + * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as + * far as we can. 
+ */ + return fallback; } /* diff --git a/generic/tclUtil.c b/generic/tclUtil.c index b48ce30..dd527dc 100644 --- a/generic/tclUtil.c +++ b/generic/tclUtil.c @@ -1707,7 +1707,7 @@ TclTrimRight( const char *q = trim; int pInc = 0, bytesLeft = numTrim; - pp = Tcl_UtfPrev(p, bytes); + pp = TclUtfPrev(p, bytes); do { pp += pInc; pInc = TclUtfToUCS4(pp, &ch1); @@ -1858,7 +1858,7 @@ TclTrim( * that we will not trim. Skip over it. */ if (numBytes > 0) { const char *first = bytes + trimLeft; - bytes = Tcl_UtfNext(first); + bytes = TclUtfNext(first); numBytes -= (bytes - first); if (numBytes > 0) { diff --git a/tests/utf.test b/tests/utf.test index 79f866a..885b057 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -600,19 +600,19 @@ test utf-7.23 {Tcl_UtfPrev} testutfprev { } 4 test utf-7.24 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xC0\x81 -} 1 +} 2 test utf-7.25 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xC0\x81 2 } 1 test utf-7.26 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80\x80 -} 1 +} 3 test utf-7.27 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80 -} 1 +} 2 test utf-7.27.1 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\x80\x80 3 -} 1 +} 2 test utf-7.28 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0 } 1 @@ -621,13 +621,13 @@ test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence} testutfprev { } 1 test utf-7.29 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x80\x80\x80 -} 1 +} 4 test utf-7.30 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x80\x80\x80 4 -} 1 +} 3 test utf-7.31 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x80\x80\x80 3 -} 1 +} 2 test utf-7.32 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x80\x80\x80 2 } 1 @@ -696,13 +696,13 @@ test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { } 1 test utf-7.49 {Tcl_UtfPrev, validity check 
[493dccc2de]} testutfprev { testutfprev A\xF4\x90\x80\x80 -} 1 +} 4 test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { testutfprev A\xF4\x90\x80\x80 4 -} 1 +} 3 test utf-7.49.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { testutfprev A\xF4\x90\x80\x80 3 -} 1 +} 2 test utf-7.49.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { testutfprev A\xF4\x90\x80\x80 2 } 1 |