summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-08-18 21:39:16 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2017-08-18 21:39:16 (GMT)
commit: b916fb2e6064a68a3848169f1d6263396112d330 (patch)
tree: 89be35bb25db2b233d3b2644ab0316f9a107d08c
parent: f338acabb27d64eddd5ccccd3adf4fdbfbcc94f9 (diff)
parent: 80272e6e9728da345d243a0af6def26dbc86b255 (diff)
download: tcl-b916fb2e6064a68a3848169f1d6263396112d330.zip
tcl-b916fb2e6064a68a3848169f1d6263396112d330.tar.gz
tcl-b916fb2e6064a68a3848169f1d6263396112d330.tar.bz2
RFE [http://core.tcl.tk/tk/info/6c0d7aec67|6c0d7aec67]: Better surrogate handling for TCL_UTF_MAX==4. No change when TCL_UTF_MAX==3. (This commit is actually meant for androwish, in order to improve surrogate handling for TCL_UTF_MAX==4)
-rw-r--r-- generic/tclBinary.c 6
-rw-r--r-- generic/tclCmdIL.c 2
-rw-r--r-- generic/tclCmdMZ.c 10
-rw-r--r-- generic/tclCompExpr.c 2
-rw-r--r-- generic/tclEncoding.c 20
-rw-r--r-- generic/tclLoad.c 2
-rw-r--r-- generic/tclParse.c 2
-rw-r--r-- generic/tclScan.c 2
-rw-r--r-- generic/tclStringObj.c 4
-rw-r--r-- generic/tclUtf.c 78
-rw-r--r-- generic/tclUtil.c 2
-rw-r--r-- win/tclWinPipe.c 2
12 files changed, 80 insertions, 52 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index d0d9d5e..bb918f2 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -454,7 +454,7 @@ SetByteArrayFromAny(
const char *src, *srcEnd;
unsigned char *dst;
ByteArray *byteArrayPtr;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
if (objPtr->typePtr != &tclByteArrayType) {
src = TclGetStringFromObj(objPtr, &length);
@@ -1210,7 +1210,7 @@ BinaryFormatCmd(
badField:
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
char buf[TCL_UTF_MAX + 1];
TclUtfToUniChar(errorString, &ch);
@@ -1580,7 +1580,7 @@ BinaryScanCmd(
badField:
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
char buf[TCL_UTF_MAX + 1];
TclUtfToUniChar(errorString, &ch);
diff --git a/generic/tclCmdIL.c b/generic/tclCmdIL.c
index ba9e1cf..e3c5f10 100644
--- a/generic/tclCmdIL.c
+++ b/generic/tclCmdIL.c
@@ -4345,7 +4345,7 @@ static int
DictionaryCompare(
const char *left, const char *right) /* The strings to compare. */
{
- Tcl_UniChar uniLeft, uniRight, uniLeftLower, uniRightLower;
+ Tcl_UniChar uniLeft = 0, uniRight = 0, uniLeftLower, uniRightLower;
int diff, zeros;
int secondaryDiff = 0;
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 3f79ca4..7010495 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -1037,7 +1037,7 @@ Tcl_SplitObjCmd(
int objc, /* Number of arguments. */
Tcl_Obj *const objv[]) /* Argument objects. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int len;
const char *splitChars;
const char *stringPtr;
@@ -1122,7 +1122,7 @@ Tcl_SplitObjCmd(
} else {
const char *element, *p, *splitEnd;
int splitLen;
- Tcl_UniChar splitChar;
+ Tcl_UniChar splitChar = 0;
/*
* Normal case: split on any of a given set of characters. Discard
@@ -1451,7 +1451,7 @@ StringIsCmd(
Tcl_Obj *const objv[]) /* Argument objects. */
{
const char *string1, *end, *stop;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int (*chcomp)(int) = NULL; /* The UniChar comparison function. */
int i, failat = 0, result = 1, strict = 0, index, length1, length2;
Tcl_Obj *objPtr, *failVarObj = NULL;
@@ -2436,7 +2436,7 @@ StringStartCmd(
int objc, /* Number of arguments. */
Tcl_Obj *const objv[]) /* Argument objects. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
const char *p, *string;
int cur, index, length, numChars;
@@ -2497,7 +2497,7 @@ StringEndCmd(
int objc, /* Number of arguments. */
Tcl_Obj *const objv[]) /* Argument objects. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
const char *p, *end, *string;
int cur, index, length, numChars;
diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index 59eecf9..9c7ab8d 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1885,7 +1885,7 @@ ParseLexeme(
{
const char *end;
int scanned;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
Tcl_Obj *literal = NULL;
unsigned char byte;
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index b4acb5f..8450128 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2296,7 +2296,7 @@ UtfToUtfProc(
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
result = TCL_OK;
@@ -2345,8 +2345,8 @@ UtfToUtfProc(
} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
/*
* Always check before using TclUtfToUniChar. Not doing can so
- * cause it run beyond the endof the buffer! If we happen such an
- * incomplete char its byts are made to represent themselves.
+ * cause it to run beyond the end of the buffer! If we hit such an
+ * incomplete char, its bytes are made to represent themselves.
*/
ch = (unsigned char) *src;
@@ -2410,7 +2410,7 @@ UnicodeToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
@@ -2500,7 +2500,7 @@ UtfToUnicodeProc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
srcStart = src;
srcEnd = src + srcLen;
@@ -2610,7 +2610,7 @@ TableToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart, *prefixBytes;
int result, byte, numChars, charLimit = INT_MAX;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
const unsigned short *const *toUnicode;
const unsigned short *pageZero;
TableEncodingData *dataPtr = clientData;
@@ -2722,7 +2722,7 @@ TableFromUtfProc(
{
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd, *prefixBytes;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int result, len, word, numChars;
TableEncodingData *dataPtr = clientData;
const unsigned short *const *fromUnicode;
@@ -2856,7 +2856,7 @@ Iso88591ToUtfProc(
result = TCL_OK;
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
if (dst > dstEnd) {
result = TCL_CONVERT_NOSPACE;
@@ -2942,7 +2942,7 @@ Iso88591FromUtfProc(
dstEnd = dst + dstLen - 1;
for (numChars = 0; src < srcEnd; numChars++) {
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int len;
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -3329,7 +3329,7 @@ EscapeFromUtfProc(
for (numChars = 0; src < srcEnd; numChars++) {
unsigned len;
int word;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
/*
diff --git a/generic/tclLoad.c b/generic/tclLoad.c
index 942e6b4..f1bd248 100644
--- a/generic/tclLoad.c
+++ b/generic/tclLoad.c
@@ -130,7 +130,7 @@ Tcl_LoadObjCmd(
Tcl_PackageInitProc *initProc;
const char *p, *fullFileName, *packageName;
Tcl_LoadHandle loadHandle;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
unsigned len;
int index, flags = 0;
Tcl_Obj *const *savedobjv = objv;
diff --git a/generic/tclParse.c b/generic/tclParse.c
index 3ecf4a5..f2cf322 100644
--- a/generic/tclParse.c
+++ b/generic/tclParse.c
@@ -841,7 +841,7 @@ TclParseBackslash(
* written there. */
{
register const char *p = src+1;
- Tcl_UniChar unichar;
+ Tcl_UniChar unichar = 0;
int result;
int count;
char buf[TCL_UTF_MAX];
diff --git a/generic/tclScan.c b/generic/tclScan.c
index 17069eb..7a6a8a2 100644
--- a/generic/tclScan.c
+++ b/generic/tclScan.c
@@ -257,7 +257,7 @@ ValidateFormat(
{
int gotXpg, gotSequential, value, i, flags;
char *end;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int objIndex, xpgSize, nspace = numVars;
int *nassign = TclStackAlloc(interp, nspace * sizeof(int));
char buf[TCL_UTF_MAX+1];
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index c84b500..6d97881 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -1709,7 +1709,7 @@ Tcl_AppendFormatToObj(
#endif
int newXpg, numChars, allocSegment = 0, segmentLimit, segmentNumBytes;
Tcl_Obj *segment;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
int step = TclUtfToUniChar(format, &ch);
format += step;
@@ -2692,7 +2692,7 @@ TclStringObjReverse(
Tcl_Obj *objPtr)
{
String *stringPtr;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
if (TclIsPureByteArray(objPtr)) {
int numBytes;
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index b8b867c..3b39226 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -259,6 +259,15 @@ Tcl_UniCharToUtfDString(
* Tcl_UtfCharComplete() before calling this routine to ensure that
* enough bytes remain in the string.
*
+ * If TCL_UTF_MAX == 4, surrogate pairs get special handling: when src
+ * points at a 4-byte UTF-8 sequence (a character outside of the BMP), the
+ * first call to this function fills *chPtr with the high surrogate and
+ * returns 0. Calling Tcl_UtfToUniChar again with the same src and chPtr
+ * produces the low surrogate and returns 4. Because *chPtr is used to
+ * remember whether the high surrogate has already been produced, the
+ * variable it points to should be initialized to 0 before the first
+ * call to Tcl_UtfToUniChar.
+ *
* Results:
* *chPtr is filled with the Tcl_UniChar, and the return value is the
* number of bytes from the UTF-8 string that were consumed.
@@ -278,7 +287,7 @@ Tcl_UtfToUniChar(
register int byte;
/*
- * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+ * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
*/
byte = *((unsigned char *) src);
@@ -331,12 +340,30 @@ Tcl_UtfToUniChar(
/*
* Four-byte-character lead byte followed by three trail bytes.
*/
-
+#if TCL_UTF_MAX == 4
+ Tcl_UniChar surrogate;
+
+ byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+ | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
+ surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
+ if (byte & 0x100000) {
+ /* out of range, < 0x10000 or > 0x10ffff */
+ } else if (*chPtr != surrogate) {
+ /* produce high surrogate, but don't advance source pointer */
+ *chPtr = surrogate;
+ return 0;
+ } else {
+ /* produce low surrogate, and advance source pointer */
+ *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
+ return 4;
+ }
+#else
*chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
return 4;
}
+#endif
}
/*
@@ -377,7 +404,7 @@ Tcl_UtfToUniCharDString(
* appended to this previously initialized
* DString. */
{
- Tcl_UniChar *w, *wString;
+ Tcl_UniChar ch, *w, *wString;
const char *p, *end;
int oldLength;
@@ -399,8 +426,8 @@ Tcl_UtfToUniCharDString(
w = wString;
end = src + length;
for (p = src; p < end; ) {
- p += TclUtfToUniChar(p, w);
- w++;
+ p += TclUtfToUniChar(p, &ch);
+ *w++ = ch;
}
*w = '\0';
Tcl_DStringSetLength(dsPtr,
@@ -434,10 +461,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- int ch;
-
- ch = *((unsigned char *) src);
- return length >= totalBytes[ch];
+ return length >= totalBytes[(unsigned char)*src];
}
/*
@@ -464,8 +488,8 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
- Tcl_UniChar ch;
- register int i;
+ Tcl_UniChar ch = 0;
+ register int i = 0;
/*
* The separate implementations are faster.
@@ -474,7 +498,6 @@ Tcl_NumUtfChars(
* single-byte char case specially.
*/
- i = 0;
if (length < 0) {
while (*src != '\0') {
src += TclUtfToUniChar(src, &ch);
@@ -525,7 +548,7 @@ Tcl_UtfFindFirst(
int ch) /* The Tcl_UniChar to search for. */
{
int len;
- Tcl_UniChar find;
+ Tcl_UniChar find = 0;
while (1) {
len = TclUtfToUniChar(src, &find);
@@ -564,7 +587,7 @@ Tcl_UtfFindLast(
int ch) /* The Tcl_UniChar to search for. */
{
int len;
- Tcl_UniChar find;
+ Tcl_UniChar find = 0;
const char *last;
last = NULL;
@@ -604,9 +627,15 @@ const char *
Tcl_UtfNext(
const char *src) /* The current location in the string. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
+ int len = TclUtfToUniChar(src, &ch);
- return src + TclUtfToUniChar(src, &ch);
+#if TCL_UTF_MAX == 4
+ if (len == 0) {
+ len = TclUtfToUniChar(src, &ch);
+ }
+#endif
+ return src + len;
}
/*
@@ -639,8 +668,7 @@ Tcl_UtfPrev(
const char *look;
int i, byte;
- src--;
- look = src;
+ look = --src;
for (i = 0; i < TCL_UTF_MAX; i++) {
if (look < start) {
if (src < start) {
@@ -713,7 +741,7 @@ Tcl_UtfAtIndex(
register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
while (index > 0) {
index--;
@@ -797,7 +825,7 @@ int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, upChar;
+ Tcl_UniChar ch = 0, upChar;
char *src, *dst;
int bytes;
@@ -850,7 +878,7 @@ int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, lowChar;
+ Tcl_UniChar ch = 0, lowChar;
char *src, *dst;
int bytes;
@@ -904,7 +932,7 @@ int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch, titleChar, lowChar;
+ Tcl_UniChar ch = 0, titleChar, lowChar;
char *src, *dst;
int bytes;
@@ -1013,7 +1041,7 @@ Tcl_UtfNcmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
/*
* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
@@ -1061,7 +1089,7 @@ Tcl_UtfNcasecmp(
const char *ct, /* UTF string cs is compared to. */
unsigned long numChars) /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
+ Tcl_UniChar ch1 = 0, ch2 = 0;
while (numChars-- > 0) {
/*
* n must be interpreted as chars, not bytes.
@@ -1690,7 +1718,7 @@ Tcl_UniCharCaseMatch(
* characters. */
int nocase) /* 0 for case sensitive, 1 for insensitive */
{
- Tcl_UniChar ch1, p;
+ Tcl_UniChar ch1 = 0, p;
while (1) {
p = *uniPattern;
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index 553593c..0eddb00 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1646,7 +1646,7 @@ Tcl_Backslash(
* src, unless NULL. */
{
char buf[TCL_UTF_MAX];
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
Tcl_UtfBackslash(src, readPtr, buf);
TclUtfToUniChar(buf, &ch);
diff --git a/win/tclWinPipe.c b/win/tclWinPipe.c
index fe0ed2d..4b372a5 100644
--- a/win/tclWinPipe.c
+++ b/win/tclWinPipe.c
@@ -1479,7 +1479,7 @@ BuildCommandLine(
quote = 1;
} else {
int count;
- Tcl_UniChar ch;
+ Tcl_UniChar ch = 0;
for (start = arg; *start != '\0'; start += count) {
count = TclUtfToUniChar(start, &ch);