11 files changed, 223 insertions, 135 deletions
diff --git a/doc/StringObj.3 b/doc/StringObj.3
index 7042cc8..e011c27 100644
--- a/doc/StringObj.3
+++ b/doc/StringObj.3
@@ -37,7 +37,7 @@ Tcl_UniChar *
 Tcl_UniChar *
 \fBTcl_GetUnicode\fR(\fIobjPtr\fR)
 .sp
-Tcl_UniChar
+int
 \fBTcl_GetUniChar\fR(\fIobjPtr, index\fR)
 .sp
 int
@@ -204,7 +204,8 @@ where the caller does not need the length of the unicode string
 representation.
 .PP
 \fBTcl_GetUniChar\fR returns the \fIindex\fR'th character in the
-value's Unicode representation.
+value's Unicode representation. If the index is out of range or
+it references a low surrogate preceded by a high surrogate, it returns -1;
 .PP
 \fBTcl_GetRange\fR returns a newly created value comprised of the
 characters between \fIfirst\fR and \fIlast\fR (inclusive) in the
diff --git a/doc/ToUpper.3 b/doc/ToUpper.3
index be614e7..1c7a0c2 100644
--- a/doc/ToUpper.3
+++ b/doc/ToUpper.3
@@ -13,13 +13,13 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_
 .nf
 \fB#include <tcl.h>\fR
 .sp
-Tcl_UniChar
+int
 \fBTcl_UniCharToUpper\fR(\fIch\fR)
 .sp
-Tcl_UniChar
+int
 \fBTcl_UniCharToLower\fR(\fIch\fR)
 .sp
-Tcl_UniChar
+int
 \fBTcl_UniCharToTitle\fR(\fIch\fR)
 .sp
 int
diff --git a/doc/Utf.3 b/doc/Utf.3
index 9d0c617..78d795e 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -63,7 +63,7 @@ const char *
 const char *
 \fBTcl_UtfPrev\fR(\fIsrc, start\fR)
 .sp
-Tcl_UniChar
+int
 \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
 .sp
 const char *
@@ -140,6 +140,9 @@ number of bytes read from \fIsrc\fR.  The caller must ensure that the
 source buffer is long enough such that this routine does not run off the
 end and dereference non-existent or random memory; if the source buffer
 is known to be null-terminated, this will not happen.  If the input is
+a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the
+cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR
+and returns 1. If the input is otherwise
 not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
 byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
 0x00ff and return 1.
diff --git a/generic/tcl.decls b/generic/tcl.decls
index 60aebfd..da551bb 100644
--- a/generic/tcl.decls
+++ b/generic/tcl.decls
@@ -1145,16 +1145,16 @@ declare 319 {
 	    Tcl_QueuePosition position)
 }
 declare 320 {
-    Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index)
+    int Tcl_UniCharAtIndex(const char *src, int index)
 }
 declare 321 {
-    Tcl_UniChar Tcl_UniCharToLower(int ch)
+    int Tcl_UniCharToLower(int ch)
 }
 declare 322 {
-    Tcl_UniChar Tcl_UniCharToTitle(int ch)
+    int Tcl_UniCharToTitle(int ch)
 }
 declare 323 {
-    Tcl_UniChar Tcl_UniCharToUpper(int ch)
+    int Tcl_UniCharToUpper(int ch)
 }
 declare 324 {
     int Tcl_UniCharToUtf(int ch, char *buf)
@@ -1348,7 +1348,7 @@ declare 380 {
     int Tcl_GetCharLength(Tcl_Obj *objPtr)
 }
 declare 381 {
-    Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index)
+    int Tcl_GetUniChar(Tcl_Obj *objPtr, int index)
 }
 declare 382 {
     Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr)
diff --git a/generic/tcl.h b/generic/tcl.h
index 45fa0ac..c3e0f9d 100644
--- a/generic/tcl.h
+++ b/generic/tcl.h
@@ -2155,7 +2155,7 @@ typedef struct Tcl_EncodingType {
  */
 
 #ifndef TCL_UTF_MAX
-#define TCL_UTF_MAX		3
+#define TCL_UTF_MAX		4
 #endif
 
 /*
@@ -2345,10 +2345,10 @@ typedef int (Tcl_NRPostProc) (ClientData data[], Tcl_Interp *interp,
 /*
  *----------------------------------------------------------------------------
  * The following constant is used to test for older versions of Tcl in the
- * stubs tables.
+ * stubs tables. If TCL_UTF_MAX>4 use a different value.
  */
 
-#define TCL_STUB_MAGIC		((int) 0xFCA3BACF)
+#define TCL_STUB_MAGIC		((int) 0xFCA3BACF + (TCL_UTF_MAX>4))
 
 /*
  * The following function is required to be defined in all stubs aware
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 43d6206..6aaf1de 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -491,7 +491,7 @@ Tcl_RegsubObjCmd(
     Tcl_RegExp regExpr;
     Tcl_RegExpInfo info;
     Tcl_Obj *resultPtr, *subPtr, *objPtr, *startIndex = NULL;
-    Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec, *wend;
+    Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec = 0, *wend;
 
     static const char *const options[] = {
 	"-all",		"-command",	"-expanded",	"-line",
@@ -1220,7 +1220,7 @@ Tcl_SplitObjCmd(
 	    len = TclUtfToUniChar(stringPtr, &ch);
 	    fullchar = ch;
 
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    if (!len) {
 		len += TclUtfToUniChar(stringPtr, &ch);
 		fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
@@ -1419,7 +1419,7 @@ StringIndexCmd(
     }
 
     /*
-     * Get the char length to calulate what 'end' means.
+     * Get the char length to calculate what 'end' means.
      */
 
     length = Tcl_GetCharLength(objv[1]);
@@ -1428,7 +1428,11 @@ StringIndexCmd(
     }
 
     if ((index >= 0) && (index < length)) {
-	Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index);
+	int ch = Tcl_GetUniChar(objv[1], index);
+
+	if (ch == -1) {
+	    return TCL_OK;
+	}
 
 	/*
 	 * If we have a ByteArray object, we're careful to generate a new
@@ -1440,7 +1444,7 @@ StringIndexCmd(
 
 	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
 	} else {
-	    char buf[TCL_UTF_MAX];
+	    char buf[4];
 
 	    length = Tcl_UniCharToUtf(ch, buf);
 	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
@@ -1803,7 +1807,7 @@ StringIsCmd(
 	    int fullchar;
 	    length2 = TclUtfToUniChar(string1, &ch);
 	    fullchar = ch;
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    if (!length2) {
 	    	length2 = TclUtfToUniChar(string1, &ch);
 	    	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 49f34e8..592d945 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -962,13 +962,13 @@ EXTERN void		Tcl_ThreadAlert(Tcl_ThreadId threadId);
 EXTERN void		Tcl_ThreadQueueEvent(Tcl_ThreadId threadId,
 				Tcl_Event *evPtr, Tcl_QueuePosition position);
 /* 320 */
-EXTERN Tcl_UniChar	Tcl_UniCharAtIndex(const char *src, int index);
+EXTERN int		Tcl_UniCharAtIndex(const char *src, int index);
 /* 321 */
-EXTERN Tcl_UniChar	Tcl_UniCharToLower(int ch);
+EXTERN int		Tcl_UniCharToLower(int ch);
 /* 322 */
-EXTERN Tcl_UniChar	Tcl_UniCharToTitle(int ch);
+EXTERN int		Tcl_UniCharToTitle(int ch);
 /* 323 */
-EXTERN Tcl_UniChar	Tcl_UniCharToUpper(int ch);
+EXTERN int		Tcl_UniCharToUpper(int ch);
 /* 324 */
 EXTERN int		Tcl_UniCharToUtf(int ch, char *buf);
 /* 325 */
@@ -1123,7 +1123,7 @@ EXTERN void		Tcl_SetUnicodeObj(Tcl_Obj *objPtr,
 /* 380 */
 EXTERN int		Tcl_GetCharLength(Tcl_Obj *objPtr);
 /* 381 */
-EXTERN Tcl_UniChar	Tcl_GetUniChar(Tcl_Obj *objPtr, int index);
+EXTERN int		Tcl_GetUniChar(Tcl_Obj *objPtr, int index);
 /* 382 */
 EXTERN Tcl_UniChar *	Tcl_GetUnicode(Tcl_Obj *objPtr);
 /* 383 */
@@ -2193,10 +2193,10 @@ typedef struct TclStubs {
     Tcl_Obj * (*tcl_SetVar2Ex) (Tcl_Interp *interp, const char *part1, const char *part2, Tcl_Obj *newValuePtr, int flags); /* 317 */
     void (*tcl_ThreadAlert) (Tcl_ThreadId threadId); /* 318 */
     void (*tcl_ThreadQueueEvent) (Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 319 */
-    Tcl_UniChar (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */
-    Tcl_UniChar (*tcl_UniCharToLower) (int ch); /* 321 */
-    Tcl_UniChar (*tcl_UniCharToTitle) (int ch); /* 322 */
-    Tcl_UniChar (*tcl_UniCharToUpper) (int ch); /* 323 */
+    int (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */
+    int (*tcl_UniCharToLower) (int ch); /* 321 */
+    int (*tcl_UniCharToTitle) (int ch); /* 322 */
+    int (*tcl_UniCharToUpper) (int ch); /* 323 */
     int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */
     const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */
     int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */
@@ -2254,7 +2254,7 @@ typedef struct TclStubs {
     Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */
     void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */
     int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */
-    Tcl_UniChar (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */
+    int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */
     Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */
     Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */
     void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */
diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index 49c74da..35743a1 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -5292,8 +5292,8 @@ TEBCresume(
 	    objResultPtr = Tcl_NewStringObj((const char *)
 		    valuePtr->bytes+index, 1);
 	} else {
-	    char buf[TCL_UTF_MAX];
-	    Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index);
+	    char buf[4];
+	    int ch = Tcl_GetUniChar(valuePtr, index);
 
 	    /*
 	     * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1)
@@ -5301,7 +5301,7 @@ TEBCresume(
 	     * practical use.
 	     */
 
-	    length = Tcl_UniCharToUtf(ch, buf);
+	    length = (ch != -1) ? Tcl_UniCharToUtf(ch, buf) : 0;
 	    objResultPtr = Tcl_NewStringObj(buf, length);
 	}
 
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index fa50d6d..5915ce0 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -460,8 +460,9 @@ Tcl_GetCharLength(
  *
  * Tcl_GetUniChar --
  *
- *	Get the index'th Unicode character from the String object. The index
- *	is assumed to be in the appropriate range.
+ *	Get the index'th Unicode character from the String object. If index
+ *	is out of range or it references a low surrogate preceded by a high
+ *	surrogate, the result = -1;
  *
  * Results:
  *	Returns the index'th Unicode character in the Object.
@@ -472,13 +473,18 @@ Tcl_GetCharLength(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_GetUniChar(
     Tcl_Obj *objPtr,		/* The object to get the Unicode charater
 				 * from. */
     int index)			/* Get the index'th Unicode character. */
 {
     String *stringPtr;
+    int ch, length;
+
+    if (index < 0) {
+	return -1;
+    }
 
     /*
      * Optimize the case where we're really dealing with a bytearray object
@@ -486,9 +492,12 @@ Tcl_GetUniChar(
      */
 
     if (TclIsPureByteArray(objPtr)) {
-	unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL);
+	unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length);
+	if (index >= length) {
+		return -1;
+	}
 
-	return (Tcl_UniChar) bytes[index];
+	return (int) bytes[index];
     }
 
     /*
@@ -512,7 +521,26 @@ Tcl_GetUniChar(
 	FillUnicodeRep(objPtr);
 	stringPtr = GET_STRING(objPtr);
     }
-    return stringPtr->unicode[index];
+
+    if (index >= stringPtr->numChars) {
+	return -1;
+    }
+	ch = stringPtr->unicode[index];
+#if TCL_UTF_MAX <= 4
+	/* See: bug [11ae2be95dac9417] */
+	if ((ch&0xF800) == 0xD800) {
+	    if (ch&0x400) {
+		if ((index > 0) && ((stringPtr->unicode[index-1]&0xFC00) == 0xD800)) {
+		    ch = -1; /* low surrogate preceded by high surrogate */
+		}
+	    } else if ((++index < stringPtr->numChars)
+		    && ((stringPtr->unicode[index]&0xFC00) == 0xDC00)) {
+		/* high surrogate followed by low surrogate */
+		ch = (((ch & 0x3FF) << 10) | (stringPtr->unicode[index] & 0x3FF)) + 0x10000;
+	    }
+	}
+#endif
+    return ch;
 }
 
 /*
@@ -612,15 +640,24 @@ Tcl_GetRange(
 {
     Tcl_Obj *newObjPtr;		/* The Tcl object to find the range of. */
     String *stringPtr;
+    int length;
 
+    if (first < 0) {
+	first = 0;
+	}
     /*
      * Optimize the case where we're really dealing with a bytearray object
      * we don't need to convert to a string to perform the substring operation.
      */
 
     if (TclIsPureByteArray(objPtr)) {
-	unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL);
-
+	unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length);
+	if (last >= length) {
+	    last = length - 1;
+	}
+	if (last < first) {
+	    return Tcl_NewObj();
+	}
 	return Tcl_NewByteArrayObj(bytes+first, last-first+1);
     }
 
@@ -640,6 +677,12 @@ Tcl_GetRange(
 	    TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length);
 	}
 	if (stringPtr->numChars == objPtr->length) {
+	    if (last >= stringPtr->numChars) {
+		last = stringPtr->numChars - 1;
+	    }
+	    if (last < first) {
+		return Tcl_NewObj();
+	    }
 	    newObjPtr = Tcl_NewStringObj(objPtr->bytes + first, last-first+1);
 
 	    /*
@@ -654,8 +697,13 @@ Tcl_GetRange(
 	FillUnicodeRep(objPtr);
 	stringPtr = GET_STRING(objPtr);
     }
-
-#if TCL_UTF_MAX == 4
+	if (last > stringPtr->numChars) {
+	    last = stringPtr->numChars;
+	}
+	if (last < first) {
+	    return Tcl_NewObj();
+	}
+#if TCL_UTF_MAX <= 4
 	/* See: bug [11ae2be95dac9417] */
 	if ((first>0) && ((stringPtr->unicode[first]&0xFC00) == 0xDC00)
 		&& ((stringPtr->unicode[first-1]&0xFC00) == 0xD800)) {
@@ -1930,7 +1978,7 @@ Tcl_AppendFormatToObj(
 	    }
 	    break;
 	case 'c': {
-	    char buf[TCL_UTF_MAX];
+	    char buf[4];
 	    int code, length;
 
 	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 2d8750d..33c22e0 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -98,11 +98,9 @@ TclUtfCount(
     if (ch <= 0x7FF) {
 	return 2;
     }
-#if TCL_UTF_MAX > 3
     if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
 	return 4;
     }
-#endif
     return 3;
 }
 
@@ -131,7 +129,7 @@ Tcl_UniCharToUtf(
     char *buf)			/* Buffer in which the UTF-8 representation of
 				 * the Tcl_UniChar is stored. Buffer must be
 				 * large enough to hold the UTF-8 character
-				 * (at most TCL_UTF_MAX bytes). */
+				 * (at most 4 bytes). */
 {
     if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
 	buf[0] = (char) ch;
@@ -144,27 +142,30 @@ Tcl_UniCharToUtf(
 	    return 2;
 	}
 	if (ch <= 0xFFFF) {
-#if TCL_UTF_MAX == 4
 	    if ((ch & 0xF800) == 0xD800) {
 		if (ch & 0x0400) {
 		    /* Low surrogate */
-		    buf[3] = (char) ((ch | 0x80) & 0xBF);
-		    buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
-		    return 4;
+		    if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+			    && ((buf[2] & 0xCF) == 0)) {
+			/* Previous Tcl_UniChar was a High surrogate, so combine */
+			buf[3] = (char) ((ch & 0x3F) | 0x80);
+			buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+			return 4;
+		    }
+		    /* Previous Tcl_UniChar was not a High surrogate, so just output */
 		} else {
 		    /* High surrogate */
 		    ch += 0x40;
-		    buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
-		    buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
-		    buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+		    /* Fill buffer with specific 3-byte (invalid) byte combination,
+		       so following Low surrogate can recognize it and combine */
+		    buf[2] = (char) ((ch << 4) & 0x30);
+		    buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
+		    buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
 		    return 0;
 		}
 	    }
-#endif
 	    goto three;
 	}
-
-#if TCL_UTF_MAX > 3
 	if (ch <= 0x10FFFF) {
 	    buf[3] = (char) ((ch | 0x80) & 0xBF);
 	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
@@ -172,7 +173,6 @@ Tcl_UniCharToUtf(
 	    buf[0] = (char) ((ch >> 18) | 0xF0);
 	    return 4;
 	}
-#endif
     }
 
     ch = 0xFFFD;
@@ -268,6 +268,13 @@ Tcl_UniCharToUtfDString(
  *---------------------------------------------------------------------------
  */
 
+static const unsigned short cp1252[32] = {
+  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
+    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
+};
+
 int
 Tcl_UtfToUniChar(
     register const char *src,	/* The UTF-8 string. */
@@ -284,11 +291,17 @@ Tcl_UtfToUniChar(
     if (byte < 0xC0) {
 	/*
 	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
-	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
+	 * Treats naked trail bytes 0x80 to 0x9F as valid characters from
+	 * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
+	 * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
 	 * characters representing themselves.
 	 */
 
-	*chPtr = (Tcl_UniChar) byte;
+	if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
+	    *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
+	} else {
+	    *chPtr = (Tcl_UniChar) byte;
+	}
 	return 1;
     } else if (byte < 0xE0) {
 	if ((src[1] & 0xC0) == 0x80) {
@@ -329,17 +342,7 @@ Tcl_UtfToUniChar(
 	    /*
 	     * Four-byte-character lead byte followed by three trail bytes.
 	     */
-#if TCL_UTF_MAX == 3
-	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
-		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
-	    if (byte & 0x100000) {
-		/* out of range, < 0x10000 or > 0x10ffff */
-	    } else {
-		/* produce replacement character, and advance source pointer */
-		*chPtr = (Tcl_UniChar) 0xFFFD;
-		return 4;
-	    }
-#elif TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    Tcl_UniChar surrogate;
 
 	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
@@ -551,7 +554,7 @@ Tcl_UtfFindFirst(
     while (1) {
 	len = TclUtfToUniChar(src, &find);
 	fullchar = find;
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	if (!len) {
 	    len += TclUtfToUniChar(src, &find);
 	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
@@ -599,7 +602,7 @@ Tcl_UtfFindLast(
     while (1) {
 	len = TclUtfToUniChar(src, &find);
 	fullchar = find;
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	if (!len) {
 	    len += TclUtfToUniChar(src, &find);
 	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
@@ -642,7 +645,7 @@ Tcl_UtfNext(
     Tcl_UniChar ch = 0;
     int len = TclUtfToUniChar(src, &ch);
 
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
     if (len == 0) {
       len = TclUtfToUniChar(src, &ch);
     }
@@ -717,17 +720,33 @@ Tcl_UtfPrev(
  *---------------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharAtIndex(
     register const char *src,	/* The UTF-8 string to dereference. */
     register int index)		/* The position of the desired character. */
 {
     Tcl_UniChar ch = 0;
+    int fullchar = 0;
+#if TCL_UTF_MAX <= 4
+	int len = 1;
+#endif
 
     while (index-- >= 0) {
+#if TCL_UTF_MAX <= 4
+	src += (len = TclUtfToUniChar(src, &ch));
+#else
 	src += TclUtfToUniChar(src, &ch);
+#endif
     }
-    return ch;
+    fullchar = ch;
+#if TCL_UTF_MAX <= 4
+     if (!len) {
+	/* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */
+	(void)TclUtfToUniChar(src, &ch);
+	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+    }
+#endif
+    return fullchar;
 }
 
 /*
@@ -835,7 +854,8 @@ int
 Tcl_UtfToUpper(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, upChar;
+    Tcl_UniChar ch = 0;
+    int upChar;
     char *src, *dst;
     int bytes;
 
@@ -846,7 +866,14 @@ Tcl_UtfToUpper(
     src = dst = str;
     while (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	upChar = Tcl_UniCharToUpper(ch);
+	upChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	upChar = Tcl_UniCharToUpper(upChar);
 
 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -888,7 +915,8 @@ int
 Tcl_UtfToLower(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, lowChar;
+    Tcl_UniChar ch = 0;
+    int lowChar;
     char *src, *dst;
     int bytes;
 
@@ -899,7 +927,14 @@ Tcl_UtfToLower(
     src = dst = str;
     while (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	lowChar = Tcl_UniCharToLower(ch);
+	lowChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	lowChar = Tcl_UniCharToLower(lowChar);
 
 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -942,7 +977,8 @@ int
 Tcl_UtfToTitle(
     char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, titleChar, lowChar;
+    Tcl_UniChar ch = 0;
+    int titleChar, lowChar;
     char *src, *dst;
     int bytes;
 
@@ -955,7 +991,14 @@ Tcl_UtfToTitle(
 
     if (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	titleChar = Tcl_UniCharToTitle(ch);
+	titleChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	titleChar = Tcl_UniCharToTitle(titleChar);
 
 	if (bytes < TclUtfCount(titleChar)) {
 	    memcpy(dst, src, (size_t) bytes);
@@ -967,7 +1010,14 @@ Tcl_UtfToTitle(
     }
     while (*src) {
 	bytes = TclUtfToUniChar(src, &ch);
-	lowChar = Tcl_UniCharToLower(ch);
+	lowChar = ch;
+	if (!bytes) {
+	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
+	    bytes = TclUtfToUniChar(src, &ch);
+	    /* Combine surrogates */
+	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
+	}
+	lowChar = Tcl_UniCharToLower(lowChar);
 
 	if (bytes < TclUtfCount(lowChar)) {
 	    memcpy(dst, src, (size_t) bytes);
@@ -1069,7 +1119,7 @@ Tcl_UtfNcmp(
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    /* Surrogates always report higher than non-surrogates */
 	    if (((ch1 & 0xFC00) == 0xD800)) {
 	    if ((ch2 & 0xFC00) != 0xD800) {
@@ -1120,7 +1170,7 @@ Tcl_UtfNcasecmp(
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    /* Surrogates always report higher than non-surrogates */
 	    if (((ch1 & 0xFC00) == 0xD800)) {
 	    if ((ch2 & 0xFC00) != 0xD800) {
@@ -1169,7 +1219,7 @@ TclUtfCmp(
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    /* Surrogates always report higher than non-surrogates */
 	    if (((ch1 & 0xFC00) == 0xD800)) {
 	    if ((ch2 & 0xFC00) != 0xD800) {
@@ -1215,7 +1265,7 @@ TclUtfCasecmp(
 	cs += TclUtfToUniChar(cs, &ch1);
 	ct += TclUtfToUniChar(ct, &ch2);
 	if (ch1 != ch2) {
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX <= 4
 	    /* Surrogates always report higher than non-surrogates */
 	    if (((ch1 & 0xFC00) == 0xD800)) {
 	    if ((ch2 & 0xFC00) != 0xD800) {
@@ -1252,16 +1302,18 @@ TclUtfCasecmp(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharToUpper(
     int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
 
-    if (GetCaseType(info) & 0x04) {
-	ch -= GetDelta(info);
+	if (GetCaseType(info) & 0x04) {
+	    ch -= GetDelta(info);
+	}
     }
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
 }
 
 /*
@@ -1280,16 +1332,18 @@ Tcl_UniCharToUpper(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharToLower(
     int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
 
-    if (GetCaseType(info) & 0x02) {
-	ch += GetDelta(info);
+	if (GetCaseType(info) & 0x02) {
+	    ch += GetDelta(info);
+	}
     }
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
 }
 
 /*
@@ -1308,23 +1362,25 @@ Tcl_UniCharToLower(
  *----------------------------------------------------------------------
  */
 
-Tcl_UniChar
+int
 Tcl_UniCharToTitle(
     int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
-    int mode = GetCaseType(info);
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
+	int mode = GetCaseType(info);
 
-    if (mode & 0x1) {
-	/*
-	 * Subtract or add one depending on the original case.
-	 */
+	if (mode & 0x1) {
+	    /*
+	     * Subtract or add one depending on the original case.
+	     */
 
-	ch += ((mode & 0x4) ? -1 : 1);
-    } else if (mode == 0x4) {
-	ch -= GetDelta(info);
+	    ch += ((mode & 0x4) ? -1 : 1);
+	} else if (mode == 0x4) {
+	    ch -= GetDelta(info);
+	}
     }
-    return (Tcl_UniChar) ch;
+    return ch & 0x1FFFFF;
 }
 
 /*
@@ -1458,11 +1514,9 @@ int
 Tcl_UniCharIsAlnum(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
 }
 
@@ -1486,11 +1540,9 @@ int
 Tcl_UniCharIsAlpha(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return ((ALPHA_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1514,7 +1566,6 @@ int
 Tcl_UniCharIsControl(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	ch &= 0x1FFFFF;
 	if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) {
@@ -1525,7 +1576,6 @@ Tcl_UniCharIsControl(
 	}
 	return 0;
     }
-#endif
     return ((CONTROL_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1549,11 +1599,9 @@ int
 Tcl_UniCharIsDigit(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
 }
 
@@ -1577,12 +1625,10 @@ int
 Tcl_UniCharIsGraph(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	ch &= 0x1FFFFF;
 	return (ch >= 0xE0100) && (ch <= 0xE01EF);
     }
-#endif
     return ((GRAPH_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1606,11 +1652,9 @@ int
 Tcl_UniCharIsLower(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return (GetCategory(ch) == LOWERCASE_LETTER);
 }
 
@@ -1634,12 +1678,10 @@ int
 Tcl_UniCharIsPrint(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	ch &= 0x1FFFFF;
 	return (ch >= 0xE0100) && (ch <= 0xE01EF);
     }
-#endif
     return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
 }
 
@@ -1663,11 +1705,9 @@ int
 Tcl_UniCharIsPunct(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return ((PUNCT_BITS >> GetCategory(ch)) & 1);
 }
 
@@ -1691,13 +1731,8 @@ int
 Tcl_UniCharIsSpace(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     /* Ignore upper 11 bits. */
     ch &= 0x1FFFFF;
-#else
-    /* Ignore upper 16 bits. */
-    ch &= 0xFFFF;
-#endif
 
     /*
      * If the character is within the first 127 characters, just use the
@@ -1706,10 +1741,8 @@ Tcl_UniCharIsSpace(
 
     if (ch < 0x80) {
 	return TclIsSpaceProc((char) ch);
-#if TCL_UTF_MAX > 3
     } else if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
-#endif
     } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
 	    || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
 	return 1;
@@ -1738,11 +1771,9 @@ int
 Tcl_UniCharIsUpper(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return (GetCategory(ch) == UPPERCASE_LETTER);
 }
 
@@ -1766,11 +1797,9 @@ int
 Tcl_UniCharIsWordChar(
     int ch)			/* Unicode character to test. */
 {
-#if TCL_UTF_MAX > 3
     if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     }
-#endif
     return ((WORD_BITS >> GetCategory(ch)) & 1);
 }
 
diff --git a/tests/string.test b/tests/string.test
index da302eb..2300f08 100644
--- a/tests/string.test
+++ b/tests/string.test
@@ -487,6 +487,9 @@ test string-5.19.$noComp {string index, bytearray object out of bounds} {
 test string-5.20.$noComp {string index, bytearray object out of bounds} {
     run {string index [binary format I* {0x50515253 0x52}] 20}
 } {}
+test string-5.21 {string index, surrogates, bug [11ae2be95dac9417]} fullutf {
+    list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3]
+} [list \U100000 {} b]
 
 
 proc largest_int {} {