From eb8c96f7b284ef2edd736fb55e5ff1ea399c631a Mon Sep 17 00:00:00 2001
From: hobbs <hobbs>
Date: Fri, 26 May 2000 08:51:11 +0000
Subject: 	* generic/tclCmdMZ.c (Tcl_RegsubObjCmd): reworked to operate
 in 	Unicode, tweaked for performance. 	(Tcl_StringObjCmd) changed
 STR_FIRST/STR_LAST error message to 	something more understandable,
 reworked STR_FIRST, STR_LAST, 	STR_MAP, STR_MATCH, STR_RANGE, STR_REPLACE to
 operate in Unicode. 	Removed ineffectual STR_RANGE "special" ByteArray
 support. 	Optimized STR_MAP algorithm, especially optimized for one-pair
 case. 	Fixed possible mem overrun in STR_INDEX bytearray case.

---
 generic/tclCmdMZ.c | 535 +++++++++++++++++++++++++----------------------------
 1 file changed, 253 insertions(+), 282 deletions(-)
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index cbb2f83..cc17067 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -8,12 +8,12 @@
  *
  * Copyright (c) 1987-1993 The Regents of the University of California.
  * Copyright (c) 1994-1997 Sun Microsystems, Inc.
- * Copyright (c) 1998-1999 by Scriptics Corporation.
+ * Copyright (c) 1998-2000 Scriptics Corporation.
  *
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclCmdMZ.c,v 1.26 2000/04/10 21:08:26 ericm Exp $
+ * RCS: @(#) $Id: tclCmdMZ.c,v 1.27 2000/05/26 08:51:11 hobbs Exp $
  */
 
 #include "tclInt.h"
@@ -61,7 +61,7 @@ typedef struct {
 static char *		TraceVarProc _ANSI_ARGS_((ClientData clientData,
 			    Tcl_Interp *interp, char *name1, char *name2,
 			    int flags));
-
+
 /*
  *----------------------------------------------------------------------
  *
@@ -444,11 +444,12 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
     int objc;				/* Number of arguments. */
     Tcl_Obj *CONST objv[];		/* Argument objects. */
 {
-    int i, result, cflags, all, wlen, numMatches, offset;
+    int idx, result, cflags, all, wlen, wsublen, numMatches, offset;
+    int start, end, subStart, subEnd, match;
     Tcl_RegExp regExpr;
+    Tcl_RegExpInfo info;
     Tcl_Obj *resultPtr, *varPtr, *objPtr;
-    Tcl_UniChar *wstring;
-    char *subspec;
+    Tcl_UniChar ch, *wsrc, *wfirstChar, *wstring, *wsubspec;
 
     static char *options[] = {
 	"-all",		"-nocase",	"-expanded",
@@ -465,16 +466,16 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
     all = 0;
     offset = 0;
 
-    for (i = 1; i < objc; i++) {
+    for (idx = 1; idx < objc; idx++) {
 	char *name;
 	int index;
 	
-	name = Tcl_GetString(objv[i]);
+	name = Tcl_GetString(objv[idx]);
 	if (name[0] != '-') {
 	    break;
 	}
-	if (Tcl_GetIndexFromObj(interp, objv[i], options, "switch", TCL_EXACT,
-		&index) != TCL_OK) {
+	if (Tcl_GetIndexFromObj(interp, objv[idx], options, "switch",
+		TCL_EXACT, &index) != TCL_OK) {
 	    return TCL_ERROR;
 	}
 	switch ((enum options) index) {
@@ -503,10 +504,10 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 		break;
 	    }
 	    case REGSUB_START: {
-		if (++i >= objc) {
+		if (++idx >= objc) {
 		    goto endOfForLoop;
 		}
-		if (Tcl_GetIntFromObj(interp, objv[i], &offset) != TCL_OK) {
+		if (Tcl_GetIntFromObj(interp, objv[idx], &offset) != TCL_OK) {
 		    return TCL_ERROR;
 		}
 		if (offset < 0) {
@@ -515,35 +516,36 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 		break;
 	    }
 	    case REGSUB_LAST: {
-		i++;
+		idx++;
 		goto endOfForLoop;
 	    }
 	}
     }
     endOfForLoop:
-    if (objc - i != 4) {
+    if (objc - idx != 4) {
 	Tcl_WrongNumArgs(interp, 1, objv,
 		"?switches? exp string subSpec varName");
 	return TCL_ERROR;
     }
 
-    objv += i;
+    objv += idx;
 
     regExpr = Tcl_GetRegExpFromObj(interp, objv[0], cflags);
     if (regExpr == NULL) {
 	return TCL_ERROR;
     }
 
+    objPtr	= objv[1];
+    wstring	= Tcl_GetUnicode(objPtr);
+    wlen	= Tcl_GetCharLength(objPtr);
+    wsubspec	= Tcl_GetUnicode(objv[2]);
+    wsublen	= Tcl_GetCharLength(objv[2]);
+    varPtr	= objv[3];
+
     result = TCL_OK;
-    resultPtr = Tcl_NewObj();
+    resultPtr = Tcl_NewUnicodeObj(wstring, 0);
     Tcl_IncrRefCount(resultPtr);
 
-    objPtr = objv[1];
-    wlen = Tcl_GetCharLength(objPtr);
-    wstring = Tcl_GetUnicode(objPtr);
-    subspec = Tcl_GetString(objv[2]);
-    varPtr = objv[3];
-
     /*
      * The following loop is to handle multiple matches within the
      * same source string;  each iteration handles one match and its
@@ -553,10 +555,6 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 
     numMatches = 0;
     for ( ; offset < wlen; ) {
-	int start, end, subStart, subEnd, match;
-	char *src, *firstChar;
-	char c;
-	Tcl_RegExpInfo info;
 
 	/*
 	 * The flags argument is set if string is part of a larger string,
@@ -598,22 +596,21 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 	 * subSpec to reduce the number of calls to Tcl_SetVar.
 	 */
 
-	src = subspec;
-	firstChar = subspec;
-	for (c = *src; c != '\0'; src++, c = *src) {
-	    int index;
-    
-	    if (c == '&') {
-		index = 0;
-	    } else if (c == '\\') {
-		c = src[1];
-		if ((c >= '0') && (c <= '9')) {
-		    index = c - '0';
-		} else if ((c == '\\') || (c == '&')) {
-		    Tcl_AppendToObj(resultPtr, firstChar, src - firstChar);
-		    Tcl_AppendToObj(resultPtr, &c, 1);
-		    firstChar = src + 2;
-		    src++;
+	wsrc = wfirstChar = wsubspec;
+	for (ch = *wsrc; ch != '\0'; wsrc++, ch = *wsrc) {
+	    if (ch == '&') {
+		idx = 0;
+	    } else if (ch == '\\') {
+		ch = wsrc[1];
+		if ((ch >= '0') && (ch <= '9')) {
+		    idx = ch - '0';
+		} else if ((ch == '\\') || (ch == '&')) {
+		    *wsrc = ch;
+		    Tcl_AppendUnicodeToObj(resultPtr, wfirstChar,
+			    wsrc - wfirstChar + 1);
+		    *wsrc = '\\';
+		    wfirstChar = wsrc + 2;
+		    wsrc++;
 		    continue;
 		} else {
 		    continue;
@@ -621,24 +618,25 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 	    } else {
 		continue;
 	    }
-	    if (firstChar != src) {
-		Tcl_AppendToObj(resultPtr, firstChar, src - firstChar);
+	    if (wfirstChar != wsrc) {
+		Tcl_AppendUnicodeToObj(resultPtr, wfirstChar,
+			wsrc - wfirstChar);
 	    }
-	    if (index <= info.nsubs) {
-		subStart = info.matches[index].start;
-		subEnd = info.matches[index].end;
+	    if (idx <= info.nsubs) {
+		subStart = info.matches[idx].start;
+		subEnd = info.matches[idx].end;
 		if ((subStart >= 0) && (subEnd >= 0)) {
 		    Tcl_AppendUnicodeToObj(resultPtr,
 			    wstring + offset + subStart, subEnd - subStart);
 		}
 	    }
-	    if (*src == '\\') {
-		src++;
+	    if (*wsrc == '\\') {
+		wsrc++;
 	    }
-	    firstChar = src + 1;
+	    wfirstChar = wsrc + 1;
 	}
-	if (firstChar != src) {
-	    Tcl_AppendToObj(resultPtr, firstChar, src - firstChar);
+	if (wfirstChar != wsrc) {
+	    Tcl_AppendUnicodeToObj(resultPtr, wfirstChar, wsrc - wfirstChar);
 	}
 	if (end == 0) {
 	    /*
@@ -648,8 +646,9 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 
 	    Tcl_AppendUnicodeToObj(resultPtr, wstring + offset, 1);
 	    offset++;
+	} else {
+	    offset += end;
 	}
-	offset += end;
 	if (!all) {
 	    break;
 	}
@@ -675,8 +674,8 @@ Tcl_RegsubObjCmd(dummy, interp, objc, objv)
 	result = TCL_ERROR;
     } else {
 	/*
-	 * Set the interpreter's object result to an integer object holding the
-	 * number of matches. 
+	 * Set the interpreter's object result to an integer object
+	 * holding the number of matches. 
 	 */
 	
 	Tcl_SetIntObj(Tcl_GetObjResult(interp), numMatches);
@@ -1021,10 +1020,10 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	    for (i = 2; i < objc-2; i++) {
 		string2 = Tcl_GetStringFromObj(objv[i], &length2);
 		if ((length2 > 1)
-			&& strncmp(string2, "-nocase", (size_t) length2) == 0) {
+			&& strncmp(string2, "-nocase", (size_t)length2) == 0) {
 		    nocase = 1;
 		} else if ((length2 > 1)
-			&& strncmp(string2, "-length", (size_t) length2) == 0) {
+			&& strncmp(string2, "-length", (size_t)length2) == 0) {
 		    if (i+1 >= objc-2) {
 			goto str_cmp_args;
 		    }
@@ -1103,91 +1102,75 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	    break;
 	}
 	case STR_FIRST: {
-	    register char *p, *end;
-	    int match, utflen, start;
+	    Tcl_UniChar *ustring1, *ustring2;
+	    int match, start;
 
 	    if (objc < 4 || objc > 5) {
 	        Tcl_WrongNumArgs(interp, 2, objv,
-				 "string1 string2 ?startIndex?");
+				 "subString string ?startIndex?");
 		return TCL_ERROR;
 	    }
 
 	    /*
-	     * This algorithm fails on improperly formed UTF strings.
 	     * We are searching string2 for the sequence string1.
 	     */
 
 	    match = -1;
 	    start = 0;
-	    utflen = -1;
-	    string1 = Tcl_GetStringFromObj(objv[2], &length1);
-	    string2 = Tcl_GetStringFromObj(objv[3], &length2);
+	    length2 = -1;
+
+	    ustring1 = Tcl_GetUnicode(objv[2]);
+	    length1  = Tcl_GetCharLength(objv[2]);
+	    ustring2 = Tcl_GetUnicode(objv[3]);
+	    length2  = Tcl_GetCharLength(objv[3]);
 
 	    if (objc == 5) {
 		/*
-		 * If a startIndex is specified, we will need to fast forward
-		 * to that point in the string before we think about a match
+		 * If a startIndex is specified, we will need to fast
+		 * forward to that point in the string before we think
+		 * about a match
 		 */
-		utflen = Tcl_NumUtfChars(string2, length2);
-		if (TclGetIntForIndex(interp, objv[4], utflen-1,
-				      &start) != TCL_OK) {
+		if (TclGetIntForIndex(interp, objv[4], length2 - 1,
+			&start) != TCL_OK) {
 		    return TCL_ERROR;
 		}
-		if (start >= utflen) {
+		if (start >= length2) {
 		    goto str_first_done;
 		} else if (start > 0) {
-		    if (length2 == utflen) {
-			/* no unicode chars */
-			string2 += start;
-			length2 -= start;
-		    } else {
-			char *s = Tcl_UtfAtIndex(string2, start);
-			length2 -= s - string2;
-			string2 = s;
-		    }
+		    ustring2 += start;
+		    length2  -= start;
 		}
 	    }
 
 	    if (length1 > 0) {
-		end = string2 + length2 - length1 + 1;
-		for (p = string2;  p < end;  p++) {
+		register Tcl_UniChar *p, *end;
+
+		end = ustring2 + length2 - length1 + 1;
+		for (p = ustring2;  p < end;  p++) {
 		    /*
 		     * Scan forward to find the first character.
 		     */
-
-		    p = memchr(p, *string1, (unsigned) (end - p));
-		    if (p == NULL) {
-			break;
-		    }
-		    if (memcmp(string1, p, (unsigned) length1) == 0) {
-			match = p - string2;
+		    if ((*p == *ustring1) &&
+			    (Tcl_UniCharNcmp(ustring1, p,
+				    (unsigned long) length1) == 0)) {
+			match = p - ustring2;
 			break;
 		    }
 		}
 	    }
-
 	    /*
 	     * Compute the character index of the matching string by
 	     * counting the number of characters before the match.
 	     */
-	str_first_done:
-	    if (match != -1) {
-		if (objc == 4) {
-		    match = Tcl_NumUtfChars(string2, match);
-		} else if (length2 == utflen) {
-		    /* no unicode chars */
-		    match += start;
-		} else {
-		    match = start + Tcl_NumUtfChars(string2, match);
-		}
+	    if ((match != -1) && (objc == 5)) {
+		match += start;
 	    }
+
+	    str_first_done:
 	    Tcl_SetIntObj(resultPtr, match);
 	    break;
 	}
 	case STR_INDEX: {
-	    char buf[TCL_UTF_MAX];
-	    Tcl_UniChar unichar;
-
 	    if (objc != 4) {
 	        Tcl_WrongNumArgs(interp, 2, objv, "string charIndex");
 		return TCL_ERROR;
@@ -1201,33 +1184,33 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	     */
 
 	    if (objv[2]->typePtr == &tclByteArrayType) {
-
-		string1 = (char *)Tcl_GetByteArrayFromObj(objv[2], &length1);
+		string1 = (char *) Tcl_GetByteArrayFromObj(objv[2], &length1);
 
 		if (TclGetIntForIndex(interp, objv[3], length1 - 1,
 			&index) != TCL_OK) {
 		    return TCL_ERROR;
 		}
-		Tcl_SetByteArrayObj(resultPtr,
-			(unsigned char *)(&string1[index]), 1);
+		if ((index >= 0) && (index < length1)) {
+		    Tcl_SetByteArrayObj(resultPtr,
+			    (unsigned char *)(&string1[index]), 1);
+		}
 	    } else {
-		string1 = Tcl_GetStringFromObj(objv[2], &length1);
-		
 		/*
-		 * convert to Unicode internal rep to calulate what
-		 * 'end' really means.
+		 * Get Unicode char length to calculate what 'end' means.
 		 */
+		length1 = Tcl_GetCharLength(objv[2]);
 
-		length2 = Tcl_GetCharLength(objv[2]);
-    
-		if (TclGetIntForIndex(interp, objv[3], length2 - 1,
+		if (TclGetIntForIndex(interp, objv[3], length1 - 1,
 			&index) != TCL_OK) {
 		    return TCL_ERROR;
 		}
-		if ((index >= 0) && (index < length2)) {
-		    unichar = Tcl_GetUniChar(objv[2], index);
-		    length2 = Tcl_UniCharToUtf((int)unichar, buf);
-		    Tcl_SetStringObj(resultPtr, buf, length2);
+		if ((index >= 0) && (index < length1)) {
+		    char buf[TCL_UTF_MAX];
+		    Tcl_UniChar ch;
+
+		    ch      = Tcl_GetUniChar(objv[2], index);
+		    length1 = Tcl_UniCharToUtf(ch, buf);
+		    Tcl_SetStringObj(resultPtr, buf, length1);
 		}
 	    }
 	    break;
@@ -1275,7 +1258,8 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 			strncmp(string2, "-strict", (size_t) length2) == 0) {
 			strict = 1;
 		    } else if ((length2 > 1) &&
-			       strncmp(string2, "-failindex", (size_t) length2) == 0) {
+			    strncmp(string2, "-failindex",
+				    (size_t) length2) == 0) {
 			if (i+1 >= objc-1) {
 			    Tcl_WrongNumArgs(interp, 3, objv,
 					     "?-strict? ?-failindex var? str");
@@ -1508,78 +1492,63 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	    break;
 	}
 	case STR_LAST: {
-	    register char *p;
-	    int match, utflen, start;
+	    Tcl_UniChar *ustring1, *ustring2, *p;
+	    int match, start;
 
 	    if (objc < 4 || objc > 5) {
 	        Tcl_WrongNumArgs(interp, 2, objv,
-				 "string1 string2 ?startIndex?");
+				 "subString string ?startIndex?");
 		return TCL_ERROR;
 	    }
 
 	    /*
-	     * This algorithm fails on improperly formed UTF strings.
+	     * We are searching string2 for the sequence string1.
 	     */
 
 	    match = -1;
 	    start = 0;
-	    utflen = -1;
-	    string1 = Tcl_GetStringFromObj(objv[2], &length1);
-	    string2 = Tcl_GetStringFromObj(objv[3], &length2);
+	    length2 = -1;
+
+	    ustring1 = Tcl_GetUnicode(objv[2]);
+	    length1  = Tcl_GetCharLength(objv[2]);
+	    ustring2 = Tcl_GetUnicode(objv[3]);
+	    length2  = Tcl_GetCharLength(objv[3]);
 
 	    if (objc == 5) {
 		/*
 		 * If a startIndex is specified, we will need to restrict
 		 * the string range to that char index in the string
 		 */
-		utflen = Tcl_NumUtfChars(string2, length2);
-		if (TclGetIntForIndex(interp, objv[4], utflen-1,
-				      &start) != TCL_OK) {
+		if (TclGetIntForIndex(interp, objv[4], length2 - 1,
+			&start) != TCL_OK) {
 		    return TCL_ERROR;
 		}
 		if (start < 0) {
 		    goto str_last_done;
-		} else if (start < utflen) {
-		    if (length2 == utflen) {
-			/* no unicode chars */
-			p = string2 + start + 1 - length1;
-		    } else {
-			p = Tcl_UtfAtIndex(string2, start+1) - length1;
-		    }
+		} else if (start < length2) {
+		    p = ustring2 + start + 1 - length1;
 		} else {
-		    p = string2 + length2 - length1;
+		    p = ustring2 + length2 - length1;
 		}
 	    } else {
-		p = string2 + length2 - length1;
+		p = ustring2 + length2 - length1;
 	    }
 
 	    if (length1 > 0) {
-		for (;  p >= string2;  p--) {
+		for (; p >= ustring2;  p--) {
 		    /*
 		     * Scan backwards to find the first character.
 		     */
-
-		    while ((p != string2) && (*p != *string1)) {
-			p--;
-		    }
-		    if (memcmp(string1, p, (unsigned) length1) == 0) {
-			match = p - string2;
+		    if ((*p == *ustring1) &&
+			    (memcmp((char *) ustring1, (char *) p, (size_t)
+				    (length1 * sizeof(Tcl_UniChar))) == 0)) {
+			match = p - ustring2;
 			break;
 		    }
 		}
 	    }
 
-	    /*
-	     * Compute the character index of the matching string by counting
-	     * the number of characters before the match.
-	     */
-	str_last_done:
-	    if (match != -1) {
-		if ((objc == 4) || (length2 != utflen)) {
-		    /* only check when we've got unicode chars */
-		    match = Tcl_NumUtfChars(string2, match);
-		}
-	    }
+	    str_last_done:
 	    Tcl_SetIntObj(resultPtr, match);
 	    break;
 	}
@@ -1592,7 +1561,6 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 
 	    if ((enum options) index == STR_BYTELENGTH) {
 		(void) Tcl_GetStringFromObj(objv[2], &length1);
-		Tcl_SetIntObj(resultPtr, length1);
 	    } else {
 		/*
 		 * If we have a ByteArray object, avoid recomputing the
@@ -1603,20 +1571,19 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 
 		if (objv[2]->typePtr == &tclByteArrayType) {
 		    (void) Tcl_GetByteArrayFromObj(objv[2], &length1);
-		    Tcl_SetIntObj(resultPtr, length1);
 		} else {
-		    Tcl_SetIntObj(resultPtr,
-			    Tcl_GetCharLength(objv[2]));
+		    length1 = Tcl_GetCharLength(objv[2]);
 		}
 	    }
+	    Tcl_SetIntObj(resultPtr, length1);
 	    break;
 	}
 	case STR_MAP: {
-	    int uselen, mapElemc, len, nocase = 0;
+	    int mapElemc, nocase = 0;
 	    Tcl_Obj **mapElemv;
-	    char *end;
-	    Tcl_UniChar ch;
-	    int (*str_comp_fn)();
+	    Tcl_UniChar *ustring1, *ustring2, *p, *end;
+	    int (*strCmpFn)(CONST Tcl_UniChar*, CONST Tcl_UniChar*,
+		    unsigned long);
 
 	    if (objc < 4 || objc > 5) {
 	        Tcl_WrongNumArgs(interp, 2, objv, "?-nocase? charMap string");
@@ -1652,63 +1619,111 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 		Tcl_SetStringObj(resultPtr, "char map list unbalanced", -1);
 		return TCL_ERROR;
 	    }
-	    string1 = Tcl_GetStringFromObj(objv[objc-1], &length1);
+	    objc--;
+
+	    ustring1 = Tcl_GetUnicode(objv[objc]);
+	    length1  = Tcl_GetCharLength(objv[objc]);
 	    if (length1 == 0) {
+		/*
+		 * Empty input string, just stop now
+		 */
 		break;
 	    }
-	    end = string1 + length1;
+	    end = ustring1 + length1;
 
-	    if (nocase) {
-		length1 = Tcl_NumUtfChars(string1, length1);
-		str_comp_fn = Tcl_UtfNcasecmp;
-	    } else {
-		str_comp_fn = memcmp;
-	    }
+	    strCmpFn = (nocase) ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;
 
-	    for ( ; string1 < end; string1 += len) {
-		len = Tcl_UtfToUniChar(string1, &ch);
-		for (index = 0; index < mapElemc; index +=2) {
-		    /*
-		     * Get the key string to match on
-		     */
-		    string2 = Tcl_GetStringFromObj(mapElemv[index],
-						   &length2);
-		    if (nocase) {
-			uselen = Tcl_NumUtfChars(string2, length2);
-		    } else {
-			uselen = length2;
+	    /*
+	     * Force result to be Unicode
+	     */
+	    Tcl_SetUnicodeObj(resultPtr, ustring1, 0);
+
+	    if (mapElemc == 2) {
+		/*
+		 * Special case for one map pair which avoids the extra
+		 * for loop and extra calls to get Unicode data.  The
+		 * algorithm is otherwise identical to the multi-pair case.
+		 * This will be >30% faster on larger strings.
+		 */
+		Tcl_UniChar *mapString = Tcl_GetUnicode(mapElemv[1]);
+		int mapLen = Tcl_GetCharLength(mapElemv[1]);
+		ustring2 = Tcl_GetUnicode(mapElemv[0]);
+		length2  = Tcl_GetCharLength(mapElemv[0]);
+		for (p = ustring1; ustring1 < end; ustring1++) {
+		    if ((length2 > 0) &&
+			    (nocase || (*ustring1 == *ustring2)) &&
+			    (strCmpFn(ustring1, ustring2,
+				    (unsigned long) length2) == 0)) {
+			if (p != ustring1) {
+			    Tcl_AppendUnicodeToObj(resultPtr, p,
+				    ustring1 - p);
+			    p = ustring1 + length2;
+			} else {
+			    p += length2;
+			}
+			ustring1 = p - 1;
+
+			Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen);
 		    }
-		    if ((uselen > 0) && (uselen <= length1) &&
-			(str_comp_fn(string2, string1, uselen) == 0)) {
-			/*
-			 * Adjust len to be full length of matched string
-			 * it has to be the BYTE length
-			 */
-			len = length2;
+		}
+	    } else {
+		Tcl_UniChar **mapStrings =
+		    (Tcl_UniChar **) ckalloc((mapElemc * 2)
+			    * sizeof(Tcl_UniChar *));
+		int *mapLens =
+		    (int *) ckalloc((mapElemc * 2) * sizeof(int));
+		/*
+		 * Precompute pointers to the unicode string and length.
+		 * This saves us repeated function calls later,
+		 * significantly speeding up the algorithm.
+		 */
+		for (index = 0; index < mapElemc; index++) {
+		    mapStrings[index] = Tcl_GetUnicode(mapElemv[index]);
+		    mapLens[index]    = Tcl_GetCharLength(mapElemv[index]);
+		}
+		for (p = ustring1; ustring1 < end; ustring1++) {
+		    for (index = 0; index < mapElemc; index += 2) {
 			/*
-			 * Change string2 and length2 to the map value
+			 * Get the key string to match on
 			 */
-			string2 = Tcl_GetStringFromObj(mapElemv[index+1],
-						       &length2);
-			Tcl_AppendToObj(resultPtr, string2, length2);
-			break;
+			ustring2 = mapStrings[index];
+			length2  = mapLens[index];
+			if ((length2 > 0) &&
+				(nocase || (*ustring1 == *ustring2)) &&
+				(strCmpFn(ustring2, ustring1,
+					(unsigned long) length2) == 0)) {
+			    if (p != ustring1) {
+				/*
+				 * Put the skipped chars onto the result first
+				 */
+				Tcl_AppendUnicodeToObj(resultPtr, p,
+					ustring1 - p);
+				p = ustring1 + length2;
+			    } else {
+				p += length2;
+			    }
+			    /*
+			     * Move ustring1 to the last matched char (loop adds 1)
+			     */
+			    ustring1 = p - 1;
+
+			    /*
+			     * Append the map value to the unicode string
+			     */
+			    Tcl_AppendUnicodeToObj(resultPtr,
+				    mapStrings[index+1], mapLens[index+1]);
+			    break;
+			}
 		    }
 		}
-		if (index == mapElemc) {
-		    /*
-		     * No match was found, put the char onto result
-		     */
-		    Tcl_AppendToObj(resultPtr, string1, len);
-		}
+		ckfree((char *) mapStrings);
+		ckfree((char *) mapLens);
+	    }
+	    if (p != ustring1) {
 		/*
-		 * in nocase, length1 is in chars
-		 * otherwise it is in bytes
+		 * Put the rest of the unmapped chars onto result
 		 */
-		if (nocase) {
-		    length1--;
-		} else {
-		    length1 -= len;
-		}
+		Tcl_AppendUnicodeToObj(resultPtr, p, ustring1 - p);
 	    }
 	    break;
 	}
@@ -1734,9 +1749,8 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	    }
 
 	    Tcl_SetBooleanObj(resultPtr,
-			      Tcl_StringCaseMatch(Tcl_GetString(objv[objc-1]),
-						  Tcl_GetString(objv[objc-2]),
-						  nocase));
+		    Tcl_UniCharCaseMatch(Tcl_GetUnicode(objv[objc-1]),
+			    Tcl_GetUnicode(objv[objc-2]), nocase));
 	    break;
 	}
 	case STR_RANGE: {
@@ -1748,64 +1762,24 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	    }
 
 	    /*
-	     * If we have a ByteArray object, avoid indexing in the
-	     * Utf string since the byte array contains one byte per
-	     * character.  Otherwise, use the Unicode string rep to
-	     * get the range.
+	     * Get the length in actual characters.
 	     */
+	    length1 = Tcl_GetCharLength(objv[2]) - 1;
 
-	    if (objv[2]->typePtr == &tclByteArrayType) {
-
-		string1 = (char *)Tcl_GetByteArrayFromObj(objv[2], &length1);
-
-		if (TclGetIntForIndex(interp, objv[3], length1 - 1,
-			&first) != TCL_OK) {
-		    return TCL_ERROR;
-		}
-		if (TclGetIntForIndex(interp, objv[4], length1 - 1,
-			&last) != TCL_OK) {
-		    return TCL_ERROR;
-		}
-		if (first < 0) {
-		    first = 0;
-		}
-		if (last >= length1 - 1) {
-		    last = length1 - 1;
-		}
-		if (last >= first) {
-		    int numBytes = last - first + 1;
-		    resultPtr = Tcl_NewByteArrayObj(
-				(unsigned char *) &string1[first], numBytes);
-		    Tcl_SetObjResult(interp, resultPtr);
-		}
-	    } else {
-		string1 = Tcl_GetStringFromObj(objv[2], &length1);
-		
-		/*
-		 * Convert to Unicode internal rep to calulate length and
-		 * create a result object.
-		 */
+	    if ((TclGetIntForIndex(interp, objv[3], length1, &first) != TCL_OK)
+		    || (TclGetIntForIndex(interp, objv[4], length1,
+			    &last) != TCL_OK)) {
+		return TCL_ERROR;
+	    }
 
-		length2 = Tcl_GetCharLength(objv[2]) - 1;
-    
-		if (TclGetIntForIndex(interp, objv[3], length2,
-			&first) != TCL_OK) {
-		    return TCL_ERROR;
-		}
-		if (TclGetIntForIndex(interp, objv[4], length2,
-			&last) != TCL_OK) {
-		    return TCL_ERROR;
-		}
-		if (first < 0) {
-		    first = 0;
-		}
-		if (last >= length2) {
-		    last = length2;
-		}
-		if (last >= first) {
-		    resultPtr = Tcl_GetRange(objv[2], first, last);
-		    Tcl_SetObjResult(interp, resultPtr);
-		}
+	    if (first < 0) {
+		first = 0;
+	    }
+	    if (last >= length1) {
+		last = length1;
+	    }
+	    if (last >= first) {
+		Tcl_SetObjResult(interp, Tcl_GetRange(objv[2], first, last));
 	    }
 	    break;
 	}
@@ -1830,6 +1804,7 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 	    break;
 	}
 	case STR_REPLACE: {
+	    Tcl_UniChar *ustring1;
 	    int first, last;
 
 	    if (objc < 5 || objc > 6) {
@@ -1838,33 +1813,29 @@ Tcl_StringObjCmd(dummy, interp, objc, objv)
 		return TCL_ERROR;
 	    }
 
-	    string1 = Tcl_GetStringFromObj(objv[2], &length1);
-	    length1 = Tcl_NumUtfChars(string1, length1) - 1;
-	    if (TclGetIntForIndex(interp, objv[3], length1,
-				  &first) != TCL_OK) {
-		return TCL_ERROR;
-	    }
-	    if (TclGetIntForIndex(interp, objv[4], length1,
-		    &last) != TCL_OK) {
+	    ustring1 = Tcl_GetUnicode(objv[2]);
+	    length1  = Tcl_GetCharLength(objv[2]) - 1;
+
+	    if ((TclGetIntForIndex(interp, objv[3], length1, &first) != TCL_OK)
+		    || (TclGetIntForIndex(interp, objv[4], length1,
+			    &last) != TCL_OK)) {
 		return TCL_ERROR;
 	    }
-	    if ((last < first) || (first > length1) || (last < 0)) {
+
+	    if ((last < first) || (last < 0) || (first > length1)) {
 		Tcl_SetObjResult(interp, objv[2]);
 	    } else {
-		char *start, *end;
-
 		if (first < 0) {
 		    first = 0;
 		}
-		start = Tcl_UtfAtIndex(string1, first);
-		end = Tcl_UtfAtIndex(start, ((last > length1) ? length1 : last)
-				     - first + 1);
-	        Tcl_SetStringObj(resultPtr, string1, start - string1);
+
+		Tcl_SetUnicodeObj(resultPtr, ustring1, first);
 		if (objc == 6) {
 		    Tcl_AppendObjToObj(resultPtr, objv[5]);
 		}
 		if (last < length1) {
-		    Tcl_AppendToObj(resultPtr, end, -1);
+		    Tcl_AppendUnicodeToObj(resultPtr, ustring1 + last + 1,
+			    length1 - last);
 		}
 	    }
 	    break;
-- 
cgit v0.12