From f94af92683741d5d67ba648141ce996627f09979 Mon Sep 17 00:00:00 2001
From: hershey <hershey>
Date: Thu, 29 Apr 1999 00:04:41 +0000
Subject: fixed part of bug 1791:  Tcl_UtfToUpper and Tcl_UtfToLower now work
 on badly formed Utf strings.

---
 generic/tclUtf.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 8 deletions(-)
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 89c6b60..a04dca8 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -8,7 +8,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tclUtf.c,v 1.2 1999/04/16 00:46:55 stanton Exp $
+ * RCS: @(#) $Id: tclUtf.c,v 1.3 1999/04/29 00:04:41 hershey Exp $
  */
 
 #include "tclInt.h"
@@ -73,6 +73,12 @@ CONST unsigned char totalBytes[256] = {
 #endif
 };
 
+/*
+ * Forward declarations for procedures used only in this module.
+ */
+
+static int UtfCount _ANSI_ARGS_((int ch));
+
 
 /*
  *---------------------------------------------------------------------------
@@ -80,6 +86,50 @@ CONST unsigned char totalBytes[256] = {
  * Tcl_UniCharToUtf --
  *
  *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
+ *	Utf character "ch".
+ *
+ * Results:
+ *	The return value is the number of bytes in the Utf character "ch".
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+static int
+UtfCount(ch)
+    int ch;			/* Character to measure in UTF-8 bytes. */
+{
+    int size = 3;		/* Used when no range below matches. */
+
+    /* Zero fails the first test, so it is counted as two bytes. */
+
+    if ((ch > 0) && (ch < UNICODE_SELF)) {
+	size = 1;
+    } else if (ch <= 0x7FF) {
+	size = 2;
+    } else if (ch <= 0xFFFF) {
+	size = 3;
+    }
+#if TCL_UTF_MAX > 3
+    else if (ch <= 0x1FFFFF) {
+	size = 4;
+    } else if (ch <= 0x3FFFFFF) {
+	size = 5;
+    } else if (ch <= 0x7FFFFFFF) {
+	size = 6;
+    }
+#endif
+    return size;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UniCharToUtf --
+ *
+ *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
  *	provided buffer.  Equivalent to Plan 9 runetochar().
  *
  * Results:
@@ -849,17 +899,32 @@ int
 Tcl_UtfToUpper(str)
     char *str;			/* String to convert in place. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch, upChar;
     char *src, *dst;
-    
+    int bytes;			/* Bytes consumed from src per char. */
+
     /*
      * Iterate over the string until we hit the terminating null.
      */
 
     src = dst = str;
     while (*src) {
-	src += Tcl_UtfToUniChar(src, &ch);
-	dst += Tcl_UniCharToUtf(Tcl_UniCharToUpper(ch), dst);
+	bytes = Tcl_UtfToUniChar(src, &ch);
+	upChar = Tcl_UniCharToUpper(ch);
+
+	/*
+	 * A badly formed Utf sequence can map to a longer one; keep the
+	 * original bytes then so the string never grows past its buffer.
+	 * dst may trail src by fewer than "bytes" bytes once an earlier
+	 * char has shrunk, so the copy must tolerate overlap (memmove).
+	 */
+	if (bytes < UtfCount(upChar)) {
+	    memmove(dst, src, (size_t) bytes);
+	    dst += bytes;
+	} else {
+	    dst += Tcl_UniCharToUtf(upChar, dst);
+	}
+	src += bytes;
     }
     *dst = '\0';
     return (dst - str);
@@ -887,8 +952,9 @@ int
 Tcl_UtfToLower(str)
     char *str;			/* String to convert in place. */
 {
-    Tcl_UniChar ch;
+    Tcl_UniChar ch, lowChar;
     char *src, *dst;
+    int bytes;			/* Bytes consumed from src per char. */
     
     /*
      * Iterate over the string until we hit the terminating null.
@@ -896,8 +962,22 @@ Tcl_UtfToLower(str)
 
     src = dst = str;
     while (*src) {
-	src += Tcl_UtfToUniChar(src, &ch);
-	dst += Tcl_UniCharToUtf(Tcl_UniCharToLower(ch), dst);
+	bytes = Tcl_UtfToUniChar(src, &ch);
+	lowChar = Tcl_UniCharToLower(ch);
+
+	/*
+	 * A badly formed Utf sequence can map to a longer one; keep the
+	 * original bytes then so the string never grows past its buffer.
+	 * dst may trail src by fewer than "bytes" bytes once an earlier
+	 * char has shrunk, so the copy must tolerate overlap (memmove).
+	 */
+	if (bytes < UtfCount(lowChar)) {
+	    memmove(dst, src, (size_t) bytes);
+	    dst += bytes;
+	} else {
+	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	}
+	src += bytes;
     }
     *dst = '\0';
     return (dst - str);
-- 
cgit v0.12