From 635a8263e2a11333a1940f788b530eaeeda89275 Mon Sep 17 00:00:00 2001
From: vincentdarley <vincentdarley>
Date: Wed, 7 Jan 2004 15:20:53 +0000
Subject: utf-8 aware text wordstart and wordend

---
 ChangeLog             |   7 +++
 generic/tkTextIndex.c |  31 +++++++++-----
 tests/textIndex.test  | 115 ++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 126 insertions(+), 27 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index b8a5d28..489682b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2004-01-07  Vince Darley <vincentdarley@users.sourceforge.net>
 
+	* generic/tkTextIndex.c:
+	* tests/textIndex.test: fixed bug in which 'wordstart' and 
+	'wordend' were not utf-8 aware (they haven't been changed since
+	Tk 8.0), and added tests.
+	
+2004-01-07  Vince Darley <vincentdarley@users.sourceforge.net>
+
 	* win/tkWinMenu.c: only provide a submenu handle when the
 	MF_POPUP flag is given, fixing a recently-introduced crash
 	when submenus are disabled.  Also better error checking for
diff --git a/generic/tkTextIndex.c b/generic/tkTextIndex.c
index 667cb64..60138eb 100644
--- a/generic/tkTextIndex.c
+++ b/generic/tkTextIndex.c
@@ -10,7 +10,7 @@
  * See the file "license.terms" for information on usage and redistribution
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  *
- * RCS: @(#) $Id: tkTextIndex.c,v 1.13 2003/12/15 11:51:06 vincentdarley Exp $
+ * RCS: @(#) $Id: tkTextIndex.c,v 1.14 2004/01/07 15:21:02 vincentdarley Exp $
  */
 
 #include "default.h"
@@ -1957,7 +1957,6 @@ StartEnd(textPtr, string, indexPtr)
     TkTextIndex *indexPtr;	/* Index to modify based on string. */
 {
     CONST char *p;
-    int c, offset;
     size_t length;
     register TkTextSegment *segPtr;
     int modifier;
@@ -2024,6 +2023,7 @@ StartEnd(textPtr, string, indexPtr)
     } else if ((*string == 'w') && (strncmp(string, "wordend", length) == 0)
 	    && (length >= 5)) {
 	int firstChar = 1;
+	int offset;
 
 	/*
 	 * If the current character isn't part of a word then just move
@@ -2037,15 +2037,17 @@ StartEnd(textPtr, string, indexPtr)
 	}
 	segPtr = TkTextIndexToSeg(indexPtr, &offset);
 	while (1) {
+	    int chSize = 1;
 	    if (segPtr->typePtr == &tkTextCharType) {
-		c = segPtr->body.chars[offset];
-		if (!isalnum(UCHAR(c)) && (c != '_')) {
+		Tcl_UniChar ch;
+		chSize = TclUtfToUniChar(segPtr->body.chars + offset, &ch);
+		if (!Tcl_UniCharIsWordChar(ch)) {
 		    break;
 		}
 		firstChar = 0;
 	    }
-	    offset += 1;
-	    indexPtr->byteIndex += sizeof(char);
+	    offset += chSize;
+	    indexPtr->byteIndex += chSize;
 	    if (offset >= segPtr->size) {
 		segPtr = TkTextIndexToSeg(indexPtr, &offset);
 	    }
@@ -2062,6 +2064,7 @@ StartEnd(textPtr, string, indexPtr)
     } else if ((*string == 'w') && (strncmp(string, "wordstart", length) == 0)
 	    && (length >= 5)) {
 	int firstChar = 1;
+	int offset;
 
 	if (modifier == TKINDEX_DISPLAY) {
 	    TkTextIndexForwChars(NULL, indexPtr, 0, indexPtr, 
@@ -2076,15 +2079,23 @@ StartEnd(textPtr, string, indexPtr)
 
 	segPtr = TkTextIndexToSeg(indexPtr, &offset);
 	while (1) {
+	    int chSize = 1;
 	    if (segPtr->typePtr == &tkTextCharType) {
-		c = segPtr->body.chars[offset];
-		if (!isalnum(UCHAR(c)) && (c != '_')) {
+		Tcl_UniChar ch;
+		TclUtfToUniChar(segPtr->body.chars + offset, &ch);
+		if (!Tcl_UniCharIsWordChar(ch)) {
 		    break;
 		}
+		if (offset > 0) {
+		    chSize = (segPtr->body.chars + offset 
+			      - Tcl_UtfPrev(segPtr->body.chars + offset, 
+					    segPtr->body.chars));
+		}
 		firstChar = 0;
+	    } else {
 	    }
-	    offset -= 1;
-	    indexPtr->byteIndex -= sizeof(char);
+	    offset -= chSize;
+	    indexPtr->byteIndex -= chSize;
 	    if (offset < 0) {
 		if (indexPtr->byteIndex < 0) {
 		    indexPtr->byteIndex = 0;
diff --git a/tests/textIndex.test b/tests/textIndex.test
index 1c53b17..3186fe0 100644
--- a/tests/textIndex.test
+++ b/tests/textIndex.test
@@ -6,7 +6,7 @@
 # Copyright (c) 1998-1999 by Scriptics Corporation.
 # All rights reserved.
 #
-# RCS: @(#) $Id: textIndex.test,v 1.10 2003/10/31 09:02:17 vincentdarley Exp $
+# RCS: @(#) $Id: textIndex.test,v 1.11 2004/01/07 15:20:53 vincentdarley Exp $
 
 package require tcltest 2.1
 eval tcltest::configure $argv
@@ -645,7 +645,7 @@ test textIndex-15.15 {StartEnd} {
     list [catch {.t index {2.12 word}} msg] $msg
 } {1 {bad text index "2.12 word"}}
 
-test testIndex-16.1 {TkTextPrintIndex} {
+test textIndex-16.1 {TkTextPrintIndex} {
     set t [text .t2]
     $t insert end \n
     $t window create end -window [button $t.b]
@@ -654,7 +654,7 @@ test testIndex-16.1 {TkTextPrintIndex} {
     catch {destroy $t}
 } 0
 
-test testIndex-16.2 {TkTextPrintIndex} {
+test textIndex-16.2 {TkTextPrintIndex} {
     set t [text .t2]
     $t insert end \n
     $t window create end -window [button $t.b]
@@ -663,7 +663,7 @@ test testIndex-16.2 {TkTextPrintIndex} {
     catch {destroy $t}
 } 0
 
-test testIndex-17.1 {Object indices} {
+test textIndex-17.1 {Object indices} {
     set res {}
     set t [text .t2 -height 20]
     for {set i 0} {$i < 100} {incr i} {
@@ -681,7 +681,7 @@ test testIndex-17.1 {Object indices} {
     list $res
 } {{@0,0 1.0 @0,0 37.0}}
 
-test testIndex-18.1 {Object indices don't cache mark names} {
+test textIndex-18.1 {Object indices don't cache mark names} {
     set res {}
     text .t2
     .t2 insert 1.0 1234\n1234\n1234
@@ -737,54 +737,135 @@ set str [string repeat "hello " 20]
 .t insert end "$str one two three four five six seven height nine ten\n"
 .t insert end "$str one two three four five six seven height nine ten\n"
 
-test testIndex-19.1 {Display lines} {
+test textIndex-19.1 {Display lines} {
     .t index "2.7 displaylinestart"
 } {2.0}
 
-test testIndex-19.2 {Display lines} {
+test textIndex-19.2 {Display lines} {
     .t index "2.7 displaylineend"
 } {2.19}
 
-test testIndex-19.3 {Display lines} {
+test textIndex-19.3 {Display lines} {
     .t index "2.30 displaylinestart"
 } {2.20}
 
-test testIndex-19.4 {Display lines} {
+test textIndex-19.4 {Display lines} {
     .t index "2.30 displaylineend"
 } {2.39}
 
-test testIndex-19.5 {Display lines} {
+test textIndex-19.5 {Display lines} {
     .t index "2.40 displaylinestart"
 } {2.40}
 
-test testIndex-19.6 {Display lines} {
+test textIndex-19.6 {Display lines} {
     .t index "2.40 displaylineend"
 } {2.59}
 
-test testIndex-19.7 {Display lines} {
+test textIndex-19.7 {Display lines} {
     .t index "2.7 +1displaylines"
 } {2.27}
 
-test testIndex-19.8 {Display lines} {
+test textIndex-19.8 {Display lines} {
     .t index "2.7 -1displaylines"
 } {1.167}
 
-test testIndex-19.9 {Display lines} {
+test textIndex-19.9 {Display lines} {
     .t index "2.30 +1displaylines"
 } {2.50}
 
-test testIndex-19.10 {Display lines} {
+test textIndex-19.10 {Display lines} {
     .t index "2.30 -1displaylines"
 } {2.10}
 
-test testIndex-19.11 {Display lines} {
+test textIndex-19.11 {Display lines} {
     .t index "2.40 +1displaylines"
 } {2.60}
 
-test testIndex-19.12 {Display lines} {
+test textIndex-19.12 {Display lines} {
     .t index "2.40 -1displaylines"
 } {2.20}
 
+proc text_test_word {startend chars start} {
+    destroy .t
+    text .t
+    .t insert end $chars
+    if {[regexp {end} $start]} {
+	set start [.t index "${start}chars -2c"]
+    } else {
+	set start [.t index "1.0 + ${start}chars"]
+    }
+    if {[.t compare $start >= "end-1c"]} {
+	set start "end-2c"
+    }
+    set res [.t index "$start $startend"]
+    .t count 1.0 $res
+}
+
+# Following tests copied from tests from string wordstart/end in Tcl
+
+test textIndex-21.4 {text index wordend} {
+    text_test_word wordend abc. -1
+} 3
+test textIndex-21.5 {text index wordend} {
+    text_test_word wordend abc. 100
+} 4
+test textIndex-21.6 {text index wordend} {
+    text_test_word wordend "word_one two three" 2
+} 8
+test textIndex-21.7 {text index wordend} {
+    text_test_word wordend "one .&# three" 5
+} 6
+test textIndex-21.8 {text index wordend} {
+    text_test_word worde "x.y" 0
+} 1
+test textIndex-21.9 {text index wordend} {
+    text_test_word worde "x.y" end-1
+} 2
+test textIndex-21.10 {text index wordend, unicode} {
+    text_test_word wordend "xyz\u00c7de fg" 0
+} 6
+test textIndex-21.11 {text index wordend, unicode} {
+    text_test_word wordend "xyz\uc700de fg" 0
+} 6
+test textIndex-21.12 {text index wordend, unicode} {
+    text_test_word wordend "xyz\u203fde fg" 0
+} 6
+test textIndex-21.13 {text index wordend, unicode} {
+    text_test_word wordend "xyz\u2045de fg" 0
+} 3
+test textIndex-21.14 {text index wordend, unicode} {
+    text_test_word wordend "\uc700\uc700 abc" 8
+} 6
+
+test textIndex-22.5 {text index wordstart} {
+    text_test_word wordstart "one two three_words" 400
+} 8
+test textIndex-22.6 {text index wordstart} {
+    text_test_word wordstart "one two three_words" 2
+} 0
+test textIndex-22.7 {text index wordstart} {
+    text_test_word wordstart "one two three_words" -2
+} 0
+test textIndex-22.8 {text index wordstart} {
+    text_test_word wordstart "one .*&^ three" 6
+} 6
+test textIndex-22.9 {text index wordstart} {
+    text_test_word wordstart "one two three" 4
+} 4
+test textIndex-22.10 {text index wordstart} {
+    text_test_word wordstart "one two three" end-5
+} 7
+test textIndex-22.11 {text index wordstart, unicode} {
+    text_test_word wordstart "one tw\u00c7o three" 7
+} 4
+test textIndex-22.12 {text index wordstart, unicode} {
+    text_test_word wordstart "ab\uc700\uc700 cdef ghi" 12
+} 10
+test textIndex-22.13 {text index wordstart, unicode} {
+    text_test_word wordstart "\uc700\uc700 abc" 8
+} 3
+
+
 # cleanup
 rename textimage {}
 catch {destroy .t}
-- 
cgit v0.12