2 files changed, 108 insertions, 90 deletions
diff --git a/ChangeLog b/ChangeLog
index 4270533..a40d045 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,23 +1,27 @@
+2007-08-01  Donal K. Fellows  <donal.k.fellows@manchester.ac.uk>
+
+	* library/word.tcl: Rewrote for greater efficiency. [Bug 1764318]
+
 2007-08-01  Pat Thoyts  <patthoyts@users.sourceforge.net>
 
 	* generic/tclInt.h:     Added a TclOffset macro ala Tk_Offset to
-	* generic/tclVar.c:     abstract out 'offsetof' which may not be 
+	* generic/tclVar.c:     abstract out 'offsetof' which may not be
 	* generic/tclExceute.c: defined (eg: msvc6).
 
 2007-08-01  Miguel Sofer  <msofer@users.sf.net>
 
 	* generic/tclVar.c (TclCleanupVar): fix [Bug 1765225], thx Larry
-	Virden. 
+	Virden.
 
 2007-07-31  Miguel Sofer  <msofer@users.sf.net>
 
 	* doc/Hash.3:
 	* generic/tclHash.c:
-	* generic/tclObj.c:  
-	* generic/tclThreadStorage.c: (changes part of the patch below) 
-	Stop Tcl_CreateHashVar from resetting hPtr->clientData to NULL
-	after calling the allocEntryProc for a custom table. 
-	
+	* generic/tclObj.c:
+	* generic/tclThreadStorage.c: (changes part of the patch below)
+	Stop Tcl_CreateHashVar from resetting hPtr->clientData to NULL after
+	calling the allocEntryProc for a custom table.
+
 	* generic/tcl.h:
 	* generic/tclBasic.c:
 	* generic/tclCmdIL.c:
@@ -46,27 +50,27 @@
 	malfunction. Var flag values and semantics changed too.
 	2. 'struct Bytecode' has an additional field that has to be
 	initialised to NULL
-	3. 'struct Namespace' is larger, as the varTable is now one
-	pointer larger than a Tcl_HashTable. Direct access to its fields
-	will malfunction.
-	4. 'struct CallFrame' grew one more field (the second such growth
-	with respect to Tcl8.4).
-	5. api change for the functions TclFindCompiledLocal,
-	TclDeleteVars and many internal functions in tclVar.c
-
-	Additionally, direct access to variable hash tables via the
-	standard Tcl_Hash* interface is to be considered as deprecated. It
-	still works in the present version, but will be broken by further 
-	specialisation of these hash tables. This concerns especially the
-	table of array elements in an array, as well as the varTable field
-	in the Namespace struct.
+	3. 'struct Namespace' is larger, as the varTable is now one pointer
+	larger than a Tcl_HashTable. Direct access to its fields will
+	malfunction.
+	4. 'struct CallFrame' grew one more field (the second such growth with
+	respect to Tcl8.4).
+	5. api change for the functions TclFindCompiledLocal, TclDeleteVars
+	and many internal functions in tclVar.c
+
+	Additionally, direct access to variable hash tables via the standard
+	Tcl_Hash* interface is to be considered as deprecated. It still works
+	in the present version, but will be broken by further specialisation
+	of these hash tables. This concerns especially the table of array
+	elements in an array, as well as the varTable field in the Namespace
+	struct.
 
 2007-07-31  Miguel Sofer  <msofer@users.sf.net>
 
 	* unix/configure.in: allow use of 'inline' in Tcl sources
 	* win/configure.in:  [Patch 1754128]
 	* win/makefile.vc:   Regen with autoconf 2.61
-	
+
 2007-07-31  Donal K. Fellows  <donal.k.fellows@manchester.ac.uk>
 
 	* unix/tclUnixInit.c (TclpSetVariables): Use the thread-safe getpwuid
diff --git a/library/word.tcl b/library/word.tcl
index 05c3bab..0eb9237 100644
--- a/library/word.tcl
+++ b/library/word.tcl
@@ -1,132 +1,146 @@
 # word.tcl --
 #
-# This file defines various procedures for computing word boundaries
-# in strings.  This file is primarily needed so Tk text and entry
-# widgets behave properly for different platforms.
+# This file defines various procedures for computing word boundaries in
+# strings. This file is primarily needed so Tk text and entry widgets behave
+# properly for different platforms.
 #
 # Copyright (c) 1996 by Sun Microsystems, Inc.
 # Copyright (c) 1998 by Scritpics Corporation.
 #
-# See the file "license.terms" for information on usage and redistribution
-# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
-# 
-# RCS: @(#) $Id: word.tcl,v 1.8 2005/07/23 04:12:49 dgp Exp $
+# See the file "license.terms" for information on usage and redistribution of
+# this file, and for a DISCLAIMER OF ALL WARRANTIES.
+#
+# RCS: @(#) $Id: word.tcl,v 1.9 2007/08/01 15:17:12 dkf Exp $
 
 # The following variables are used to determine which characters are
-# interpreted as white space.  
+# interpreted as white space.
 
 if {$::tcl_platform(platform) eq "windows"} {
     # Windows style - any but a unicode space char
-    set tcl_wordchars "\\S"
-    set tcl_nonwordchars "\\s"
+    set ::tcl_wordchars {\S}
+    set ::tcl_nonwordchars {\s}
 } else {
     # Motif style - any unicode word char (number, letter, or underscore)
-    set tcl_wordchars "\\w"
-    set tcl_nonwordchars "\\W"
+    set ::tcl_wordchars {\w}
+    set ::tcl_nonwordchars {\W}
+}
+
+# Arrange for caches of the real matcher REs to be kept, which enables the REs
+# themselves to be cached for greater performance (and somewhat greater
+# clarity too).
+
+namespace eval ::tcl {
+    variable WordBreakRE
+    array set WordBreakRE {}
+
+    proc UpdateWordBreakREs args {
+	# Ignores the arguments
+	global tcl_wordchars tcl_nonwordchars
+	variable WordBreakRE
+
+	# To keep the RE strings short...
+	set letter $tcl_wordchars
+	set space $tcl_nonwordchars
+
+	set WordBreakRE(after)		"$letter$space|$space$letter"
+	set WordBreakRE(before)		"^.*($letter$space|$space$letter)"
+	set WordBreakRE(end)		"$space*$letter+$space"
+	set WordBreakRE(next)		"$letter*$space+$letter"
+	set WordBreakRE(previous)	"$space*($letter+)$space*\$"
+    }
+
+    # Initialize the cache
+    UpdateWordBreakREs
+    trace add variable ::tcl_wordchars write ::tcl::UpdateWordBreakREs
+    trace add variable ::tcl_nonwordchars write ::tcl::UpdateWordBreakREs
 }
 
 # tcl_wordBreakAfter --
 #
-# This procedure returns the index of the first word boundary
-# after the starting point in the given string, or -1 if there
-# are no more boundaries in the given string.  The index returned refers
-# to the first character of the pair that comprises a boundary.
+# This procedure returns the index of the first word boundary after the
+# starting point in the given string, or -1 if there are no more boundaries in
+# the given string. The index returned refers to the first character of the
+# pair that comprises a boundary.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_wordBreakAfter {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    set str [string range $str $start end]
-    if {[regexp -indices "$tcl_wordchars$tcl_nonwordchars|$tcl_nonwordchars$tcl_wordchars" $str result]} {
-	return [expr {[lindex $result 1] + $start}]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices -start $start $WordBreakRE(after) $str result
+    return [lindex $result 1]
 }
 
 # tcl_wordBreakBefore --
 #
-# This procedure returns the index of the first word boundary
-# before the starting point in the given string, or -1 if there
-# are no more boundaries in the given string.  The index returned
-# refers to the second character of the pair that comprises a boundary.
+# This procedure returns the index of the first word boundary before the
+# starting point in the given string, or -1 if there are no more boundaries in
+# the given string. The index returned refers to the second character of the
+# pair that comprises a boundary.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_wordBreakBefore {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {$start eq "end"} {
-	set start [string length $str]
-    }
-    if {[regexp -indices "^.*($tcl_wordchars$tcl_nonwordchars|$tcl_nonwordchars$tcl_wordchars)" [string range $str 0 $start] result]} {
-	return [lindex $result 1]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices $WordBreakRE(before) [string range $str 0 $start] result
+    return [lindex $result 1]
 }
 
 # tcl_endOfWord --
 #
-# This procedure returns the index of the first end-of-word location
-# after a starting index in the given string.  An end-of-word location
-# is defined to be the first whitespace character following the first
-# non-whitespace character after the starting point.  Returns -1 if
-# there are no more words after the starting point.
+# This procedure returns the index of the first end-of-word location after a
+# starting index in the given string. An end-of-word location is defined to be
+# the first whitespace character following the first non-whitespace character
+# after the starting point. Returns -1 if there are no more words after the
+# starting point.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_endOfWord {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {[regexp -indices "$tcl_nonwordchars*$tcl_wordchars+$tcl_nonwordchars" \
-	    [string range $str $start end] result]} {
-	return [expr {[lindex $result 1] + $start}]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices -start $start $WordBreakRE(end) $str result
+    return [lindex $result 1]
 }
 
 # tcl_startOfNextWord --
 #
-# This procedure returns the index of the first start-of-word location
-# after a starting index in the given string.  A start-of-word
-# location is defined to be a non-whitespace character following a
-# whitespace character.  Returns -1 if there are no more start-of-word
-# locations after the starting point.
+# This procedure returns the index of the first start-of-word location after a
+# starting index in the given string. A start-of-word location is defined to
+# be a non-whitespace character following a whitespace character. Returns -1
+# if there are no more start-of-word locations after the starting point.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_startOfNextWord {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {[regexp -indices "$tcl_wordchars*$tcl_nonwordchars+$tcl_wordchars" \
-	    [string range $str $start end] result]} {
-	return [expr {[lindex $result 1] + $start}]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices -start $start $WordBreakRE(next) $str result
+    return [lindex $result 1]
 }
 
 # tcl_startOfPreviousWord --
 #
-# This procedure returns the index of the first start-of-word location
-# before a starting index in the given string.
+# This procedure returns the index of the first start-of-word location before
+# a starting index in the given string.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_startOfPreviousWord {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {$start eq "end"} {
-	set start [string length $str]
-    }
-    if {[regexp -indices \
-	    "$tcl_nonwordchars*($tcl_wordchars+)$tcl_nonwordchars*\$" \
-	    [string range $str 0 [expr {$start - 1}]] result word]} {
-	return [lindex $word 0]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set word {-1 -1}
+    regexp -indices $WordBreakRE(previous) [string range $str 0 $start-1] \
+	    result word
+    return [lindex $word 0]
 }