Rewrote word.tcl for greater efficiency. [Bug 1764318]

author: dkf <donal.k.fellows@manchester.ac.uk> 2007-08-01 15:17:11 (GMT)
committer: dkf <donal.k.fellows@manchester.ac.uk> 2007-08-01 15:17:11 (GMT)
commit: f4ba6d32f660d1c4353b87eef65be808dfcb31a0 (patch)
tree: 3946de5f0bad36db60101a2809d9aece99113b9e /library
parent: 151923cca7b86c51a2292b427fbf70e75b05bbe3 (diff)
download: tcl-f4ba6d32f660d1c4353b87eef65be808dfcb31a0.zip
tcl-f4ba6d32f660d1c4353b87eef65be808dfcb31a0.tar.gz
tcl-f4ba6d32f660d1c4353b87eef65be808dfcb31a0.tar.bz2
1 files changed, 82 insertions, 68 deletions
diff --git a/library/word.tcl b/library/word.tcl
index 05c3bab..0eb9237 100644
--- a/library/word.tcl
+++ b/library/word.tcl
@@ -1,132 +1,146 @@
 # word.tcl --
 #
-# This file defines various procedures for computing word boundaries
-# in strings.  This file is primarily needed so Tk text and entry
-# widgets behave properly for different platforms.
+# This file defines various procedures for computing word boundaries in
+# strings. This file is primarily needed so Tk text and entry widgets behave
+# properly for different platforms.
 #
 # Copyright (c) 1996 by Sun Microsystems, Inc.
 # Copyright (c) 1998 by Scritpics Corporation.
 #
-# See the file "license.terms" for information on usage and redistribution
-# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
-# 
-# RCS: @(#) $Id: word.tcl,v 1.8 2005/07/23 04:12:49 dgp Exp $
+# See the file "license.terms" for information on usage and redistribution of
+# this file, and for a DISCLAIMER OF ALL WARRANTIES.
+#
+# RCS: @(#) $Id: word.tcl,v 1.9 2007/08/01 15:17:12 dkf Exp $
 
 # The following variables are used to determine which characters are
-# interpreted as white space.  
+# interpreted as white space.
 
 if {$::tcl_platform(platform) eq "windows"} {
     # Windows style - any but a unicode space char
-    set tcl_wordchars "\\S"
-    set tcl_nonwordchars "\\s"
+    set ::tcl_wordchars {\S}
+    set ::tcl_nonwordchars {\s}
 } else {
     # Motif style - any unicode word char (number, letter, or underscore)
-    set tcl_wordchars "\\w"
-    set tcl_nonwordchars "\\W"
+    set ::tcl_wordchars {\w}
+    set ::tcl_nonwordchars {\W}
+}
+
+# Arrange for caches of the real matcher REs to be kept, which enables the REs
+# themselves to be cached for greater performance (and somewhat greater
+# clarity too).
+
+namespace eval ::tcl {
+    variable WordBreakRE
+    array set WordBreakRE {}
+
+    proc UpdateWordBreakREs args {
+	# Ignores the arguments
+	global tcl_wordchars tcl_nonwordchars
+	variable WordBreakRE
+
+	# To keep the RE strings short...
+	set letter $tcl_wordchars
+	set space $tcl_nonwordchars
+
+	set WordBreakRE(after)		"$letter$space|$space$letter"
+	set WordBreakRE(before)		"^.*($letter$space|$space$letter)"
+	set WordBreakRE(end)		"$space*$letter+$space"
+	set WordBreakRE(next)		"$letter*$space+$letter"
+	set WordBreakRE(previous)	"$space*($letter+)$space*\$"
+    }
+
+    # Initialize the cache
+    UpdateWordBreakREs
+    trace add variable ::tcl_wordchars write ::tcl::UpdateWordBreakREs
+    trace add variable ::tcl_nonwordchars write ::tcl::UpdateWordBreakREs
 }
 
 # tcl_wordBreakAfter --
 #
-# This procedure returns the index of the first word boundary
-# after the starting point in the given string, or -1 if there
-# are no more boundaries in the given string.  The index returned refers
-# to the first character of the pair that comprises a boundary.
+# This procedure returns the index of the first word boundary after the
+# starting point in the given string, or -1 if there are no more boundaries in
+# the given string. The index returned refers to the first character of the
+# pair that comprises a boundary.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_wordBreakAfter {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    set str [string range $str $start end]
-    if {[regexp -indices "$tcl_wordchars$tcl_nonwordchars|$tcl_nonwordchars$tcl_wordchars" $str result]} {
-	return [expr {[lindex $result 1] + $start}]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices -start $start $WordBreakRE(after) $str result
+    return [lindex $result 1]
 }
 
 # tcl_wordBreakBefore --
 #
-# This procedure returns the index of the first word boundary
-# before the starting point in the given string, or -1 if there
-# are no more boundaries in the given string.  The index returned
-# refers to the second character of the pair that comprises a boundary.
+# This procedure returns the index of the first word boundary before the
+# starting point in the given string, or -1 if there are no more boundaries in
+# the given string. The index returned refers to the second character of the
+# pair that comprises a boundary.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_wordBreakBefore {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {$start eq "end"} {
-	set start [string length $str]
-    }
-    if {[regexp -indices "^.*($tcl_wordchars$tcl_nonwordchars|$tcl_nonwordchars$tcl_wordchars)" [string range $str 0 $start] result]} {
-	return [lindex $result 1]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices $WordBreakRE(before) [string range $str 0 $start] result
+    return [lindex $result 1]
 }
 
 # tcl_endOfWord --
 #
-# This procedure returns the index of the first end-of-word location
-# after a starting index in the given string.  An end-of-word location
-# is defined to be the first whitespace character following the first
-# non-whitespace character after the starting point.  Returns -1 if
-# there are no more words after the starting point.
+# This procedure returns the index of the first end-of-word location after a
+# starting index in the given string. An end-of-word location is defined to be
+# the first whitespace character following the first non-whitespace character
+# after the starting point. Returns -1 if there are no more words after the
+# starting point.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_endOfWord {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {[regexp -indices "$tcl_nonwordchars*$tcl_wordchars+$tcl_nonwordchars" \
-	    [string range $str $start end] result]} {
-	return [expr {[lindex $result 1] + $start}]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices -start $start $WordBreakRE(end) $str result
+    return [lindex $result 1]
 }
 
 # tcl_startOfNextWord --
 #
-# This procedure returns the index of the first start-of-word location
-# after a starting index in the given string.  A start-of-word
-# location is defined to be a non-whitespace character following a
-# whitespace character.  Returns -1 if there are no more start-of-word
-# locations after the starting point.
+# This procedure returns the index of the first start-of-word location after a
+# starting index in the given string. A start-of-word location is defined to
+# be a non-whitespace character following a whitespace character. Returns -1
+# if there are no more start-of-word locations after the starting point.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_startOfNextWord {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {[regexp -indices "$tcl_wordchars*$tcl_nonwordchars+$tcl_wordchars" \
-	    [string range $str $start end] result]} {
-	return [expr {[lindex $result 1] + $start}]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set result {-1 -1}
+    regexp -indices -start $start $WordBreakRE(next) $str result
+    return [lindex $result 1]
 }
 
 # tcl_startOfPreviousWord --
 #
-# This procedure returns the index of the first start-of-word location
-# before a starting index in the given string.
+# This procedure returns the index of the first start-of-word location before
+# a starting index in the given string.
 #
 # Arguments:
 # str -		String to search.
 # start -	Index into string specifying starting point.
 
 proc tcl_startOfPreviousWord {str start} {
-    global tcl_nonwordchars tcl_wordchars
-    if {$start eq "end"} {
-	set start [string length $str]
-    }
-    if {[regexp -indices \
-	    "$tcl_nonwordchars*($tcl_wordchars+)$tcl_nonwordchars*\$" \
-	    [string range $str 0 [expr {$start - 1}]] result word]} {
-	return [lindex $word 0]
-    }
-    return -1
+    variable ::tcl::WordBreakRE
+    set word {-1 -1}
+    regexp -indices $WordBreakRE(previous) [string range $str 0 $start-1] \
+	    result word
+    return [lindex $word 0]
 }
author	dkf <donal.k.fellows@manchester.ac.uk>	2007-08-01 15:17:11 (GMT)
committer	dkf <donal.k.fellows@manchester.ac.uk>	2007-08-01 15:17:11 (GMT)
commit	f4ba6d32f660d1c4353b87eef65be808dfcb31a0 (patch)
tree	3946de5f0bad36db60101a2809d9aece99113b9e /library
parent	151923cca7b86c51a2292b427fbf70e75b05bbe3 (diff)
download	tcl-f4ba6d32f660d1c4353b87eef65be808dfcb31a0.zip tcl-f4ba6d32f660d1c4353b87eef65be808dfcb31a0.tar.gz tcl-f4ba6d32f660d1c4353b87eef65be808dfcb31a0.tar.bz2