summaryrefslogtreecommitdiffstats
path: root/tcllib/modules/soundex/soundex.tcl
blob: 08ba4faccb20539f30dd066e1342e780a9a8ff47 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# soundex.tcl --
#
#	Implementation of soundex in Tcl
#
# Copyright (c) 2003 by Andreas Kupries <andreas_kupries@users.sourceforge.net>
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
# 
# RCS: @(#) $Id: soundex.tcl,v 1.3 2004/01/15 06:36:14 andreas_kupries Exp $

package require Tcl 8.2

namespace eval ::soundex {}

## ------------------------------------------------------------
##
## I. Soundex by Knuth.

# This implementation of the Soundex algorithm is released to the public
# domain: anyone may use it for any purpose.  See if I care.

# N. Dean Pentcheff 1/13/89 Dept. of Zoology University of California Berkeley,
#    CA  94720 dean@violet.berkeley.edu
# TCL port by Evan Rempel 2/10/98 Dept Comp Services University of Victoria.
# erempel@uvic.ca

# proc ::soundex::knuth ( string )
#
#   Given as argument: a character string. Returns: a static string, 4 characters long
#   This string is the Soundex key for the argument string.
#   Side effects and limitations:
#   Does not clobber the string passed in as the argument. No limit on
#   argument string length. Assumes a character set with continuously
#   ascending and contiguous letters within each case and within the digits
#   (e.g. this works for ASCII and bombs in EBCDIC. But then, most things
#   do.). Reference: Adapted from Knuth, D.E. (1973) The art of computer
#   programming; Volume 3: Sorting and searching.  Addison-Wesley Publishing
#   Company: Reading, Mass. Page 392.
#   Special cases: Leading or embedded spaces, numerals, or punctuation are squeezed
#   out before encoding begins.
#
#   Null strings or those with no encodable letters return the code 'Z000'.
#
#   Test data from Knuth (1973):
#   Euler   Gauss   Hilbert Knuth   Lloyd   Lukasiewicz
#   E460    G200    H416    K530    L300    L222

namespace eval ::soundex {
    variable  soundexKnuthCode
    array set soundexKnuthCode {
	a 0 b 1 c 2 d 3 e 0 f 1 g 2 h 0 i 0 j 2 k 2 l 4 m 5
	n 5 o 0 p 1 q 2 r 6 s 2 t 3 u 0 v 1 w 0 x 2 y 0 z 2
    }
}
proc ::soundex::knuth {in} {
    variable soundexKnuthCode
    set key ""

    # Remove the leading/trailing white space punctuation etc.

    set TempIn [string trim $in "\t\n\r .,'-"]

    # Only use alphabetic characters, so strip out all others
    # also, soundex index uses only lower case chars, so force to lower

    regsub -all {[^a-z]} [string tolower $TempIn] {} TempIn
    if {[string length $TempIn] == 0} {
	return Z000
    }
    set last [string index $TempIn 0]
    set key  [string toupper $last]
    set last $soundexKnuthCode($last)

    # Scan rest of string, stop at end of string or when the key is
    # full

    set count    1
    set MaxIndex [string length $TempIn]

    for {set index 1} {(($count < 4) && ($index < $MaxIndex))} {incr index } {
	set chcode $soundexKnuthCode([string index $TempIn $index])
	# Fold together adjacent letters sharing the same code
	if {![string equal $last $chcode]} {
	    set last $chcode
	    # Ignore code==0 letters except as separators
	    if {$last != 0} then {
		set key $key$last
		incr count
	    }
	}
    }
    return [string range ${key}0000 0 3]
}

package provide soundex 1.0