summaryrefslogtreecommitdiffstats
path: root/tools/findDocWords.tcl
blob: 0b253156e10bb25571df1975ea6646a56695ff4b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# findDocWords.tcl --
#
# This script attempts to find all non-dictionary words in the Tcl or Tk
# documentation tree. It handles the fairly common compoundWord trick our
# docs use, and isn't confused by nroff formatting directives, so it isn't
# just a standard spell check.
#
# Arguments:
#    1: Directory to look for man source files in.
#    2: Path to a plain text dictionary. Try /usr/share/dict/words on Linux.
#
# Copyright © 2024 Donal K Fellows.
# See "license.terms" for the license.

lassign $argv dir dictionary

set f [open $dictionary]
while {[gets $f line] > 0} {
    dict set realWord [string tolower $line] yes
}
close $f
puts "loaded [dict size $realWord] words from dictionary"

set files [glob -directory $dir {*.[13n]}]
set found {}

proc identifyWords {fragment filename} {
    global realWord found
    foreach frag [split [string map {\\fB "" \\fR "" \\fI "" \\fP "" \\0 _} $fragment] _] {
	if {[string is entier $frag]} continue
	set frag [string trim $frag "\\0123456789"]
	if {$frag eq ""} continue
	foreach word [regexp -all -inline {^[a-z]+|[A-Z][a-z]*} $frag] {
	    set word [string tolower $word]
	    if {![dict exists $realWord $word]} {
		dict lappend found $word $filename
	    }
	}
    }
}

foreach fn $files {
    set f [open $fn]
    foreach word [regexp -all -inline {[\\\w]+} [read $f]] {
	identifyWords $word $fn
    }
    close $f
}
set len [tcl::mathfunc::max {*}[lmap word [dict keys $found] {string length $word}]]
foreach word [lsort [dict keys $found]] {
    puts [format "%-${len}s: %s" $word [lindex [dict get $found $word] 0]]
}