summaryrefslogtreecommitdiffstats
path: root/library/cookiejar/idna.tcl
blob: afc7128e9c2c75a65de4f6cdbb6fccc0bca78e4a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# cookiejar.tcl --
#
#	Implementation of IDNA (Internationalized Domain Names for
#	Applications) encoding/decoding system, built on a punycode engine
#	developed directly from the code in RFC 3492, Appendix C (with
#	substantial modifications).
#
# This implementation includes code from that RFC, translated to Tcl; the
# other parts are:
# Copyright (c) 2014 Donal K. Fellows
#
# See the file "license.terms" for information on usage and redistribution of
# this file, and for a DISCLAIMER OF ALL WARRANTIES.

namespace eval ::tcl::idna {
    namespace ensemble create -command puny -map {
	encode punyencode
	decode punydecode
    }
    namespace ensemble create -command ::tcl::idna -map {
	encode IDNAencode
	decode IDNAdecode
	puny puny
	version {::apply {{} {package present tcl::idna} ::}}
    }

    proc IDNAencode hostname {
	set parts {}
	# Split term from RFC 3490, Sec 3.1
	foreach part [split $hostname "\u002E\u3002\uFF0E\uFF61"] {
	    if {[regexp {[^-A-Za-z0-9]} $part]} {
		if {[regexp {[^-A-Za-z0-9\u00a1-\uffff]} $part ch]} {
		    scan $ch %c c
		    if {$ch < "!" || $ch > "~"} {
			set ch [format "\\u%04x" $c]
		    }
		    throw [list IDNA INVALID_NAME_CHARACTER $ch] \
			"bad character \"$ch\" in DNS name"
		}
		set part xn--[punyencode $part]
		# Length restriction from RFC 5890, Sec 2.3.1
		if {[string length $part] > 63} {
		    throw [list IDNA OVERLONG_PART $part] \
			"hostname part too long"
		}
	    }
	    lappend parts $part
	}
	return [join $parts .]
    }
    proc IDNAdecode hostname {
	set parts {}
	# Split term from RFC 3490, Sec 3.1
	foreach part [split $hostname "\u002E\u3002\uFF0E\uFF61"] {
	    if {[string match -nocase "xn--*" $part]} {
		set part [punydecode [string range $part 4 end]]
	    }
	    lappend parts $part
	}
	return [join $parts .]
    }

    variable digits [split "abcdefghijklmnopqrstuvwxyz0123456789" ""]
    # Bootstring parameters for Punycode
    variable base 36
    variable tmin 1
    variable tmax 26
    variable skew 38
    variable damp 700
    variable initial_bias 72
    variable initial_n 0x80

    variable max_codepoint 0x10FFFF

    proc adapt {delta first numchars} {
	variable base
	variable tmin
	variable tmax
	variable damp
	variable skew

	set delta [expr {$delta / ($first ? $damp : 2)}]
	incr delta [expr {$delta / $numchars}]
	set k 0
	while {$delta > ($base - $tmin) * $tmax / 2} {
	    set delta [expr {$delta / ($base-$tmin)}]
	    incr k $base
	}
	return [expr {$k + ($base-$tmin+1) * $delta / ($delta+$skew)}]
    }

    # Main punycode encoding function
    proc punyencode {string {case ""}} {
	variable digits
	variable tmin
	variable tmax
	variable base
	variable initial_n
	variable initial_bias

	if {![string is boolean $case]} {
	    return -code error "\"$case\" must be boolean"
	}

	set in {}
	foreach char [set string [split $string ""]] {
	    scan $char "%c" ch
	    lappend in $ch
	}
	set output {}

	# Initialize the state:
	set n $initial_n
	set delta 0
	set bias $initial_bias

	# Handle the basic code points:
	foreach ch $string {
	    if {$ch < "\u0080"} {
		if {$case eq ""} {
		    append output $ch
		} elseif {[string is true $case]} {
		    append output [string toupper $ch]
		} elseif {[string is false $case]} {
		    append output [string tolower $ch]
		}
	    }
	}

	set b [string length $output]

	# h is the number of code points that have been handled, b is the
	# number of basic code points.

	if {$b > 0} {
	    append output "-"
	}

	# Main encoding loop:

	for {set h $b} {$h < [llength $in]} {incr delta; incr n} {
	    # All non-basic code points < n have been handled already.  Find
	    # the next larger one:

	    set m inf
	    foreach ch $in {
		if {$ch >= $n && $ch < $m} {
		    set m $ch
		}
	    }

	    # Increase delta enough to advance the decoder's <n,i> state to
	    # <m,0>, but guard against overflow:

	    if {$m-$n > (0xFFFFFFFF-$delta)/($h+1)} {
		throw {PUNYCODE OVERFLOW} "overflow in delta computation"
	    }
	    incr delta [expr {($m-$n) * ($h+1)}]
	    set n $m

	    foreach ch $in {
		if {$ch < $n && ([incr delta] & 0xFFFFFFFF) == 0} {
		    throw {PUNYCODE OVERFLOW} "overflow in delta computation"
		}

		if {$ch != $n} {
		    continue
		}

		# Represent delta as a generalized variable-length integer:

		for {set q $delta; set k $base} true {incr k $base} {
		    set t [expr {min(max($k-$bias, $tmin), $tmax)}]
		    if {$q < $t} {
			break
		    }
		    append output \
			[lindex $digits [expr {$t + ($q-$t)%($base-$t)}]]
		    set q [expr {($q-$t) / ($base-$t)}]
		}

		append output [lindex $digits $q]
		set bias [adapt $delta [expr {$h==$b}] [expr {$h+1}]]
		set delta 0
		incr h
	    }
	}

	return $output
    }

    # Main punycode decode function
    proc punydecode {string {case ""}} {
	variable tmin
	variable tmax
	variable base
	variable initial_n
	variable initial_bias
	variable max_codepoint

	if {![string is boolean $case]} {
	    return -code error "\"$case\" must be boolean"
	}

	# Initialize the state:

	set n $initial_n
	set i 0
	set first 1
	set bias $initial_bias

	# Split the string into the "real" ASCII characters and the ones to
	# feed into the main decoder. Note that we don't need to check the
	# result of [regexp] because that RE will technically match any string
	# at all.

	regexp {^(?:(.*)-)?([^-]*)$} $string -> pre post
	if {[string is true -strict $case]} {
	    set pre [string toupper $pre]
	} elseif {[string is false -strict $case]} {
	    set pre [string tolower $pre]
	}
	set output [split $pre ""]
	set out [llength $output]

	# Main decoding loop:

	for {set in 0} {$in < [string length $post]} {incr in} {
	    # Decode a generalized variable-length integer into delta, which
	    # gets added to i. The overflow checking is easier if we increase
	    # i as we go, then subtract off its starting value at the end to
	    # obtain delta.

	    for {set oldi $i; set w 1; set k $base} 1 {incr in} {
		if {[set ch [string index $post $in]] eq ""} {
		    throw {PUNYCODE BAD_INPUT LENGTH} "exceeded input data"
		}
		if {[string match -nocase {[a-z]} $ch]} {
		    scan [string toupper $ch] %c digit
		    incr digit -65
		} elseif {[string match {[0-9]} $ch]} {
		    set digit [expr {$ch + 26}]
		} else {
		    throw {PUNYCODE BAD_INPUT CHAR} \
			    "bad decode character \"$ch\""
		}
		incr i [expr {$digit * $w}]
		set t [expr {min(max($tmin, $k-$bias), $tmax)}]
		if {$digit < $t} {
		    set bias [adapt [expr {$i-$oldi}] $first [incr out]]
		    set first 0
		    break
		}
		if {[set w [expr {$w * ($base - $t)}]] > 0x7FFFFFFF} {
		    throw {PUNYCODE OVERFLOW} \
			"excessively large integer computed in digit decode"
		}
		incr k $base
	    }

	    # i was supposed to wrap around from out+1 to 0, incrementing n
	    # each time, so we'll fix that now:

	    if {[incr n [expr {$i / $out}]] > 0x7FFFFFFF} {
		throw {PUNYCODE OVERFLOW} \
		    "excessively large integer computed in character choice"
	    } elseif {$n > $max_codepoint} {
		if {$n >= 0x00D800 && $n < 0x00E000} {
		    # Bare surrogate?!
		    throw {PUNYCODE NON_BMP} \
			[format "unsupported character U+%06x" $n]
		}
		throw {PUNYCODE NON_UNICODE} "bad codepoint $n"
	    }
	    set i [expr {$i % $out}]

	    # Insert n at position i of the output:

	    set output [linsert $output $i [format "%c" $n]]
	    incr i
	}

	return [join $output ""]
    }
}

package provide tcl::idna 1.0.1

# Local variables:
# mode: tcl
# fill-column: 78
# End: