summaryrefslogtreecommitdiffstats
path: root/tcllib/modules/stringprep/unicode.tcl
diff options
context:
space:
mode:
Diffstat (limited to 'tcllib/modules/stringprep/unicode.tcl')
-rw-r--r--tcllib/modules/stringprep/unicode.tcl292
1 files changed, 292 insertions, 0 deletions
diff --git a/tcllib/modules/stringprep/unicode.tcl b/tcllib/modules/stringprep/unicode.tcl
new file mode 100644
index 0000000..428f735
--- /dev/null
+++ b/tcllib/modules/stringprep/unicode.tcl
@@ -0,0 +1,292 @@
+# unicode.tcl -*- tcl -*-
+#
+# Implementation of RFC 3454 "Preparation of Internationalized Strings"
+#
+# Copyright (c) 2007 Sergei Golovan
+#
+# See the file "license.terms" for information on usage and redistribution
+# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+#
+# RCS: @(#) $Id: unicode.tcl,v 1.1 2008/01/29 02:18:10 patthoyts Exp $
+
+package require unicode::data 1.0
+
+namespace eval ::unicode {
+ # Hangul constants
+ set SBase 0xac00
+ set LBase 0x1100
+ set VBase 0x1161
+ set TBase 0x11a7
+ set LCount 19
+ set VCount 21
+ set TCount 28
+ set NCount [expr {$VCount * $TCount}]
+ set SCount [expr {$LCount * $NCount}]
+}
+
+########################################################################
+# ::unicode::fromstring converts string to list of integers
+
+proc ::unicode::fromstring {str} {
+ set uclist {}
+ foreach char [split $str ""] {
+ lappend uclist [scan $char %c]
+ }
+ return $uclist
+}
+
+########################################################################
+# ::unicode::tostring converts list of integers to string
+
+proc ::unicode::tostring {uclist} {
+ set res ""
+ foreach num $uclist {
+ append res [format %c $num]
+ }
+ return $res
+}
+
+########################################################################
+# ::unicode::normalize normalizes list of integers according to
+# http://unicode.org/reports/tr15/
+# form is to be D, C, KD, or KC
+
+proc ::unicode::normalize {form uclist} {
+ switch -- $form {
+ D { return [normalizeD $uclist] }
+ C { return [normalizeC $uclist] }
+ KD { return [normalizeKD $uclist] }
+ KC { return [normalizeKC $uclist] }
+ default {
+ return -code error \
+ "::unicode::normalize: Only D, C, KD and KC forms are\
+ allowed"
+ }
+ }
+}
+
+########################################################################
+# ::unicode::normalizeS normalizes string according to
+# http://unicode.org/reports/tr15/
+# form is to be D, C, KD, or KC
+
+proc ::unicode::normalizeS {form str} {
+ switch -- $form {
+ D { return [tostring [normalizeD [fromstring $str]]] }
+ C { return [tostring [normalizeC [fromstring $str]]] }
+ KD { return [tostring [normalizeKD [fromstring $str]]] }
+ KC { return [tostring [normalizeKC [fromstring $str]]] }
+ default {
+ return -code error \
+ "::unicode::normalizeS: Only D, C, KD and KC forms are\
+ allowed"
+ }
+ }
+}
+
+########################################################################
+
+proc ::unicode::normalizeD {uclist} {
+ set res {}
+ foreach uc $uclist {
+ set res [concat $res [decomposeCanonical $uc]]
+ }
+
+ canonicalOrdering $res
+}
+
+proc ::unicode::normalizeC {uclist} {
+ composeCanonical [normalizeD $uclist]
+}
+
+proc ::unicode::normalizeKD {uclist} {
+ set res {}
+ foreach uc $uclist {
+ set res [concat $res [decomposeCompat $uc]]
+ }
+
+ canonicalOrdering $res
+}
+
+proc ::unicode::normalizeKC {uclist} {
+ composeCanonical [normalizeKD $uclist]
+}
+
+########################################################################
+# Adjacent characters with nonzero character class should go in
+# order of increasing character class
+
+proc ::unicode::canonicalOrdering {uclist} {
+ set res {}
+ set slist {}
+ foreach uc $uclist {
+ set cclass [data::GetUniCharCClass $uc]
+ if {$cclass != 0} {
+ lappend slist [list $uc $cclass]
+ } else {
+ foreach s [lsort -integer -index 1 $slist] {
+ lappend res [lindex $s 0]
+ }
+ set slist {}
+ lappend res $uc
+ }
+ }
+ foreach s [lsort -integer -index 1 $slist] {
+ lappend res [lindex $s 0]
+ }
+
+ return $res
+}
+
+########################################################################
+
+proc ::unicode::decomposeHangul {uc} {
+ variable SBase
+ variable LBase
+ variable VBase
+ variable TBase
+ variable LCount
+ variable VCount
+ variable TCount
+ variable NCount
+ variable SCount
+
+ # Hangul decomposition is algorithmic
+ set SIndex [expr {$uc - $SBase}]
+ if {$SIndex >= 0 && $SIndex < $SCount} {
+ set res {}
+ set L [expr {$LBase + $SIndex / $NCount}]
+ set V [expr {$VBase + ($SIndex % $NCount) / $TCount}]
+ set T [expr {$TBase + $SIndex % $TCount}]
+ set res [list $L $V]
+ if {$T != $TBase} {
+ lappend res $T
+ }
+ return $res
+ }
+ return -1
+}
+
+########################################################################
+
+proc ::unicode::decomposeCanonical {uc} {
+ # Try to decompose Hangul first
+ set res [decomposeHangul $uc]
+ if {$res >= 0} {
+ return $res
+ }
+
+ # For others do a lookup in data tables
+ set info [data::GetUniCharDecompInfo $uc]
+ if {$info >= 0} {
+ set res {}
+ foreach c [data::GetDecompList $info] {
+ set res [concat $res [decomposeCanonical $c]]
+ }
+ return $res
+ } else {
+ return [list $uc]
+ }
+}
+
+########################################################################
+
+proc ::unicode::decomposeCompat {uc} {
+ # Try to decompose Hangul first
+ set res [decomposeHangul $uc]
+ if {$res >= 0} {
+ return $res
+ }
+
+ # For others do a lookup in data tables
+ set info [data::GetUniCharDecompCompatInfo $uc]
+ if {$info >= 0} {
+ set res {}
+ foreach c [data::GetDecompList $info] {
+ set res [concat $res [decomposeCompat $c]]
+ }
+ return $res
+ } else {
+ return [list $uc]
+ }
+}
+
+########################################################################
+
+proc ::unicode::composeTwo {uc1 uc2} {
+ variable SBase
+ variable LBase
+ variable VBase
+ variable TBase
+ variable LCount
+ variable VCount
+ variable TCount
+ variable NCount
+ variable SCount
+
+ # Hangul composition is algorithmic
+ if {$uc1 >= $LBase && $uc1 < $LBase + $LCount && \
+ $uc2 >= $VBase && $uc2 < $VBase + $VCount} {
+ return [expr {$SBase + (($uc1 - $LBase) * $VCount + \
+ ($uc2 - $VBase)) * $TCount}]
+ }
+
+ if {$uc1 >= $SBase && $uc1 < $SBase + $SCount && \
+ (($uc1 - $SBase) % $TCount) == 0 && \
+ $uc2 >= $TBase && $uc2 < $TBase + $TCount} {
+ return [expr {$uc1 + $uc2 - $TBase}]
+ }
+
+ # For others do a lookup in data tables
+ set info1 [data::GetUniCharCompInfo $uc1]
+ set res [data::GetCompFirst $uc2 $info1]
+ if {$res != -1} {
+ return $res
+ }
+
+ set info2 [data::GetUniCharCompInfo $uc2]
+ set res [data::GetCompSecond $uc1 $info2]
+ if {$res != -1} {
+ return $res
+ }
+
+ data::GetCompBoth $info1 $info2
+}
+
+########################################################################
+
+proc ::unicode::composeCanonical {uclist} {
+ if {[llength $uclist] == 0} {
+ return {}
+ }
+
+ set res {}
+ set comps {}
+ set ch1 [lindex $uclist 0]
+ set cclass_prev [data::GetUniCharCClass $ch1]
+ foreach ch2 [lrange $uclist 1 end] {
+ set cclass [data::GetUniCharCClass $ch2]
+ if {($cclass_prev == 0 || $cclass > $cclass_prev) && \
+ [set ruc [composeTwo $ch1 $ch2]]} {
+ set ch1 $ruc
+ } else {
+ if {$cclass == 0} {
+ lappend res $ch1
+ set res [concat $res $comps]
+ set comps {}
+ set ch1 $ch2
+ set cclass_prev 0
+ } else {
+ lappend comps $ch2
+ set cclass_prev $cclass
+ }
+ }
+ }
+ lappend res $ch1
+ concat $res $comps
+}
+
+########################################################################
+
+package provide unicode 1.0.0
+