diff options
Diffstat (limited to 'tcllib/modules/stringprep/unicode.tcl')
-rw-r--r-- | tcllib/modules/stringprep/unicode.tcl | 292 |
1 files changed, 292 insertions, 0 deletions
diff --git a/tcllib/modules/stringprep/unicode.tcl b/tcllib/modules/stringprep/unicode.tcl new file mode 100644 index 0000000..428f735 --- /dev/null +++ b/tcllib/modules/stringprep/unicode.tcl @@ -0,0 +1,292 @@ +# unicode.tcl -*- tcl -*- +# +# Implementation of RFC 3454 "Preparation of Internationalized Strings" +# +# Copyright (c) 2007 Sergei Golovan +# +# See the file "license.terms" for information on usage and redistribution +# of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +# RCS: @(#) $Id: unicode.tcl,v 1.1 2008/01/29 02:18:10 patthoyts Exp $ + +package require unicode::data 1.0 + +namespace eval ::unicode { + # Hangul constants + set SBase 0xac00 + set LBase 0x1100 + set VBase 0x1161 + set TBase 0x11a7 + set LCount 19 + set VCount 21 + set TCount 28 + set NCount [expr {$VCount * $TCount}] + set SCount [expr {$LCount * $NCount}] +} + +######################################################################## +# ::unicode::fromstring converts string to list of integers + +proc ::unicode::fromstring {str} { + set uclist {} + foreach char [split $str ""] { + lappend uclist [scan $char %c] + } + return $uclist +} + +######################################################################## +# ::unicode::tostring converts list of integers to string + +proc ::unicode::tostring {uclist} { + set res "" + foreach num $uclist { + append res [format %c $num] + } + return $res +} + +######################################################################## +# ::unicode::normalize normalizes list of integers according to +# http://unicode.org/reports/tr15/ +# form is to be D, C, KD, or KC + +proc ::unicode::normalize {form uclist} { + switch -- $form { + D { return [normalizeD $uclist] } + C { return [normalizeC $uclist] } + KD { return [normalizeKD $uclist] } + KC { return [normalizeKC $uclist] } + default { + return -code error \ + "::unicode::normalize: Only D, C, KD and KC forms are\ + allowed" + } + } +} + +######################################################################## +# ::unicode::normalizeS normalizes string according to +# http://unicode.org/reports/tr15/ +# form is to be D, C, KD, or KC + +proc ::unicode::normalizeS {form str} { + switch -- $form { + D { return [tostring [normalizeD [fromstring $str]]] } + C { return [tostring [normalizeC [fromstring $str]]] } + KD { return [tostring [normalizeKD [fromstring $str]]] } + KC { return [tostring [normalizeKC [fromstring $str]]] } + default { + return -code error \ + "::unicode::normalizeS: Only D, C, KD and KC forms are\ + allowed" + } + } +} + +######################################################################## + +proc ::unicode::normalizeD {uclist} { + set res {} + foreach uc $uclist { + set res [concat $res [decomposeCanonical $uc]] + } + + canonicalOrdering $res +} + +proc ::unicode::normalizeC {uclist} { + composeCanonical [normalizeD $uclist] +} + +proc ::unicode::normalizeKD {uclist} { + set res {} + foreach uc $uclist { + set res [concat $res [decomposeCompat $uc]] + } + + canonicalOrdering $res +} + +proc ::unicode::normalizeKC {uclist} { + composeCanonical [normalizeKD $uclist] +} + +######################################################################## +# Adjacent characters with nonzero character class should go in +# order of increasing character class + +proc ::unicode::canonicalOrdering {uclist} { + set res {} + set slist {} + foreach uc $uclist { + set cclass [data::GetUniCharCClass $uc] + if {$cclass != 0} { + lappend slist [list $uc $cclass] + } else { + foreach s [lsort -integer -index 1 $slist] { + lappend res [lindex $s 0] + } + set slist {} + lappend res $uc + } + } + foreach s [lsort -integer -index 1 $slist] { + lappend res [lindex $s 0] + } + + return $res +} + +######################################################################## + +proc ::unicode::decomposeHangul {uc} { + variable SBase + variable LBase + variable VBase + variable TBase + variable LCount + variable VCount + variable TCount + variable NCount + variable SCount + + # Hangul decomposition is algorithmic + set SIndex [expr {$uc - $SBase}] + if {$SIndex >= 0 && $SIndex < $SCount} { + set res {} + set L [expr {$LBase + $SIndex / $NCount}] + set V [expr {$VBase + ($SIndex % $NCount) / $TCount}] + set T [expr {$TBase + $SIndex % $TCount}] + set res [list $L $V] + if {$T != $TBase} { + lappend res $T + } + return $res + } + return -1 +} + +######################################################################## + +proc ::unicode::decomposeCanonical {uc} { + # Try to decompose Hangul first + set res [decomposeHangul $uc] + if {$res >= 0} { + return $res + } + + # For others do a lookup in data tables + set info [data::GetUniCharDecompInfo $uc] + if {$info >= 0} { + set res {} + foreach c [data::GetDecompList $info] { + set res [concat $res [decomposeCanonical $c]] + } + return $res + } else { + return [list $uc] + } +} + +######################################################################## + +proc ::unicode::decomposeCompat {uc} { + # Try to decompose Hangul first + set res [decomposeHangul $uc] + if {$res >= 0} { + return $res + } + + # For others do a lookup in data tables + set info [data::GetUniCharDecompCompatInfo $uc] + if {$info >= 0} { + set res {} + foreach c [data::GetDecompList $info] { + set res [concat $res [decomposeCompat $c]] + } + return $res + } else { + return [list $uc] + } +} + +######################################################################## + +proc ::unicode::composeTwo {uc1 uc2} { + variable SBase + variable LBase + variable VBase + variable TBase + variable LCount + variable VCount + variable TCount + variable NCount + variable SCount + + # Hangul composition is algorithmic + if {$uc1 >= $LBase && $uc1 < $LBase + $LCount && \ + $uc2 >= $VBase && $uc2 < $VBase + $VCount} { + return [expr {$SBase + (($uc1 - $LBase) * $VCount + \ + ($uc2 - $VBase)) * $TCount}] + } + + if {$uc1 >= $SBase && $uc1 < $SBase + $SCount && \ + (($uc1 - $SBase) % $TCount) == 0 && \ + $uc2 >= $TBase && $uc2 < $TBase + $TCount} { + return [expr {$uc1 + $uc2 - $TBase}] + } + + # For others do a lookup in data tables + set info1 [data::GetUniCharCompInfo $uc1] + set res [data::GetCompFirst $uc2 $info1] + if {$res != -1} { + return $res + } + + set info2 [data::GetUniCharCompInfo $uc2] + set res [data::GetCompSecond $uc1 $info2] + if {$res != -1} { + return $res + } + + data::GetCompBoth $info1 $info2 +} + +######################################################################## + +proc ::unicode::composeCanonical {uclist} { + if {[llength $uclist] == 0} { + return {} + } + + set res {} + set comps {} + set ch1 [lindex $uclist 0] + set cclass_prev [data::GetUniCharCClass $ch1] + foreach ch2 [lrange $uclist 1 end] { + set cclass [data::GetUniCharCClass $ch2] + if {($cclass_prev == 0 || $cclass > $cclass_prev) && \ + [set ruc [composeTwo $ch1 $ch2]]} { + set ch1 $ruc + } else { + if {$cclass == 0} { + lappend res $ch1 + set res [concat $res $comps] + set comps {} + set ch1 $ch2 + set cclass_prev 0 + } else { + lappend comps $ch2 + set cclass_prev $cclass + } + } + } + lappend res $ch1 + concat $res $comps +} + +######################################################################## + +package provide unicode 1.0.0 + |