From 3a833016110a7d20d44d1703966edeebd42209d8 Mon Sep 17 00:00:00 2001 From: nijtmans Date: Mon, 18 Oct 2010 21:47:36 +0000 Subject: [Bug 3085863]: tclUniData 9 years old Added testcases for Unicode 6.0 --- ChangeLog | 7 +++++++ generic/regcomp.c | 2 +- tests/utf.test | 34 +++++++++++++++++----------------- tools/uniParse.tcl | 10 +++++++--- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index de50354..8e372eb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2010-10-18 Jan Nijtmans + + * tools/uniParse.tcl: [Bug 3085863]: tclUniData 9 years old + Ignore non-BMP characters and fix comment about UnicodeData.txt file. + * generic/regcomp.c: fix comment + * tests/utf.test: Add some Unicode 6 testcases + 2010-10-17 Alexandre Ferrieux * doc/info.n: Document [info errorstack] faithfully. diff --git a/generic/regcomp.c b/generic/regcomp.c index 9753ca4..d7ae05e 100644 --- a/generic/regcomp.c +++ b/generic/regcomp.c @@ -2131,7 +2131,7 @@ stdump( /* - stid - identify a subtree node for dumping - ^ static char *stid(struct subre *, char *, size_t); + ^ static const char *stid(struct subre *, char *, size_t); */ static const char * /* points to buf or constant string */ stid( diff --git a/tests/utf.test b/tests/utf.test index 575a5cd..3a45d13 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -8,7 +8,7 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: utf.test,v 1.14 2007/05/02 01:37:28 kennykb Exp $ +# RCS: @(#) $Id: utf.test,v 1.15 2010/10/18 21:47:36 nijtmans Exp $ if {[lsearch [namespace children] ::tcltest] == -1} { package require tcltest 2 @@ -276,12 +276,12 @@ test utf-20.1 {TclUniCharNcmp} { } {} test utf-21.1 {TclUniCharIsAlnum} { - # this returns 1 with Unicode 3 compliance - string is alnum \u1040\u021f + # this returns 1 with Unicode 6 compliance + string is alnum \u1040\u021f\u0220 } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { - # this returns 1 with Unicode 3 compliance - list [regexp {^[[:alnum:]]+$} \u1040\u021f] [regexp {^\w+$} \u1040\u021f] + # this returns 1 with Unicode 6 compliance + list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220] } {1 1} test utf-22.1 {TclUniCharIsWordChar} { @@ -292,30 +292,30 @@ test utf-22.2 {TclUniCharIsWordChar} { } 10 test utf-23.1 {TclUniCharIsAlpha} { - # this returns 1 with Unicode 3 compliance - string is alpha \u021f + # this returns 1 with Unicode 6 compliance + string is alpha \u021f\u0220 } {1} test utf-23.2 {unicode alpha char in regc_locale.c} { - # this returns 1 with Unicode 3 compliance - regexp {^[[:alpha:]]+$} \u021f + # this returns 1 with Unicode 6 compliance + regexp {^[[:alpha:]]+$} \u021f\u0220 } {1} test utf-24.1 {TclUniCharIsDigit} { - # this returns 1 with Unicode 3 compliance - string is digit \u1040 + # this returns 1 with Unicode 6 compliance + string is digit \u1040\uabf0 } {1} test utf-24.2 {unicode digit char in regc_locale.c} { - # this returns 1 with Unicode 3 compliance - list [regexp {^[[:digit:]]+$} \u1040] [regexp {^\d+$} \u1040] + # this returns 1 with Unicode 6 compliance + list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0] } {1 1} test utf-24.3 {TclUniCharIsSpace} { - # this returns 1 with Unicode 3 compliance - string is space \u1680 + # this returns 1 with Unicode 6 compliance + string is space \u1680\u180e } {1} test utf-24.4 {unicode space char in regc_locale.c} { - # this returns 1 with Unicode 3 compliance - list [regexp {^[[:space:]]+$} \u1680] [regexp {^\s+$} \u1680] + # this returns 1 with Unicode 6 compliance + list [regexp {^[[:space:]]+$} \u1680\u180e] [regexp {^\s+$} \u1680\u180e] } {1 1} testConstraint teststringobj [llength [info commands teststringobj]] diff --git a/tools/uniParse.tcl b/tools/uniParse.tcl index a7f4237..0ec0848 100644 --- a/tools/uniParse.tcl +++ b/tools/uniParse.tcl @@ -4,12 +4,12 @@ # corresponding tclUniData.c file with compressed character # data tables. The input to this program should be the latest # UnicodeData file from: -# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt +# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt # # Copyright (c) 1998-1999 by Scriptics Corporation. # All rights reserved. # -# RCS: @(#) $Id: uniParse.tcl,v 1.8 2010/10/15 15:25:52 nijtmans Exp $ +# RCS: @(#) $Id: uniParse.tcl,v 1.9 2010/10/18 21:47:36 nijtmans Exp $ namespace eval uni { @@ -116,7 +116,11 @@ proc uni::buildTables {data} { set items [split $line \;] - scan [lindex $items 0] %4x index + scan [lindex $items 0] %x index + if {$index > 0xFFFF} then { + # Ignore non-BMP characters, as long as Tcl doesn't support them + continue + } set index [format 0x%0.4x $index] set gIndex [getGroup [getValue $items $index]] -- cgit v0.12