From 5c3c63768ab7d042c62d35f3f254d9723c87c83e Mon Sep 17 00:00:00 2001 From: Ritt Konstantin Date: Wed, 24 Feb 2010 20:13:48 +0100 Subject: prefer DerivedNormalizationProps.txt over CompositionExclusions.txt 1) http://www.unicode.org/reports/tr44/ :2.1 > Implementations should simply use the derived properties, > and should not try to rederive them from lists of simple > properties and collections of rules, because of the chances > for error and divergence when doing so. 2) The DerivedNormalizationProps.txt file also provides additional info that can be used to implement the Normalization Form Quick Check algorithm at some later point. Note: this commit *must not* change anything in the (re)generated data; any change in generated data will point to a data inconsistency between the CompositionExclusions.txt, DerivedNormalizationProps.txt and UnicodeData.txt files Merge-request: 480 Reviewed-by: Thiago Macieira --- util/unicode/data/CompositionExclusions.txt | 197 ---------------------------- util/unicode/main.cpp | 39 ++++-- 2 files changed, 28 insertions(+), 208 deletions(-) delete mode 100644 util/unicode/data/CompositionExclusions.txt diff --git a/util/unicode/data/CompositionExclusions.txt b/util/unicode/data/CompositionExclusions.txt deleted file mode 100644 index 8a9b7be..0000000 --- a/util/unicode/data/CompositionExclusions.txt +++ /dev/null @@ -1,197 +0,0 @@ -# CompositionExclusions-5.0.0.txt -# Date: 2006-05-23, 12:42:00 PST [KW] -# -# This file lists the characters for the Composition Exclusion Table -# defined in UAX #15, Unicode Normalization Forms. -# -# This file is a normative contributory data file in the -# Unicode Character Database. -# -# Copyright (c) 1991-2006 Unicode, Inc. 
-# For terms of use, see http://www.unicode.org/terms_of_use.html -# -# For more information, see -# http://www.unicode.org/unicode/reports/tr15/#Primary Exclusion List Table -# -# For a full derivation of composition exclusions, see the derived property -# Full_Composition_Exclusion in DerivedNormalizationProps.txt -# - -# ================================================ -# (1) Script Specifics -# -# This list of characters cannot be derived from the UnicodeData.txt file. -# ================================================ - -0958 # DEVANAGARI LETTER QA -0959 # DEVANAGARI LETTER KHHA -095A # DEVANAGARI LETTER GHHA -095B # DEVANAGARI LETTER ZA -095C # DEVANAGARI LETTER DDDHA -095D # DEVANAGARI LETTER RHA -095E # DEVANAGARI LETTER FA -095F # DEVANAGARI LETTER YYA -09DC # BENGALI LETTER RRA -09DD # BENGALI LETTER RHA -09DF # BENGALI LETTER YYA -0A33 # GURMUKHI LETTER LLA -0A36 # GURMUKHI LETTER SHA -0A59 # GURMUKHI LETTER KHHA -0A5A # GURMUKHI LETTER GHHA -0A5B # GURMUKHI LETTER ZA -0A5E # GURMUKHI LETTER FA -0B5C # ORIYA LETTER RRA -0B5D # ORIYA LETTER RHA -0F43 # TIBETAN LETTER GHA -0F4D # TIBETAN LETTER DDHA -0F52 # TIBETAN LETTER DHA -0F57 # TIBETAN LETTER BHA -0F5C # TIBETAN LETTER DZHA -0F69 # TIBETAN LETTER KSSA -0F76 # TIBETAN VOWEL SIGN VOCALIC R -0F78 # TIBETAN VOWEL SIGN VOCALIC L -0F93 # TIBETAN SUBJOINED LETTER GHA -0F9D # TIBETAN SUBJOINED LETTER DDHA -0FA2 # TIBETAN SUBJOINED LETTER DHA -0FA7 # TIBETAN SUBJOINED LETTER BHA -0FAC # TIBETAN SUBJOINED LETTER DZHA -0FB9 # TIBETAN SUBJOINED LETTER KSSA -FB1D # HEBREW LETTER YOD WITH HIRIQ -FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH -FB2A # HEBREW LETTER SHIN WITH SHIN DOT -FB2B # HEBREW LETTER SHIN WITH SIN DOT -FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT -FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT -FB2E # HEBREW LETTER ALEF WITH PATAH -FB2F # HEBREW LETTER ALEF WITH QAMATS -FB30 # HEBREW LETTER ALEF WITH MAPIQ -FB31 # HEBREW LETTER BET WITH DAGESH -FB32 # HEBREW LETTER GIMEL WITH DAGESH 
-FB33 # HEBREW LETTER DALET WITH DAGESH -FB34 # HEBREW LETTER HE WITH MAPIQ -FB35 # HEBREW LETTER VAV WITH DAGESH -FB36 # HEBREW LETTER ZAYIN WITH DAGESH -FB38 # HEBREW LETTER TET WITH DAGESH -FB39 # HEBREW LETTER YOD WITH DAGESH -FB3A # HEBREW LETTER FINAL KAF WITH DAGESH -FB3B # HEBREW LETTER KAF WITH DAGESH -FB3C # HEBREW LETTER LAMED WITH DAGESH -FB3E # HEBREW LETTER MEM WITH DAGESH -FB40 # HEBREW LETTER NUN WITH DAGESH -FB41 # HEBREW LETTER SAMEKH WITH DAGESH -FB43 # HEBREW LETTER FINAL PE WITH DAGESH -FB44 # HEBREW LETTER PE WITH DAGESH -FB46 # HEBREW LETTER TSADI WITH DAGESH -FB47 # HEBREW LETTER QOF WITH DAGESH -FB48 # HEBREW LETTER RESH WITH DAGESH -FB49 # HEBREW LETTER SHIN WITH DAGESH -FB4A # HEBREW LETTER TAV WITH DAGESH -FB4B # HEBREW LETTER VAV WITH HOLAM -FB4C # HEBREW LETTER BET WITH RAFE -FB4D # HEBREW LETTER KAF WITH RAFE -FB4E # HEBREW LETTER PE WITH RAFE - -# Total code points: 67 - -# ================================================ -# (2) Post Composition Version precomposed characters -# -# These characters cannot be derived solely from the UnicodeData.txt file -# in this version of Unicode. -# -# Note that characters added to the standard after the -# Composition Version and which have canonical decomposition mappings -# are not automatically added to this list of Post Composition -# Version precomposed characters. 
-# ================================================ - -2ADC # FORKING -1D15E # MUSICAL SYMBOL HALF NOTE -1D15F # MUSICAL SYMBOL QUARTER NOTE -1D160 # MUSICAL SYMBOL EIGHTH NOTE -1D161 # MUSICAL SYMBOL SIXTEENTH NOTE -1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE -1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE -1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE -1D1BB # MUSICAL SYMBOL MINIMA -1D1BC # MUSICAL SYMBOL MINIMA BLACK -1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE -1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK -1D1BF # MUSICAL SYMBOL FUSA WHITE -1D1C0 # MUSICAL SYMBOL FUSA BLACK - -# Total code points: 14 - -# ================================================ -# (3) Singleton Decompositions -# -# These characters can be derived from the UnicodeData.txt file -# by including all characters whose canonical decomposition -# consists of a single character. -# -# These characters are simply quoted here for reference. -# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt -# ================================================ - -# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK -# 0343 COMBINING GREEK KORONIS -# 0374 GREEK NUMERAL SIGN -# 037E GREEK QUESTION MARK -# 0387 GREEK ANO TELEIA -# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA -# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA -# 1F75 GREEK SMALL LETTER ETA WITH OXIA -# 1F77 GREEK SMALL LETTER IOTA WITH OXIA -# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA -# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA -# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA -# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA -# 1FBE GREEK PROSGEGRAMMENI -# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA -# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA -# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA -# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA -# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA -# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA -# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA -# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA 
-# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA -# 1FFD GREEK OXIA -# 2000..2001 [2] EN QUAD..EM QUAD -# 2126 OHM SIGN -# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN -# 2329 LEFT-POINTING ANGLE BRACKET -# 232A RIGHT-POINTING ANGLE BRACKET -# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D -# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10 -# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12 -# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E -# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20 -# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22 -# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 -# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D -# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A -# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 -# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D - -# Total code points: 924 - -# ================================================ -# (4) Non-Starter Decompositions -# -# These characters can be derived from the UnicodeData file -# by including all characters whose canonical decomposition consists -# of a sequence of characters, the first of which has a non-zero -# combining class. -# -# These characters are simply quoted here for reference. 
-# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt -# ================================================ - -# 0344 COMBINING GREEK DIALYTIKA TONOS -# 0F73 TIBETAN VOWEL SIGN II -# 0F75 TIBETAN VOWEL SIGN UU -# 0F81 TIBETAN VOWEL SIGN REVERSED II - -# Total code points: 4 - diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 368cd83..c5d04c0 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -946,11 +946,11 @@ static void readDerivedAge() } -static void readCompositionExclusion() +static void readDerivedNormalizationProps() { - QFile f("data/CompositionExclusions.txt"); + QFile f("data/DerivedNormalizationProps.txt"); if (!f.exists()) - qFatal("Couldn't find CompositionExclusions.txt"); + qFatal("Couldn't find DerivedNormalizationProps.txt"); f.open(QFile::ReadOnly); @@ -963,20 +963,36 @@ static void readCompositionExclusion() int comment = line.indexOf('#'); if (comment >= 0) line = line.left(comment); - line.replace(" ", ""); - if (line.isEmpty()) + if (line.trimmed().isEmpty()) continue; - Q_ASSERT(!line.contains("..")); + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + QByteArray propName = l[1].trimmed(); + if (propName != "Full_Composition_Exclusion") + // ### + continue; + + QByteArray codes = l[0].trimmed(); + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); bool ok; - int codepoint = line.toInt(&ok, 16); + int from = cl[0].toInt(&ok, 16); Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } - UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); - d.excludedComposition = true; - unicodeData.insert(codepoint, d); + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + d.excludedComposition = true; + unicodeData.insert(codepoint, d); + } } for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) { @@ -1000,6 +1016,7 @@ static void 
readCompositionExclusion() } } + struct NormalizationCorrection { uint codepoint; uint mapped; @@ -2529,7 +2546,7 @@ int main(int, char **) readBidiMirroring(); readArabicShaping(); readDerivedAge(); - readCompositionExclusion(); + readDerivedNormalizationProps(); readSpecialCasing(); readCaseFolding(); // readBlocks(); -- cgit v0.12