summaryrefslogtreecommitdiffstats
path: root/util/unicode
diff options
context:
space:
mode:
authorRitt Konstantin <ritt.ks@gmail.com>2010-02-24 19:13:48 (GMT)
committerThiago Macieira <thiago.macieira@nokia.com>2010-03-05 15:58:09 (GMT)
commit5c3c63768ab7d042c62d35f3f254d9723c87c83e (patch)
tree43c9f04629d59043fa4b6b151092083e840bcfef /util/unicode
parent76eb4f454073a30ef62d8bb85862589a13415605 (diff)
downloadQt-5c3c63768ab7d042c62d35f3f254d9723c87c83e.zip
Qt-5c3c63768ab7d042c62d35f3f254d9723c87c83e.tar.gz
Qt-5c3c63768ab7d042c62d35f3f254d9723c87c83e.tar.bz2
prefer DerivedNormalizationProps.txt over CompositionExclusions.txt
1) http://www.unicode.org/reports/tr44/ section 2.1 states:
> Implementations should simply use the derived properties,
> and should not try to rederive them from lists of simple
> properties and collections of rules, because of the chances
> for error and divergence when doing so.
2) The DerivedNormalizationProps.txt file also provides additional information that can be used later to implement the Normalization Form Quick Check algorithm.
Note: this commit *must not* change anything in the (re)generated data; any change in the generated data would indicate an inconsistency between the CompositionExclusions.txt, DerivedNormalizationProps.txt and UnicodeData.txt files.
Merge-request: 480
Reviewed-by: Thiago Macieira <thiago.macieira@nokia.com>
Diffstat (limited to 'util/unicode')
-rw-r--r--util/unicode/data/CompositionExclusions.txt197
-rw-r--r--util/unicode/main.cpp39
2 files changed, 28 insertions, 208 deletions
diff --git a/util/unicode/data/CompositionExclusions.txt b/util/unicode/data/CompositionExclusions.txt
deleted file mode 100644
index 8a9b7be..0000000
--- a/util/unicode/data/CompositionExclusions.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-# CompositionExclusions-5.0.0.txt
-# Date: 2006-05-23, 12:42:00 PST [KW]
-#
-# This file lists the characters for the Composition Exclusion Table
-# defined in UAX #15, Unicode Normalization Forms.
-#
-# This file is a normative contributory data file in the
-# Unicode Character Database.
-#
-# Copyright (c) 1991-2006 Unicode, Inc.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
-#
-# For more information, see
-# http://www.unicode.org/unicode/reports/tr15/#Primary Exclusion List Table
-#
-# For a full derivation of composition exclusions, see the derived property
-# Full_Composition_Exclusion in DerivedNormalizationProps.txt
-#
-
-# ================================================
-# (1) Script Specifics
-#
-# This list of characters cannot be derived from the UnicodeData.txt file.
-# ================================================
-
-0958 # DEVANAGARI LETTER QA
-0959 # DEVANAGARI LETTER KHHA
-095A # DEVANAGARI LETTER GHHA
-095B # DEVANAGARI LETTER ZA
-095C # DEVANAGARI LETTER DDDHA
-095D # DEVANAGARI LETTER RHA
-095E # DEVANAGARI LETTER FA
-095F # DEVANAGARI LETTER YYA
-09DC # BENGALI LETTER RRA
-09DD # BENGALI LETTER RHA
-09DF # BENGALI LETTER YYA
-0A33 # GURMUKHI LETTER LLA
-0A36 # GURMUKHI LETTER SHA
-0A59 # GURMUKHI LETTER KHHA
-0A5A # GURMUKHI LETTER GHHA
-0A5B # GURMUKHI LETTER ZA
-0A5E # GURMUKHI LETTER FA
-0B5C # ORIYA LETTER RRA
-0B5D # ORIYA LETTER RHA
-0F43 # TIBETAN LETTER GHA
-0F4D # TIBETAN LETTER DDHA
-0F52 # TIBETAN LETTER DHA
-0F57 # TIBETAN LETTER BHA
-0F5C # TIBETAN LETTER DZHA
-0F69 # TIBETAN LETTER KSSA
-0F76 # TIBETAN VOWEL SIGN VOCALIC R
-0F78 # TIBETAN VOWEL SIGN VOCALIC L
-0F93 # TIBETAN SUBJOINED LETTER GHA
-0F9D # TIBETAN SUBJOINED LETTER DDHA
-0FA2 # TIBETAN SUBJOINED LETTER DHA
-0FA7 # TIBETAN SUBJOINED LETTER BHA
-0FAC # TIBETAN SUBJOINED LETTER DZHA
-0FB9 # TIBETAN SUBJOINED LETTER KSSA
-FB1D # HEBREW LETTER YOD WITH HIRIQ
-FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
-FB2A # HEBREW LETTER SHIN WITH SHIN DOT
-FB2B # HEBREW LETTER SHIN WITH SIN DOT
-FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
-FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
-FB2E # HEBREW LETTER ALEF WITH PATAH
-FB2F # HEBREW LETTER ALEF WITH QAMATS
-FB30 # HEBREW LETTER ALEF WITH MAPIQ
-FB31 # HEBREW LETTER BET WITH DAGESH
-FB32 # HEBREW LETTER GIMEL WITH DAGESH
-FB33 # HEBREW LETTER DALET WITH DAGESH
-FB34 # HEBREW LETTER HE WITH MAPIQ
-FB35 # HEBREW LETTER VAV WITH DAGESH
-FB36 # HEBREW LETTER ZAYIN WITH DAGESH
-FB38 # HEBREW LETTER TET WITH DAGESH
-FB39 # HEBREW LETTER YOD WITH DAGESH
-FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
-FB3B # HEBREW LETTER KAF WITH DAGESH
-FB3C # HEBREW LETTER LAMED WITH DAGESH
-FB3E # HEBREW LETTER MEM WITH DAGESH
-FB40 # HEBREW LETTER NUN WITH DAGESH
-FB41 # HEBREW LETTER SAMEKH WITH DAGESH
-FB43 # HEBREW LETTER FINAL PE WITH DAGESH
-FB44 # HEBREW LETTER PE WITH DAGESH
-FB46 # HEBREW LETTER TSADI WITH DAGESH
-FB47 # HEBREW LETTER QOF WITH DAGESH
-FB48 # HEBREW LETTER RESH WITH DAGESH
-FB49 # HEBREW LETTER SHIN WITH DAGESH
-FB4A # HEBREW LETTER TAV WITH DAGESH
-FB4B # HEBREW LETTER VAV WITH HOLAM
-FB4C # HEBREW LETTER BET WITH RAFE
-FB4D # HEBREW LETTER KAF WITH RAFE
-FB4E # HEBREW LETTER PE WITH RAFE
-
-# Total code points: 67
-
-# ================================================
-# (2) Post Composition Version precomposed characters
-#
-# These characters cannot be derived solely from the UnicodeData.txt file
-# in this version of Unicode.
-#
-# Note that characters added to the standard after the
-# Composition Version and which have canonical decomposition mappings
-# are not automatically added to this list of Post Composition
-# Version precomposed characters.
-# ================================================
-
-2ADC # FORKING
-1D15E # MUSICAL SYMBOL HALF NOTE
-1D15F # MUSICAL SYMBOL QUARTER NOTE
-1D160 # MUSICAL SYMBOL EIGHTH NOTE
-1D161 # MUSICAL SYMBOL SIXTEENTH NOTE
-1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
-1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
-1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
-1D1BB # MUSICAL SYMBOL MINIMA
-1D1BC # MUSICAL SYMBOL MINIMA BLACK
-1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
-1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
-1D1BF # MUSICAL SYMBOL FUSA WHITE
-1D1C0 # MUSICAL SYMBOL FUSA BLACK
-
-# Total code points: 14
-
-# ================================================
-# (3) Singleton Decompositions
-#
-# These characters can be derived from the UnicodeData.txt file
-# by including all characters whose canonical decomposition
-# consists of a single character.
-#
-# These characters are simply quoted here for reference.
-# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
-# ================================================
-
-# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
-# 0343 COMBINING GREEK KORONIS
-# 0374 GREEK NUMERAL SIGN
-# 037E GREEK QUESTION MARK
-# 0387 GREEK ANO TELEIA
-# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
-# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
-# 1F75 GREEK SMALL LETTER ETA WITH OXIA
-# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
-# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
-# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
-# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
-# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
-# 1FBE GREEK PROSGEGRAMMENI
-# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
-# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
-# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
-# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
-# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
-# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
-# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
-# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
-# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
-# 1FFD GREEK OXIA
-# 2000..2001 [2] EN QUAD..EM QUAD
-# 2126 OHM SIGN
-# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
-# 2329 LEFT-POINTING ANGLE BRACKET
-# 232A RIGHT-POINTING ANGLE BRACKET
-# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
-# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
-# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
-# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
-# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
-# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
-# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
-# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
-# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
-# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
-# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
-
-# Total code points: 924
-
-# ================================================
-# (4) Non-Starter Decompositions
-#
-# These characters can be derived from the UnicodeData file
-# by including all characters whose canonical decomposition consists
-# of a sequence of characters, the first of which has a non-zero
-# combining class.
-#
-# These characters are simply quoted here for reference.
-# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
-# ================================================
-
-# 0344 COMBINING GREEK DIALYTIKA TONOS
-# 0F73 TIBETAN VOWEL SIGN II
-# 0F75 TIBETAN VOWEL SIGN UU
-# 0F81 TIBETAN VOWEL SIGN REVERSED II
-
-# Total code points: 4
-
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 368cd83..c5d04c0 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -946,11 +946,11 @@ static void readDerivedAge()
}
-static void readCompositionExclusion()
+static void readDerivedNormalizationProps()
{
- QFile f("data/CompositionExclusions.txt");
+ QFile f("data/DerivedNormalizationProps.txt");
if (!f.exists())
- qFatal("Couldn't find CompositionExclusions.txt");
+ qFatal("Couldn't find DerivedNormalizationProps.txt");
f.open(QFile::ReadOnly);
@@ -963,20 +963,36 @@ static void readCompositionExclusion()
int comment = line.indexOf('#');
if (comment >= 0)
line = line.left(comment);
- line.replace(" ", "");
- if (line.isEmpty())
+ if (line.trimmed().isEmpty())
continue;
- Q_ASSERT(!line.contains(".."));
+ QList<QByteArray> l = line.split(';');
+ Q_ASSERT(l.size() == 2);
+
+ QByteArray propName = l[1].trimmed();
+ if (propName != "Full_Composition_Exclusion")
+ // ###
+ continue;
+
+ QByteArray codes = l[0].trimmed();
+ codes.replace("..", ".");
+ QList<QByteArray> cl = codes.split('.');
bool ok;
- int codepoint = line.toInt(&ok, 16);
+ int from = cl[0].toInt(&ok, 16);
Q_ASSERT(ok);
+ int to = from;
+ if (cl.size() == 2) {
+ to = cl[1].toInt(&ok, 16);
+ Q_ASSERT(ok);
+ }
- UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
- d.excludedComposition = true;
- unicodeData.insert(codepoint, d);
+ for (int codepoint = from; codepoint <= to; ++codepoint) {
+ UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
+ d.excludedComposition = true;
+ unicodeData.insert(codepoint, d);
+ }
}
for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) {
@@ -1000,6 +1016,7 @@ static void readCompositionExclusion()
}
}
+
struct NormalizationCorrection {
uint codepoint;
uint mapped;
@@ -2529,7 +2546,7 @@ int main(int, char **)
readBidiMirroring();
readArabicShaping();
readDerivedAge();
- readCompositionExclusion();
+ readDerivedNormalizationProps();
readSpecialCasing();
readCaseFolding();
// readBlocks();