4 files changed, 353 insertions, 443 deletions
diff --git a/util/unicode/.gitattributes b/util/unicode/.gitattributes
new file mode 100644
index 0000000..772b88f
--- /dev/null
+++ b/util/unicode/.gitattributes
@@ -0,0 +1 @@
+data/*.txt	-crlf
diff --git a/util/unicode/data/CompositionExclusions.txt b/util/unicode/data/CompositionExclusions.txt
deleted file mode 100644
index 8a9b7be..0000000
--- a/util/unicode/data/CompositionExclusions.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-# CompositionExclusions-5.0.0.txt
-# Date: 2006-05-23, 12:42:00 PST [KW]
-#
-# This file lists the characters for the Composition Exclusion Table
-# defined in UAX #15, Unicode Normalization Forms.
-#
-# This file is a normative contributory data file in the
-# Unicode Character Database.
-#
-# Copyright (c) 1991-2006 Unicode, Inc.
-# For terms of use, see http://www.unicode.org/terms_of_use.html
-#
-# For more information, see
-# http://www.unicode.org/unicode/reports/tr15/#Primary Exclusion List Table
-#
-# For a full derivation of composition exclusions, see the derived property
-# Full_Composition_Exclusion in DerivedNormalizationProps.txt
-#
-
-# ================================================
-# (1) Script Specifics
-#
-# This list of characters cannot be derived from the UnicodeData.txt file.
-# ================================================
-
-0958    #  DEVANAGARI LETTER QA
-0959    #  DEVANAGARI LETTER KHHA
-095A    #  DEVANAGARI LETTER GHHA
-095B    #  DEVANAGARI LETTER ZA
-095C    #  DEVANAGARI LETTER DDDHA
-095D    #  DEVANAGARI LETTER RHA
-095E    #  DEVANAGARI LETTER FA
-095F    #  DEVANAGARI LETTER YYA
-09DC    #  BENGALI LETTER RRA
-09DD    #  BENGALI LETTER RHA
-09DF    #  BENGALI LETTER YYA
-0A33    #  GURMUKHI LETTER LLA
-0A36    #  GURMUKHI LETTER SHA
-0A59    #  GURMUKHI LETTER KHHA
-0A5A    #  GURMUKHI LETTER GHHA
-0A5B    #  GURMUKHI LETTER ZA
-0A5E    #  GURMUKHI LETTER FA
-0B5C    #  ORIYA LETTER RRA
-0B5D    #  ORIYA LETTER RHA
-0F43    #  TIBETAN LETTER GHA
-0F4D    #  TIBETAN LETTER DDHA
-0F52    #  TIBETAN LETTER DHA
-0F57    #  TIBETAN LETTER BHA
-0F5C    #  TIBETAN LETTER DZHA
-0F69    #  TIBETAN LETTER KSSA
-0F76    #  TIBETAN VOWEL SIGN VOCALIC R
-0F78    #  TIBETAN VOWEL SIGN VOCALIC L
-0F93    #  TIBETAN SUBJOINED LETTER GHA
-0F9D    #  TIBETAN SUBJOINED LETTER DDHA
-0FA2    #  TIBETAN SUBJOINED LETTER DHA
-0FA7    #  TIBETAN SUBJOINED LETTER BHA
-0FAC    #  TIBETAN SUBJOINED LETTER DZHA
-0FB9    #  TIBETAN SUBJOINED LETTER KSSA
-FB1D    #  HEBREW LETTER YOD WITH HIRIQ
-FB1F    #  HEBREW LIGATURE YIDDISH YOD YOD PATAH
-FB2A    #  HEBREW LETTER SHIN WITH SHIN DOT
-FB2B    #  HEBREW LETTER SHIN WITH SIN DOT
-FB2C    #  HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
-FB2D    #  HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
-FB2E    #  HEBREW LETTER ALEF WITH PATAH
-FB2F    #  HEBREW LETTER ALEF WITH QAMATS
-FB30    #  HEBREW LETTER ALEF WITH MAPIQ
-FB31    #  HEBREW LETTER BET WITH DAGESH
-FB32    #  HEBREW LETTER GIMEL WITH DAGESH
-FB33    #  HEBREW LETTER DALET WITH DAGESH
-FB34    #  HEBREW LETTER HE WITH MAPIQ
-FB35    #  HEBREW LETTER VAV WITH DAGESH
-FB36    #  HEBREW LETTER ZAYIN WITH DAGESH
-FB38    #  HEBREW LETTER TET WITH DAGESH
-FB39    #  HEBREW LETTER YOD WITH DAGESH
-FB3A    #  HEBREW LETTER FINAL KAF WITH DAGESH
-FB3B    #  HEBREW LETTER KAF WITH DAGESH
-FB3C    #  HEBREW LETTER LAMED WITH DAGESH
-FB3E    #  HEBREW LETTER MEM WITH DAGESH
-FB40    #  HEBREW LETTER NUN WITH DAGESH
-FB41    #  HEBREW LETTER SAMEKH WITH DAGESH
-FB43    #  HEBREW LETTER FINAL PE WITH DAGESH
-FB44    #  HEBREW LETTER PE WITH DAGESH
-FB46    #  HEBREW LETTER TSADI WITH DAGESH
-FB47    #  HEBREW LETTER QOF WITH DAGESH
-FB48    #  HEBREW LETTER RESH WITH DAGESH
-FB49    #  HEBREW LETTER SHIN WITH DAGESH
-FB4A    #  HEBREW LETTER TAV WITH DAGESH
-FB4B    #  HEBREW LETTER VAV WITH HOLAM
-FB4C    #  HEBREW LETTER BET WITH RAFE
-FB4D    #  HEBREW LETTER KAF WITH RAFE
-FB4E    #  HEBREW LETTER PE WITH RAFE
-
-# Total code points: 67
-
-# ================================================
-# (2) Post Composition Version precomposed characters
-#
-# These characters cannot be derived solely from the UnicodeData.txt file
-# in this version of Unicode.
-#
-# Note that characters added to the standard after the
-# Composition Version and which have canonical decomposition mappings
-# are not automatically added to this list of Post Composition
-# Version precomposed characters.
-# ================================================
-
-2ADC    #  FORKING
-1D15E   #  MUSICAL SYMBOL HALF NOTE
-1D15F   #  MUSICAL SYMBOL QUARTER NOTE
-1D160   #  MUSICAL SYMBOL EIGHTH NOTE
-1D161   #  MUSICAL SYMBOL SIXTEENTH NOTE
-1D162   #  MUSICAL SYMBOL THIRTY-SECOND NOTE
-1D163   #  MUSICAL SYMBOL SIXTY-FOURTH NOTE
-1D164   #  MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
-1D1BB   #  MUSICAL SYMBOL MINIMA
-1D1BC   #  MUSICAL SYMBOL MINIMA BLACK
-1D1BD   #  MUSICAL SYMBOL SEMIMINIMA WHITE
-1D1BE   #  MUSICAL SYMBOL SEMIMINIMA BLACK
-1D1BF   #  MUSICAL SYMBOL FUSA WHITE
-1D1C0   #  MUSICAL SYMBOL FUSA BLACK
-
-# Total code points: 14
-
-# ================================================
-# (3) Singleton Decompositions
-#
-# These characters can be derived from the UnicodeData.txt file
-# by including all characters whose canonical decomposition
-# consists of a single character.
-#
-# These characters are simply quoted here for reference.
-# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
-# ================================================
-
-# 0340..0341       [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
-# 0343                 COMBINING GREEK KORONIS
-# 0374                 GREEK NUMERAL SIGN
-# 037E                 GREEK QUESTION MARK
-# 0387                 GREEK ANO TELEIA
-# 1F71                 GREEK SMALL LETTER ALPHA WITH OXIA
-# 1F73                 GREEK SMALL LETTER EPSILON WITH OXIA
-# 1F75                 GREEK SMALL LETTER ETA WITH OXIA
-# 1F77                 GREEK SMALL LETTER IOTA WITH OXIA
-# 1F79                 GREEK SMALL LETTER OMICRON WITH OXIA
-# 1F7B                 GREEK SMALL LETTER UPSILON WITH OXIA
-# 1F7D                 GREEK SMALL LETTER OMEGA WITH OXIA
-# 1FBB                 GREEK CAPITAL LETTER ALPHA WITH OXIA
-# 1FBE                 GREEK PROSGEGRAMMENI
-# 1FC9                 GREEK CAPITAL LETTER EPSILON WITH OXIA
-# 1FCB                 GREEK CAPITAL LETTER ETA WITH OXIA
-# 1FD3                 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
-# 1FDB                 GREEK CAPITAL LETTER IOTA WITH OXIA
-# 1FE3                 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
-# 1FEB                 GREEK CAPITAL LETTER UPSILON WITH OXIA
-# 1FEE..1FEF       [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
-# 1FF9                 GREEK CAPITAL LETTER OMICRON WITH OXIA
-# 1FFB                 GREEK CAPITAL LETTER OMEGA WITH OXIA
-# 1FFD                 GREEK OXIA
-# 2000..2001       [2] EN QUAD..EM QUAD
-# 2126                 OHM SIGN
-# 212A..212B       [2] KELVIN SIGN..ANGSTROM SIGN
-# 2329                 LEFT-POINTING ANGLE BRACKET
-# 232A                 RIGHT-POINTING ANGLE BRACKET
-# F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
-# FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10
-# FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12
-# FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
-# FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20
-# FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22
-# FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
-# FA2A..FA2D       [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
-# FA30..FA6A      [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
-# FA70..FAD9     [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
-# 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
-
-# Total code points: 924
-
-# ================================================
-# (4) Non-Starter Decompositions
-#
-# These characters can be derived from the UnicodeData file
-# by including all characters whose canonical decomposition consists
-# of a sequence of characters, the first of which has a non-zero
-# combining class.
-#
-# These characters are simply quoted here for reference.
-# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
-# ================================================
-
-# 0344                 COMBINING GREEK DIALYTIKA TONOS
-# 0F73                 TIBETAN VOWEL SIGN II
-# 0F75                 TIBETAN VOWEL SIGN UU
-# 0F81                 TIBETAN VOWEL SIGN REVERSED II
-
-# Total code points: 4
-
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index 8f27d4a..f2ebe7c 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -38,36 +38,56 @@
 ** $QT_END_LICENSE$
 **
 ****************************************************************************/
+
 #include <qlist.h>
 #include <qhash.h>
 #include <qfile.h>
+#include <qbytearray.h>
 #include <qstring.h>
 #include <qchar.h>
-#include <private/qunicodetables_p.h>
 #include <qvector.h>
 #include <qdebug.h>
+#if 0
+#include <private/qunicodetables_p.h>
+#endif
 
+#define DATA_VERSION_S "5.0"
+#define DATA_VERSION_STR "QChar::Unicode_5_0"
+
+#define LAST_CODEPOINT 0x10ffff
+#define LAST_CODEPOINT_STR "0x10ffff"
+
+
+static QHash<QByteArray, QChar::UnicodeVersion> age_map; // age string (e.g. "5.0") -> QChar::UnicodeVersion
+// Fills age_map from the table below; readDerivedAge() looks parsed age fields up in it.
+static void initAgeMap()
+{
+    struct AgeMap {
+        const QChar::UnicodeVersion version;
+        const char *age;
+    } ageMap[] = {
+        { QChar::Unicode_1_1,   "1.1" },
+        { QChar::Unicode_2_0,   "2.0" },
+        { QChar::Unicode_2_1_2, "2.1" },
+        { QChar::Unicode_3_0,   "3.0" },
+        { QChar::Unicode_3_1,   "3.1" },
+        { QChar::Unicode_3_2,   "3.2" },
+        { QChar::Unicode_4_0,   "4.0" },
+        { QChar::Unicode_4_1,   "4.1" },
+        { QChar::Unicode_5_0,   "5.0" },
+        { QChar::Unicode_Unassigned, 0 } // terminator row: the null age stops the loop below
+    };
+    AgeMap *d = ageMap;
+    while (d->age) { // walk the table up to (not including) the terminator row
+        age_map.insert(d->age, d->version);
+        ++d;
+    }
+}
 
-static struct AgeMap {
-    const char *age;
-    const QChar::UnicodeVersion version;
-} ageMap [] = {
-    { "1.1", QChar::Unicode_1_1 },
-    { "2.0", QChar::Unicode_2_0 },
-    { "2.1", QChar::Unicode_2_1_2 },
-    { "3.0", QChar::Unicode_3_0 },
-    { "3.1", QChar::Unicode_3_1 },
-    { "3.2", QChar::Unicode_3_2 },
-    { "4.0", QChar::Unicode_4_0 },
-    { "4.1", QChar::Unicode_4_1 },
-    { "5.0", QChar::Unicode_5_0 },
-    { 0, QChar::Unicode_Unassigned }
-};
-#define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0"
 
 static const char *grapheme_break_string =
     "    enum GraphemeBreak {\n"
-    "        GraphemeBreakOther, \n"
+    "        GraphemeBreakOther,\n"
     "        GraphemeBreakCR,\n"
     "        GraphemeBreakLF,\n"
     "        GraphemeBreakControl,\n"
@@ -90,9 +110,11 @@ enum GraphemeBreak {
     GraphemeBreakT,
     GraphemeBreakLV,
     GraphemeBreakLVT
+
+    , GraphemeBreak_Unassigned
 };
 
-QHash<QByteArray, GraphemeBreak> grapheme_break_map;
+static QHash<QByteArray, GraphemeBreak> grapheme_break_map;
 
 static void initGraphemeBreak()
 {
@@ -110,7 +132,7 @@ static void initGraphemeBreak()
         { GraphemeBreakT, "T" },
         { GraphemeBreakLV, "LV" },
         { GraphemeBreakLVT, "LVT" },
-        { GraphemeBreakOther, 0 }
+        { GraphemeBreak_Unassigned, 0 }
     };
     GraphemeBreakList *d = breaks;
     while (d->name) {
@@ -119,7 +141,8 @@ static void initGraphemeBreak()
     }
 }
 
-const char *word_break_string =
+
+static const char *word_break_string =
     "    enum WordBreak {\n"
     "        WordBreakOther,\n"
     "        WordBreakFormat,\n"
@@ -140,10 +163,11 @@ enum WordBreak {
     WordBreakMidNum,
     WordBreakNumeric,
     WordBreakExtendNumLet
-};
 
+    , WordBreak_Unassigned
+};
 
-QHash<QByteArray, WordBreak> word_break_map;
+static QHash<QByteArray, WordBreak> word_break_map;
 
 static void initWordBreak()
 {
@@ -159,7 +183,7 @@ static void initWordBreak()
         { WordBreakMidNum, "MidNum" },
         { WordBreakNumeric, "Numeric" },
         { WordBreakExtendNumLet, "ExtendNumLet" },
-        { WordBreakFormat,  0 }
+        { WordBreak_Unassigned, 0 }
     };
     WordBreakList *d = breaks;
     while (d->name) {
@@ -196,10 +220,11 @@ enum SentenceBreak {
     SentenceBreakATerm,
     SentenceBreakSTerm,
     SentenceBreakClose
-};
 
+    , SentenceBreak_Unassigned
+};
 
-QHash<QByteArray, SentenceBreak> sentence_break_map;
+static QHash<QByteArray, SentenceBreak> sentence_break_map;
 
 static void initSentenceBreak()
 {
@@ -218,7 +243,7 @@ static void initSentenceBreak()
         { SentenceBreakATerm, "ATerm" },
         { SentenceBreakSTerm, "STerm" },
         { SentenceBreakClose, "Close" },
-        { SentenceBreakOther,  0 }
+        { SentenceBreak_Unassigned, 0 }
     };
     SentenceBreakList *d = breaks;
     while (d->name) {
@@ -228,33 +253,7 @@ static void initSentenceBreak()
 }
 
 
-// Keep this one in sync with the code in createPropertyInfo
-const char *property_string =
-    "    struct Properties {\n"
-    "        ushort category : 8;\n"
-    "        ushort line_break_class : 8;\n"
-    "        ushort direction : 8;\n"
-    "        ushort combiningClass :8;\n"
-    "        ushort joining : 2;\n"
-    "        signed short digitValue : 6; /* 5 needed */\n"
-    "        ushort unicodeVersion : 4;\n"
-    "        ushort lowerCaseSpecial : 1;\n"
-    "        ushort upperCaseSpecial : 1;\n"
-    "        ushort titleCaseSpecial : 1;\n"
-    "        ushort caseFoldSpecial : 1; /* currently unused */\n"
-    "        signed short mirrorDiff : 16;\n"
-    "        signed short lowerCaseDiff : 16;\n"
-    "        signed short upperCaseDiff : 16;\n"
-    "        signed short titleCaseDiff : 16;\n"
-    "        signed short caseFoldDiff : 16;\n"
-    "        ushort graphemeBreak : 8;\n"
-    "        ushort wordBreak : 8;\n"
-    "        ushort sentenceBreak : 8;\n"
-    "    };\n"
-    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
-    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
-
-const char *lineBreakClass =
+static const char *lineBreakClass =
     "    // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
     "    // we don't use the XX, AI and CB properties and map them to AL instead.\n"
     "    // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
@@ -268,16 +267,108 @@ const char *lineBreakClass =
     "        LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
     "    };\n\n";
 
-const char *methods =
+enum LineBreakClass { // NOTE(review): appears to mirror the enum text in the lineBreakClass string above - confirm both stay in sync
+    LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,
+    LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,
+    LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,
+    LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,
+    LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,
+    LineBreak_JT, LineBreak_SA, LineBreak_SG,
+    LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK
+    // OP..BK are 0..31; the extra enumerator below is 32 and marks the terminator row in initLineBreak()
+    , LineBreak_Unassigned
+};
+
+static QHash<QByteArray, LineBreakClass> line_break_map; // class abbreviation (e.g. "BK") -> LineBreakClass
+// Fills line_break_map from the table below; the row with a null name ends the loop.
+static void initLineBreak()
+{
+    // ### Classes XX, AI and CB are left out and mapped to AL for now;
+    // ### Class NL is ignored and mapped to AL as well.
+    struct LineBreakList {
+        LineBreakClass brk;
+        const char *name;
+    } breaks[] = {
+        { LineBreak_BK, "BK" },
+        { LineBreak_CR, "CR" },
+        { LineBreak_LF, "LF" },
+        { LineBreak_CM, "CM" },
+        { LineBreak_AL, "NL" },
+        { LineBreak_SG, "SG" },
+        { LineBreak_WJ, "WJ" },
+        { LineBreak_ZW, "ZW" },
+        { LineBreak_GL, "GL" },
+        { LineBreak_SP, "SP" },
+        { LineBreak_B2, "B2" },
+        { LineBreak_BA, "BA" },
+        { LineBreak_BB, "BB" },
+        { LineBreak_HY, "HY" },
+        { LineBreak_AL, "CB" }, // ### CB has no enumerator of its own; folded into AL
+        { LineBreak_CL, "CL" },
+        { LineBreak_EX, "EX" },
+        { LineBreak_IN, "IN" },
+        { LineBreak_NS, "NS" },
+        { LineBreak_OP, "OP" },
+        { LineBreak_QU, "QU" },
+        { LineBreak_IS, "IS" },
+        { LineBreak_NU, "NU" },
+        { LineBreak_PO, "PO" },
+        { LineBreak_PR, "PR" },
+        { LineBreak_SY, "SY" },
+        { LineBreak_AL, "AI" },
+        { LineBreak_AL, "AL" },
+        { LineBreak_H2, "H2" },
+        { LineBreak_H3, "H3" },
+        { LineBreak_ID, "ID" },
+        { LineBreak_JL, "JL" },
+        { LineBreak_JV, "JV" },
+        { LineBreak_JT, "JT" },
+        { LineBreak_SA, "SA" },
+        { LineBreak_AL, "XX" },
+        { LineBreak_Unassigned, 0 } // terminator row: never inserted into the map
+    };
+    LineBreakList *d = breaks;
+    while (d->name) { // walk the table up to (not including) the terminator row
+        line_break_map.insert(d->name, d->brk);
+        ++d;
+    }
+}
+
+
+// C++ text declaring struct Properties; keep this one in sync with the code in createPropertyInfo
+static const char *property_string =
+    "    struct Properties {\n"
+    "        ushort category         : 8; /* 5 needed */\n"
+    "        ushort line_break_class : 8; /* 6 needed */\n"
+    "        ushort direction        : 8; /* 5 needed */\n"
+    "        ushort combiningClass   : 8;\n"
+    "        ushort joining          : 2;\n"
+    "        signed short digitValue : 6; /* 5 needed */\n"
+    "        ushort unicodeVersion   : 4;\n"
+    "        ushort lowerCaseSpecial : 1;\n"
+    "        ushort upperCaseSpecial : 1;\n"
+    "        ushort titleCaseSpecial : 1;\n"
+    "        ushort caseFoldSpecial  : 1; /* currently unused */\n"
+    "        signed short mirrorDiff    : 16;\n"
+    "        signed short lowerCaseDiff : 16;\n"
+    "        signed short upperCaseDiff : 16;\n"
+    "        signed short titleCaseDiff : 16;\n"
+    "        signed short caseFoldDiff  : 16;\n"
+    "        ushort graphemeBreak    : 8; /* 4 needed */\n"
+    "        ushort wordBreak        : 8; /* 4 needed */\n"
+    "        ushort sentenceBreak    : 8; /* 4 needed */\n"
+    "    };\n"
+    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
+    "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
+
+static const char *methods =
     "    Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
-    "    inline int lineBreakClass(const QChar &ch) {\n"
-    "        return QUnicodeTables::lineBreakClass(ch.unicode());\n"
-    "    }\n"
+    "    inline int lineBreakClass(const QChar &ch)\n"
+    "    { return lineBreakClass(ch.unicode()); }\n"
     "\n"
     "    Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
-    "    Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
-    "        return script(ch.unicode());\n"
-    "    }\n\n";
+    "    inline int script(const QChar &ch)\n"
+    "    { return script(ch.unicode()); }\n\n";
 
 
 struct PropertyFlags {
@@ -312,7 +403,7 @@ struct PropertyFlags {
     // from DerivedAge.txt
     QChar::UnicodeVersion age : 4;
     int digitValue;
-    uint line_break_class : 5;
+    uint line_break_class : 6;
 
     int mirrorDiff : 16;
 
@@ -329,8 +420,9 @@ struct PropertyFlags {
     SentenceBreak sentenceBreak;
 };
 
-QList<int> specialCaseMap;
-int specialCaseMaxLen = 0;
+
+static QList<int> specialCaseMap;
+static int specialCaseMaxLen = 0;
 
 static int appendToSpecialCaseMap(const QList<int> &map)
 {
@@ -347,7 +439,7 @@ static int appendToSpecialCaseMap(const QList<int> &map)
     specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size());
     utf16map << 0;
 
-    for (int i = 0; i < specialCaseMap.size() - utf16map.size() - 1; ++i) {
+    for (int i = 0; i < specialCaseMap.size() - utf16map.size() + 1; ++i) {
         int j;
         for (j = 0; j < utf16map.size(); ++j) {
             if (specialCaseMap.at(i+j) != utf16map.at(j))
@@ -364,7 +456,7 @@ static int appendToSpecialCaseMap(const QList<int> &map)
 
 struct UnicodeData {
     UnicodeData(int codepoint = 0) {
-        p.category = QChar::NoCategory;
+        p.category = QChar::Other_NotAssigned; // Cn
         p.combiningClass = 0;
 
         p.direction = QChar::DirL;
@@ -387,7 +479,7 @@ struct UnicodeData {
         p.age = QChar::Unicode_Unassigned;
         p.mirrorDiff = 0;
         p.digitValue = -1;
-        p.line_break_class = QUnicodeTables::LineBreak_AL;
+        p.line_break_class = LineBreak_AL; // XX -> AL
         p.lowerCaseDiff = 0;
         p.upperCaseDiff = 0;
         p.titleCaseDiff = 0;
@@ -438,14 +530,15 @@ enum UniDataFields {
     UD_TitleCase
 };
 
-QHash<QByteArray, QChar::Category> categoryMap;
+
+static QHash<QByteArray, QChar::Category> categoryMap;
 
 static void initCategoryMap()
 {
     struct Cat {
         QChar::Category cat;
         const char *name;
-    } categories [] = {
+    } categories[] = {
         { QChar::Mark_NonSpacing,          "Mn" },
         { QChar::Mark_SpacingCombining,    "Mc" },
         { QChar::Mark_Enclosing,           "Me" },
@@ -485,13 +578,14 @@ static void initCategoryMap()
         { QChar::NoCategory, 0 }
     };
     Cat *c = categories;
-    while (c->cat != QChar::NoCategory) {
+    while (c->name) {
         categoryMap.insert(c->name, c->cat);
         ++c;
     }
 }
 
-QHash<QByteArray, QChar::Direction> directionMap;
+
+static QHash<QByteArray, QChar::Direction> directionMap;
 
 static void initDirectionMap()
 {
@@ -528,7 +622,7 @@ static void initDirectionMap()
 }
 
 
-QHash<QByteArray, QChar::Decomposition> decompositionMap;
+static QHash<QByteArray, QChar::Decomposition> decompositionMap;
 
 static void initDecompositionMap()
 {
@@ -553,7 +647,7 @@ static void initDecompositionMap()
         { QChar::Square, "<square>" },
         { QChar::Compat, "<compat>" },
         { QChar::Fraction, "<fraction>" },
-        { QChar::NoDecomposition,  0 }
+        { QChar::NoDecomposition, 0 }
     };
     Dec *d = decompositions;
     while (d->name) {
@@ -563,28 +657,31 @@ static void initDecompositionMap()
 }
 
 
-QHash<int, UnicodeData> unicodeData;
-QList<PropertyFlags> uniqueProperties;
+static QHash<int, UnicodeData> unicodeData;
+static QList<PropertyFlags> uniqueProperties;
 
 
-QHash<int, int> decompositionLength;
-int highestComposedCharacter = 0;
-int numLigatures = 0;
-int highestLigature = 0;
+static QHash<int, int> decompositionLength;
+static int highestComposedCharacter = 0;
+static int numLigatures = 0;
+static int highestLigature = 0;
 
-struct Ligature {ushort u1; ushort u2; ushort ligature;};
+struct Ligature { // one canonical two-part composition, as built in readDerivedNormalizationProps()
+    ushort u1; // first code point of the decomposition, stored as a 16-bit value
+    ushort u2; // second code point of the decomposition; also the key into ligatureHashes
+    ushort ligature; // the composed code point that decomposes into u1 + u2
+};
 // we need them sorted after the first component for fast lookup
-bool operator < (const Ligature &l1, const Ligature &l2) {
-    return l1.u1 < l2.u1;
-}
+bool operator < (const Ligature &l1, const Ligature &l2)
+{ return l1.u1 < l2.u1; }
 
-QHash<ushort, QList<Ligature> > ligatureHashes;
+static QHash<ushort, QList<Ligature> > ligatureHashes;
 
-QHash<int, int> combiningClassUsage;
+static QHash<int, int> combiningClassUsage;
 
-int maxLowerCaseDiff = 0;
-int maxUpperCaseDiff = 0;
-int maxTitleCaseDiff = 0;
+static int maxLowerCaseDiff = 0;
+static int maxUpperCaseDiff = 0;
+static int maxTitleCaseDiff = 0;
 
 static void readUnicodeData()
 {
@@ -609,6 +706,8 @@ static void readUnicodeData()
         QList<QByteArray> properties = line.split(';');
         bool ok;
         int codepoint = properties[UD_Value].toInt(&ok, 16);
+        Q_ASSERT(ok);
+        Q_ASSERT(codepoint <= LAST_CODEPOINT);
         int lastCodepoint = codepoint;
 
         QByteArray name = properties[UD_Name];
@@ -617,11 +716,16 @@ static void readUnicodeData()
             nextLine.resize(1024);
             f.readLine(nextLine.data(), 1024);
             QList<QByteArray> properties = nextLine.split(';');
+            Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
             lastCodepoint = properties[UD_Value].toInt(&ok, 16);
+            Q_ASSERT(ok);
+            Q_ASSERT(lastCodepoint <= LAST_CODEPOINT);
         }
 
         UnicodeData data(codepoint);
         data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory);
+        if (data.p.category == QChar::NoCategory)
+            qFatal("unassigned char category: %s", properties[UD_Category].constData());
         data.p.combiningClass = properties[UD_CombiningClass].toInt();
 
         if (!combiningClassUsage.contains(data.p.combiningClass))
@@ -634,6 +738,8 @@ static void readUnicodeData()
         if (!properties[UD_UpperCase].isEmpty()) {
             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
             Q_ASSERT(ok);
+            if (qAbs(upperCase - codepoint) >= (1<<14))
+                qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << ")";
             data.p.upperCaseDiff = upperCase - codepoint;
             maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff));
             if (codepoint > 0xffff) {
@@ -644,7 +750,9 @@ static void readUnicodeData()
         }
         if (!properties[UD_LowerCase].isEmpty()) {
             int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
-            Q_ASSERT (ok);
+            Q_ASSERT(ok);
+            if (qAbs(lowerCase - codepoint) >= (1<<14))
+                qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << ")";
             data.p.lowerCaseDiff = lowerCase - codepoint;
             maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff));
             if (codepoint > 0xffff) {
@@ -658,7 +766,9 @@ static void readUnicodeData()
             properties[UD_TitleCase] = properties[UD_UpperCase];
         if (!properties[UD_TitleCase].isEmpty()) {
             int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
-            Q_ASSERT (ok);
+            Q_ASSERT(ok);
+            if (qAbs(titleCase - codepoint) >= (1<<14))
+                qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << ")";
             data.p.titleCaseDiff = titleCase - codepoint;
             maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(data.p.titleCaseDiff));
             if (codepoint > 0xffff) {
@@ -677,13 +787,17 @@ static void readUnicodeData()
             highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
             QList<QByteArray> d = decomposition.split(' ');
             if (d[0].contains('<')) {
-                data.decompositionType = decompositionMap.value(d[0], QChar::Canonical);
+                data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
+                if (data.decompositionType == QChar::NoDecomposition)
+                    qFatal("unassigned decomposition type: %s", d[0].constData());
                 d.takeFirst();
             } else {
                 data.decompositionType = QChar::Canonical;
             }
-            for (int i = 0; i < d.size(); ++i)
+            for (int i = 0; i < d.size(); ++i) {
                 data.decomposition.append(d[i].toInt(&ok, 16));
+                Q_ASSERT(ok);
+            }
             if (!decompositionLength.contains(data.decomposition.size()))
                 decompositionLength[data.decomposition.size()] = 1;
             else
@@ -725,14 +839,14 @@ static void readBidiMirroring()
 
         bool ok;
         int codepoint = pair[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
         int mirror = pair[1].toInt(&ok, 16);
+        Q_ASSERT(ok);
 
         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
         d.mirroredChar = mirror;
-        if (qAbs(codepoint-d.mirroredChar) > maxMirroredDiff)
-            maxMirroredDiff = qAbs(codepoint - d.mirroredChar);
-
         d.p.mirrorDiff = d.mirroredChar - codepoint;
+        maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
         unicodeData.insert(codepoint, d);
     }
 }
@@ -764,6 +878,8 @@ static void readArabicShaping()
 
         bool ok;
         int codepoint = shaping[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
+
         QChar::Joining j = QChar::OtherJoining;
         QByteArray shape = shaping[2].trimmed();
         if (shape == "R")
@@ -810,22 +926,17 @@ static void readDerivedAge()
 
         bool ok;
         int from = cl[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
         int to = from;
-        if (cl.size() == 2)
+        if (cl.size() == 2) {
             to = cl[1].toInt(&ok, 16);
-
-        QChar::UnicodeVersion age = QChar::Unicode_Unassigned;
-        QByteArray ba = l[1];
-        AgeMap *map = ageMap;
-        while (map->age) {
-            if (ba == map->age) {
-                age = map->version;
-                break;
-            }
-            ++map;
+            Q_ASSERT(ok);
         }
+
+        QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
         //qDebug() << hex << from << ".." << to << ba << age;
-        Q_ASSERT(age != QChar::Unicode_Unassigned);
+        if (age == QChar::Unicode_Unassigned)
+            qFatal("unassigned or unhandled age value: %s", l[1].constData());
 
         for (int codepoint = from; codepoint <= to; ++codepoint) {
             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
@@ -836,11 +947,11 @@ static void readDerivedAge()
 }
 
 
-static void readCompositionExclusion()
+static void readDerivedNormalizationProps()
 {
-    QFile f("data/CompositionExclusions.txt");
+    QFile f("data/DerivedNormalizationProps.txt");
     if (!f.exists())
-        qFatal("Couldn't find CompositionExclusions.txt");
+        qFatal("Couldn't find DerivedNormalizationProps.txt");
 
     f.open(QFile::ReadOnly);
 
@@ -853,42 +964,61 @@ static void readCompositionExclusion()
         int comment = line.indexOf('#');
         if (comment >= 0)
             line = line.left(comment);
-        line.replace(" ", "");
 
-        if (line.isEmpty())
+        if (line.trimmed().isEmpty())
             continue;
 
-        Q_ASSERT(!line.contains(".."));
+        QList<QByteArray> l = line.split(';');
+        Q_ASSERT(l.size() >= 2);
+
+        QByteArray propName = l[1].trimmed();
+        if (propName != "Full_Composition_Exclusion")
+            // ###
+            continue;
+
+        QByteArray codes = l[0].trimmed();
+        codes.replace("..", ".");
+        QList<QByteArray> cl = codes.split('.');
 
         bool ok;
-        int codepoint = line.toInt(&ok, 16);
+        int from = cl[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
+        int to = from;
+        if (cl.size() == 2) {
+            to = cl[1].toInt(&ok, 16);
+            Q_ASSERT(ok);
+        }
 
-        UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
-        d.excludedComposition = true;
-        unicodeData.insert(codepoint, d);
+        for (int codepoint = from; codepoint <= to; ++codepoint) {
+            UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
+            d.excludedComposition = true;
+            unicodeData.insert(codepoint, d);
+        }
     }
 
-    for (int i = 0; i < 0x110000; ++i) {
-        UnicodeData data = unicodeData.value(i, UnicodeData(i));
-        if (!data.excludedComposition
-            && data.decompositionType == QChar::Canonical
-            && data.decomposition.size() > 1) {
-            Q_ASSERT(data.decomposition.size() == 2);
-
-            uint part1 = data.decomposition.at(0);
-            uint part2 = data.decomposition.at(1);
-            UnicodeData first = unicodeData.value(part1, UnicodeData(part1));
-            if (first.p.combiningClass != 0)
-                continue;
+    for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) {
+        UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
+        if (!d.excludedComposition
+            && d.decompositionType == QChar::Canonical
+            && d.decomposition.size() > 1) {
+            Q_ASSERT(d.decomposition.size() == 2);
+
+            uint part1 = d.decomposition.at(0);
+            uint part2 = d.decomposition.at(1);
+
+            // all non-starters are listed in DerivedNormalizationProps.txt
+            // and already excluded from composition
+            Q_ASSERT(unicodeData.value(part1, UnicodeData(part1)).p.combiningClass == 0);
 
             ++numLigatures;
             highestLigature = qMax(highestLigature, (int)part1);
-            Ligature l = {(ushort)part1, (ushort)part2, i};
+            Ligature l = {(ushort)part1, (ushort)part2, codepoint};
             ligatureHashes[part2].append(l);
         }
     }
 }
 
+
 struct NormalizationCorrection {
     uint codepoint;
     uint mapped;
@@ -933,10 +1063,12 @@ static QByteArray createNormalizationCorrections()
         QList<QByteArray> fields = line.split(';');
         Q_ASSERT(fields.size() == 4);
 
-        NormalizationCorrection c;
+        NormalizationCorrection c = { 0, 0, 0 };
         bool ok;
         c.codepoint = fields.at(0).toInt(&ok, 16);
+        Q_ASSERT(ok);
         c.mapped = fields.at(1).toInt(&ok, 16);
+        Q_ASSERT(ok);
         if (fields.at(3) == "3.2.0")
             c.version = QChar::Unicode_3_2;
         else if (fields.at(3) == "4.0.0")
@@ -953,7 +1085,6 @@ static QByteArray createNormalizationCorrections()
 
            "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n";
 
-
     return out;
 }
 
@@ -961,7 +1092,7 @@ static QByteArray createNormalizationCorrections()
 static void computeUniqueProperties()
 {
     qDebug("computeUniqueProperties:");
-    for (int uc = 0; uc < 0x110000; ++uc) {
+    for (int uc = 0; uc <= LAST_CODEPOINT; ++uc) {
         UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
 
         int index = uniqueProperties.indexOf(d.p);
@@ -972,7 +1103,7 @@ static void computeUniqueProperties()
         d.propertyIndex = index;
         unicodeData.insert(uc, d);
     }
-    qDebug("    %d unicode properties found", uniqueProperties.size());
+    qDebug("    %d unique unicode properties found", uniqueProperties.size());
 }
 
 
@@ -1007,54 +1138,17 @@ static void readLineBreak()
 
         bool ok;
         int from = cl[0].toInt(&ok, 16);
+        Q_ASSERT(ok);
         int to = from;
-        if (cl.size() == 2)
+        if (cl.size() == 2) {
             to = cl[1].toInt(&ok, 16);
-
-        // ### Classes XX and AI are left out and mapped to AL for now
-        QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL;
-        QByteArray ba = l[1];
-
-        if (ba == "AI") lb = QUnicodeTables::LineBreak_AL;
-        else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL;
-        else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL;
-        else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP;
-        else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL;
-        else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU;
-        else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL;
-        else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS;
-        else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX;
-        else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY;
-        else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS;
-        else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR;
-        else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO;
-        else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU;
-        else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL;
-        else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID;
-        else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN;
-        else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY;
-        else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA;
-        else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB;
-        else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2;
-        else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW;
-        else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM;
-        else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA;
-        else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK;
-        else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR;
-        else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF;
-        else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG;
-        else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL;
-        else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP;
-        else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ;
-        else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2;
-        else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3;
-        else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL;
-        else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV;
-        else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT;
-        else {
-            qDebug() << "unhandled line break class:" << ba;
+            Q_ASSERT(ok);
         }
 
+        LineBreakClass lb = line_break_map.value(l[1].trimmed(), LineBreak_Unassigned);
+        if (lb == LineBreak_Unassigned)
+            qFatal("unassigned line break class: %s", l[1].constData());
+
         for (int codepoint = from; codepoint <= to; ++codepoint) {
             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
             d.p.line_break_class = lb;
@@ -1066,7 +1160,7 @@ static void readLineBreak()
 
 static void readSpecialCasing()
 {
-//     qDebug() << "Reading SpecialCasing.txt";
+    qDebug() << "Reading SpecialCasing.txt";
     QFile f("data/SpecialCasing.txt");
     if (!f.exists())
         qFatal("Couldn't find SpecialCasing.txt");
@@ -1114,8 +1208,6 @@ static void readSpecialCasing()
         for (int i = 0; i < title.size(); ++i) {
             bool ok;
             titleMap.append(title.at(i).toInt(&ok, 16));
-            if (!ok)
-                qDebug() << line << title.at(i);
             Q_ASSERT(ok);
         }
 
@@ -1151,7 +1243,7 @@ static void readSpecialCasing()
     }
 }
 
-int maxCaseFoldDiff = 0;
+static int maxCaseFoldDiff = 0;
 
 static void readCaseFolding()
 {
@@ -1178,7 +1270,7 @@ static void readCaseFolding()
         QList<QByteArray> l = line.split(';');
 
         bool ok;
-        uint codepoint = l[0].trimmed().toInt(&ok, 16);
+        int codepoint = l[0].trimmed().toInt(&ok, 16);
         Q_ASSERT(ok);
 
 
@@ -1198,8 +1290,10 @@ static void readCaseFolding()
 
         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
         if (foldMap.size() == 1) {
+            if (qAbs(foldMap.at(0) - codepoint) >= (1<<14))
+                qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << foldMap.at(0) << ")";
             ud.p.caseFoldDiff = foldMap.at(0) - codepoint;
-            maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff);
+            maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(ud.p.caseFoldDiff));
             if (codepoint > 0xffff) {
                 // if the condition below doesn't hold anymore we need to modify our case folding code
                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
@@ -1208,7 +1302,7 @@ static void readCaseFolding()
             if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff)
                 qDebug() << hex << codepoint;
         } else {
-            Q_ASSERT(false); // we currently don't support full case foldings
+            qFatal("we currently don't support full case foldings");
 //             qDebug() << "special" << hex << foldMap;
             ud.p.caseFoldSpecial = true;
             ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
@@ -1254,7 +1348,9 @@ static void readGraphemeBreak()
             Q_ASSERT(ok);
         }
 
-        GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreakOther);
+        GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreak_Unassigned);
+        if (brk == GraphemeBreak_Unassigned)
+            qFatal("unassigned grapheme break class: %s", l[1].constData());
 
         for (int codepoint = from; codepoint <= to; ++codepoint) {
             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
@@ -1301,8 +1397,9 @@ static void readWordBreak()
             Q_ASSERT(ok);
         }
 
-        WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreakOther);
-        Q_ASSERT(brk != WordBreakOther);
+        WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreak_Unassigned);
+        if (brk == WordBreak_Unassigned)
+            qFatal("unassigned word break class: %s", l[1].constData());
 
         for (int codepoint = from; codepoint <= to; ++codepoint) {
             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
@@ -1349,8 +1446,9 @@ static void readSentenceBreak()
             Q_ASSERT(ok);
         }
 
-        SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreakOther);
-        Q_ASSERT(brk != SentenceBreakOther);
+        SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreak_Unassigned);
+        if (brk == SentenceBreak_Unassigned)
+            qFatal("unassigned sentence break class: %s", l[1].constData());
 
         for (int codepoint = from; codepoint <= to; ++codepoint) {
             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
@@ -1644,6 +1742,7 @@ QByteArray createScriptEnumDeclaration()
         "Lao",
         "Malayalam",
         "Myanmar",
+        "Nko",
         "Ogham",
         "Oriya",
         "Runic",
@@ -1661,7 +1760,7 @@ QByteArray createScriptEnumDeclaration()
     // generate script enum
     QByteArray declaration;
 
-    declaration += "    // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n";
+    declaration += "    // See http://www.unicode.org/reports/tr24/tr24-5.html\n";
     declaration += "    enum Script {\n        Common";
 
     int uniqueScripts = 1; // Common
@@ -1671,31 +1770,35 @@ QByteArray createScriptEnumDeclaration()
         QByteArray scriptName = scriptNames.at(i);
         // does the script require special processing?
         bool special = false;
-        for (int s = 0; !special && s < specialScriptsCount; ++s) {
-            if (scriptName == specialScripts[s])
+        for (int s = 0; s < specialScriptsCount; ++s) {
+            if (scriptName == specialScripts[s]) {
                 special = true;
+                break;
+            }
         }
         if (!special) {
-            scriptHash[i] =  0; // alias for 'Common'
+            scriptHash[i] = 0; // alias for 'Common'
             continue;
         } else {
             ++uniqueScripts;
             scriptHash[i] = i;
         }
 
-        declaration += ",\n        ";
-        declaration += scriptName;
+        if (scriptName != "Inherited") {
+            declaration += ",\n        ";
+            declaration += scriptName;
+        }
     }
+    declaration += ",\n        Inherited";
     declaration += ",\n        ScriptCount = Inherited";
 
     // output the ones that are an alias for 'Common'
     for (int i = 1; i < scriptNames.size(); ++i) {
         if (scriptHash.value(i) != 0)
             continue;
-        QByteArray scriptName = scriptNames.at(i);
-        scriptName += " = Common";
         declaration += ",\n        ";
-        declaration += scriptName;
+        declaration += scriptNames.at(i);
+        declaration += " = Common";
     }
 
     declaration += "\n    };\n";
@@ -1831,14 +1934,15 @@ struct PropertyBlock {
     PropertyBlock() { index = -1; }
     int index;
     QList<int> properties;
-    bool operator ==(const PropertyBlock &other) { return properties == other.properties; }
+    bool operator==(const PropertyBlock &other) const // equality looks at properties only; index is not compared
+    { return properties == other.properties; }
 };
 
 static QByteArray createPropertyInfo()
 {
     qDebug("createPropertyInfo:");
 
-    const int BMP_BLOCKSIZE=32;
+    const int BMP_BLOCKSIZE = 32;
     const int BMP_SHIFT = 5;
     const int BMP_END = 0x11000;
     const int SMP_END = 0x110000;
@@ -1890,14 +1994,14 @@ static QByteArray createPropertyInfo()
     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
     int bmp_mem = bmp_block_data + bmp_trie;
-    qDebug("    %d unique blocks in BMP.",blocks.size());
+    qDebug("    %d unique blocks in BMP.", blocks.size());
     qDebug("        block data uses: %d bytes", bmp_block_data);
     qDebug("        trie data uses : %d bytes", bmp_trie);
 
-    int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
+    int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
     int smp_mem = smp_block_data + smp_trie;
-    qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
+    qDebug("    %d unique blocks in SMP.", blocks.size()-bmp_blocks);
     qDebug("        block data uses: %d bytes", smp_block_data);
     qDebug("        trie data uses : %d bytes", smp_trie);
 
@@ -1908,7 +2012,7 @@ static QByteArray createPropertyInfo()
     out += "static const unsigned short uc_property_trie[] = {\n";
 
     // first write the map
-    out += "    // 0x" + QByteArray::number(BMP_END, 16);
+    out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
         if (!(i % 8)) {
             if (out.endsWith(' '))
@@ -1977,7 +2081,7 @@ static QByteArray createPropertyInfo()
            "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
 
 
-           "static const QUnicodeTables::Properties uc_properties [] = {\n";
+           "static const QUnicodeTables::Properties uc_properties[] = {\n";
 
     // keep in sync with the property declaration
     for (int i = 0; i < uniqueProperties.size(); ++i) {
@@ -2036,7 +2140,7 @@ static QByteArray createPropertyInfo()
         out += QByteArray::number( p.wordBreak );
         out += ", ";
         out += QByteArray::number( p.sentenceBreak );
-        out += "},\n";
+        out += " },\n";
     }
     out += "};\n\n";
 
@@ -2064,20 +2168,18 @@ static QByteArray createPropertyInfo()
            "    return uc_properties + index;\n"
            "}\n\n";
 
-    out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n";
-
-    out += "static const ushort specialCaseMap [] = {";
+    out += "static const ushort specialCaseMap[] = {\n   ";
     for (int i = 0; i < specialCaseMap.size(); ++i) {
-        if (!(i % 16))
-            out += "\n   ";
         out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16);
         if (i < specialCaseMap.size() - 1)
             out += ",";
+        if (!specialCaseMap.at(i))
+            out += "\n   ";
     }
     out += "\n};\n";
     out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n";
 
-    qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes";
+    qDebug("Special case map uses : %d bytes", specialCaseMap.size()*2);
 
     return out;
 }
@@ -2088,14 +2190,14 @@ struct DecompositionBlock {
     int index;
     QList<int> decompositionPositions;
     bool operator ==(const DecompositionBlock &other)
-        { return decompositionPositions == other.decompositionPositions; }
+    { return decompositionPositions == other.decompositionPositions; }
 };
 
 static QByteArray createCompositionInfo()
 {
     qDebug("createCompositionInfo:");
 
-    const int BMP_BLOCKSIZE=16;
+    const int BMP_BLOCKSIZE = 16;
     const int BMP_SHIFT = 4;
     const int BMP_END = 0x3400; // start of Han
     const int SMP_END = 0x30000;
@@ -2120,15 +2222,14 @@ static QByteArray createCompositionInfo()
             if (!d.decomposition.isEmpty()) {
                 int utf16Chars = 0;
                 for (int j = 0; j < d.decomposition.size(); ++j)
-                    utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
+                    utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1;
                 decompositions.append(d.decompositionType + (utf16Chars<<8));
                 for (int j = 0; j < d.decomposition.size(); ++j) {
                     int code = d.decomposition.at(j);
-                    if (code > 0x10000) {
+                    if (code >= 0x10000) {
                         // save as surrogate pair
-                        code -= 0x10000;
-                        ushort high = code/0x400 + 0xd800;
-                        ushort low = code%0x400 + 0xdc00;
+                        ushort high = QChar::highSurrogate(code);
+                        ushort low = QChar::lowSurrogate(code);
                         decompositions.append(high);
                         decompositions.append(low);
                     } else {
@@ -2162,15 +2263,14 @@ static QByteArray createCompositionInfo()
             if (!d.decomposition.isEmpty()) {
                 int utf16Chars = 0;
                 for (int j = 0; j < d.decomposition.size(); ++j)
-                    utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
+                    utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1;
                 decompositions.append(d.decompositionType + (utf16Chars<<8));
                 for (int j = 0; j < d.decomposition.size(); ++j) {
                     int code = d.decomposition.at(j);
-                    if (code > 0x10000) {
+                    if (code >= 0x10000) {
                         // save as surrogate pair
-                        code -= 0x10000;
-                        ushort high = code/0x400 + 0xd800;
-                        ushort low = code%0x400 + 0xdc00;
+                        ushort high = QChar::highSurrogate(code);
+                        ushort low = QChar::lowSurrogate(code);
                         decompositions.append(high);
                         decompositions.append(low);
                     } else {
@@ -2196,15 +2296,15 @@ static QByteArray createCompositionInfo()
     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
     int bmp_mem = bmp_block_data + bmp_trie;
-    qDebug("    %d unique blocks in BMP.",blocks.size());
+    qDebug("    %d unique blocks in BMP.", blocks.size());
     qDebug("        block data uses: %d bytes", bmp_block_data);
     qDebug("        trie data uses : %d bytes", bmp_trie);
     qDebug("        memory usage: %d bytes", bmp_mem);
 
-    int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
+    int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
     int smp_mem = smp_block_data + smp_trie;
-    qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
+    qDebug("    %d unique blocks in SMP.", blocks.size()-bmp_blocks);
     qDebug("        block data uses: %d bytes", smp_block_data);
     qDebug("        trie data uses : %d bytes", smp_trie);
 
@@ -2347,7 +2447,7 @@ static QByteArray createLigatureInfo()
     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
     int bmp_mem = bmp_block_data + bmp_trie;
-    qDebug("    %d unique blocks in BMP.",blocks.size());
+    qDebug("    %d unique blocks in BMP.", blocks.size());
     qDebug("        block data uses: %d bytes", bmp_block_data);
     qDebug("        trie data uses : %d bytes", bmp_trie);
     qDebug("        ligature data uses : %d bytes", ligatures.size()*2);
@@ -2399,7 +2499,7 @@ static QByteArray createLigatureInfo()
            "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) +
            "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n"
 
-           "static const unsigned short uc_ligature_map [] = {\n";
+           "static const unsigned short uc_ligature_map[] = {\n";
 
     for (int i = 0; i < ligatures.size(); ++i) {
         if (!(i % 8)) {
@@ -2433,19 +2533,20 @@ QByteArray createCasingInfo()
 
 int main(int, char **)
 {
+    initAgeMap();
     initCategoryMap();
     initDirectionMap();
     initDecompositionMap();
     initGraphemeBreak();
     initWordBreak();
     initSentenceBreak();
-    
+    initLineBreak();
+
     readUnicodeData();
     readBidiMirroring();
     readArabicShaping();
     readDerivedAge();
-    readCompositionExclusion();
-    readLineBreak();
+    readDerivedNormalizationProps();
     readSpecialCasing();
     readCaseFolding();
     // readBlocks();
@@ -2453,6 +2554,7 @@ int main(int, char **)
     readGraphemeBreak();
     readWordBreak();
     readSentenceBreak();
+    readLineBreak();
 
     computeUniqueProperties();
     QByteArray properties = createPropertyInfo();
@@ -2462,9 +2564,6 @@ int main(int, char **)
     QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
     QByteArray scriptTableDeclaration = createScriptTableDeclaration();
 
-    QFile f("../../src/corelib/tools/qunicodetables.cpp");
-    f.open(QFile::WriteOnly|QFile::Truncate);
-
     QByteArray header =
         "/****************************************************************************\n"
         "**\n"
@@ -2505,9 +2604,10 @@ int main(int, char **)
         "**\n"
         "** $QT_END_LICENSE$\n"
         "**\n"
-        "****************************************************************************/\n\n"
+        "****************************************************************************/\n\n";
 
-        "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n";
+    QByteArray note =
+        "/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n";
 
     QByteArray warning =
         "//\n"
@@ -2521,41 +2621,47 @@ int main(int, char **)
         "// We mean it.\n"
         "//\n\n";
 
+    QFile f("../../src/corelib/tools/qunicodetables.cpp");
+    f.open(QFile::WriteOnly|QFile::Truncate);
     f.write(header);
+    f.write(note);
     f.write("QT_BEGIN_NAMESPACE\n\n");
     f.write(properties);
     f.write(compositions);
     f.write(ligatures);
     f.write(normalizationCorrections);
     f.write(scriptTableDeclaration);
-    f.write("\nQT_END_NAMESPACE\n");
+    f.write("QT_END_NAMESPACE\n");
     f.close();
 
     f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
     f.open(QFile::WriteOnly | QFile::Truncate);
     f.write(header);
+    f.write(note);
     f.write(warning);
     f.write("#ifndef QUNICODETABLES_P_H\n"
             "#define QUNICODETABLES_P_H\n\n"
             "#include <QtCore/qchar.h>\n\n"
             "QT_BEGIN_NAMESPACE\n\n");
-    f.write("namespace QUnicodeTables {\n");
+    f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n");
+    f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n");
+    f.write("namespace QUnicodeTables {\n\n");
     f.write(property_string);
     f.write("\n");
     f.write(scriptEnumDeclaration);
     f.write("\n");
     f.write(lineBreakClass);
     f.write("\n");
-    f.write(methods);
-    f.write("\n");
     f.write(grapheme_break_string);
     f.write("\n");
     f.write(word_break_string);
     f.write("\n");
     f.write(sentence_break_string);
-    f.write("\n}\n\n"
+    f.write("\n");
+    f.write(methods);
+    f.write("} // namespace QUnicodeTables\n\n"
             "QT_END_NAMESPACE\n\n"
-            "#endif\n");
+            "#endif // QUNICODETABLES_P_H\n");
     f.close();
 
     qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
@@ -2578,7 +2684,7 @@ int main(int, char **)
         sum += decompositionLength.value(i, 0);
     }
     qDebug("    len decomposition map %d, average length %f, num composed chars %d",
-           totalcompositions, (float)totalcompositions/(float)sum,  sum);
+           totalcompositions, (float)totalcompositions/(float)sum, sum);
     qDebug("highest composed character %x", highestComposedCharacter);
     qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
 
@@ -2599,4 +2705,3 @@ int main(int, char **)
 
 #endif
 }
-
diff --git a/util/unicode/unicode.pro b/util/unicode/unicode.pro
index a53f70a..0250c2a 100644
--- a/util/unicode/unicode.pro
+++ b/util/unicode/unicode.pro
@@ -1,2 +1,3 @@
 SOURCES += main.cpp
 QT = core
+CONFIG += console