Patch #626485: Support Unicode normalization.

author: Martin v. Löwis <martin@v.loewis.de> 2002-11-23 22:08:15 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2002-11-23 22:08:15 (GMT)
commit: 677bde2dd14ac2c8f170779adcc732f991db8bd6 (patch)
tree: daaeacd804a9e45a96c7819ece9d78d73a690439 /Tools/unicode
parent: 74a530d42dcd0d33587aed66d600a6687ce30cbd (diff)
download: cpython-677bde2dd14ac2c8f170779adcc732f991db8bd6.zip
cpython-677bde2dd14ac2c8f170779adcc732f991db8bd6.tar.gz
cpython-677bde2dd14ac2c8f170779adcc732f991db8bd6.tar.bz2
1 file changed, 90 insertions, 3 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 3b2fd11..42cbcf1 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -13,6 +13,9 @@
 # 2000-11-03 fl   expand first/last ranges
 # 2001-01-19 fl   added character name tables (2.1)
 # 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
+# 2002-09-11 wd   use string methods
+# 2002-10-18 mvl  update to Unicode 3.2
+# 2002-10-22 mvl  generate NFC tables
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@@ -22,7 +25,8 @@ import sys
 SCRIPT = sys.argv[0]
 VERSION = "2.1"
 
-UNICODE_DATA = "UnicodeData-Latest.txt"
+UNICODE_DATA = "UnicodeData.txt"
+COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
 
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -47,7 +51,7 @@ def maketables(trace=0):
 
     print "--- Reading", UNICODE_DATA, "..."
 
-    unicode = UnicodeData(UNICODE_DATA)
+    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)
 
     print len(filter(None, unicode.table)), "characters"
 
@@ -96,6 +100,10 @@ def makeunicodedata(unicode, trace):
     decomp_index = [0] * len(unicode.chars)
     decomp_size = 0
 
+    comp_pairs = []
+    comp_first = [None] * len(unicode.chars)
+    comp_last = [None] * len(unicode.chars)
+
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -116,6 +124,14 @@ def makeunicodedata(unicode, trace):
                 # content
                 decomp = [prefix + (len(decomp)<<8)] +\
                          map(lambda s: int(s, 16), decomp)
+                # Collect NFC pairs
+                if not prefix and len(decomp) == 3 and \
+                   char not in unicode.exclusions and \
+                   unicode.table[decomp[1]][3] == "0":
+                    p, l, r = decomp
+                    comp_first[l] = 1
+                    comp_last[r] = 1
+                    comp_pairs.append((l,r,char))
                 try:
                     i = decomp_data.index(decomp)
                 except ValueError:
@@ -126,10 +142,49 @@ def makeunicodedata(unicode, trace):
                 i = 0
             decomp_index[char] = i
 
+    f = l = 0
+    comp_first_ranges = []
+    comp_last_ranges = []
+    prev_f = prev_l = None
+    for i in unicode.chars:
+        if comp_first[i] is not None:
+            comp_first[i] = f
+            f += 1
+            if prev_f is None:
+                prev_f = (i,i)
+            elif prev_f[1]+1 == i:
+                prev_f = prev_f[0],i
+            else:
+                comp_first_ranges.append(prev_f)
+                prev_f = (i,i)
+        if comp_last[i] is not None:
+            comp_last[i] = l
+            l += 1
+            if prev_l is None:
+                prev_l = (i,i)
+            elif prev_l[1]+1 == i:
+                prev_l = prev_l[0],i
+            else:
+                comp_last_ranges.append(prev_l)
+                prev_l = (i,i)
+    comp_first_ranges.append(prev_f)
+    comp_last_ranges.append(prev_l)
+    total_first = f
+    total_last = l
+
+    comp_data = [0]*(total_first*total_last)
+    for f,l,char in comp_pairs:
+        f = comp_first[f]
+        l = comp_last[l]
+        comp_data[f*total_last+l] = char
+
     print len(table), "unique properties"
     print len(decomp_prefix), "unique decomposition prefixes"
     print len(decomp_data), "unique decomposition entries:",
     print decomp_size, "bytes"
+    print total_first, "first characters in NFC"
+    print total_last, "last characters in NFC"
+    print len(comp_pairs), "NFC pairs"
 
     print "--- Writing", FILE, "..."
 
@@ -144,6 +199,21 @@ def makeunicodedata(unicode, trace):
     print >>fp, "};"
     print >>fp
 
+    print >>fp, "/* Reindexing of NFC first characters. */"
+    print >>fp, "#define TOTAL_FIRST",total_first
+    print >>fp, "#define TOTAL_LAST",total_last
+    print >>fp, "struct reindex{int start;short count,index;};"
+    print >>fp, "struct reindex nfc_first[] = {"
+    for start,end in comp_first_ranges:
+        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
+    print >>fp,"  {0,0,0}"
+    print >>fp,"};\n"
+    print >>fp, "struct reindex nfc_last[] = {"
+    for start,end in comp_last_ranges:
+        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
+    print >>fp,"  {0,0,0}"
+    print >>fp,"};\n"
+
     # FIXME: <fl> the following tables could be made static, and
     # the support code moved into unicodedatabase.c
 
@@ -185,6 +255,12 @@ def makeunicodedata(unicode, trace):
     Array("decomp_index1", index1).dump(fp, trace)
     Array("decomp_index2", index2).dump(fp, trace)
 
+    index, index2, shift = splitbins(comp_data, trace)
+    print >>fp, "/* NFC pairs */"
+    print >>fp, "#define COMP_SHIFT", shift
+    Array("comp_index", index).dump(fp, trace)
+    Array("comp_data", index2).dump(fp, trace)
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -454,7 +530,7 @@ import sys
 
 class UnicodeData:
 
-    def __init__(self, filename, expand=1):
+    def __init__(self, filename, exclusions, expand=1):
         file = open(filename)
         table = [None] * 0x110000
         while 1:
@@ -486,6 +562,17 @@ class UnicodeData:
         self.table = table
         self.chars = range(0x110000) # unicode 3.2
 
+        file = open(exclusions)
+        self.exclusions = {}
+        for s in file:
+            s = s.strip()
+            if not s:
+                continue
+            if s[0] == '#':
+                continue
+            char = int(s.split()[0],16)
+            self.exclusions[char] = 1
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)
author	Martin v. Löwis <martin@v.loewis.de>	2002-11-23 22:08:15 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2002-11-23 22:08:15 (GMT)
commit	677bde2dd14ac2c8f170779adcc732f991db8bd6 (patch)
tree	daaeacd804a9e45a96c7819ece9d78d73a690439 /Tools/unicode
parent	74a530d42dcd0d33587aed66d600a6687ce30cbd (diff)
download	cpython-677bde2dd14ac2c8f170779adcc732f991db8bd6.zip cpython-677bde2dd14ac2c8f170779adcc732f991db8bd6.tar.gz cpython-677bde2dd14ac2c8f170779adcc732f991db8bd6.tar.bz2