author     Amaury Forgeot d'Arc <amauryfa@gmail.com>   2009-10-06 19:56:32 (GMT)
committer  Amaury Forgeot d'Arc <amauryfa@gmail.com>   2009-10-06 19:56:32 (GMT)
commit     d0052d17b1a067e4aa8a69f5564a4b94e0c00502 (patch)
tree       c80b69d55175b2f9d9090262963fc1989358b963 /Tools/unicode
parent     85ea4bf781203d8b4fd2873791d0a7a26e103652 (diff)
#1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
_PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace.
It now also parses Unihan.txt for numeric values.
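
Note: the intended effect is that characters whose numeric value is defined only in Unihan.txt (kAccountingNumeric, kPrimaryNumeric, kOtherNumeric) gain a numeric value at the C level. A minimal sketch of the expected behaviour, assuming the generated _PyUnicode_ToNumeric() ends up backing unicodedata.numeric(); the C-side wiring is not part of the diff shown here, which is limited to Tools/unicode:

    import unicodedata
    # U+4E09 is the Han numeral "three"; its value comes from the
    # kPrimaryNumeric field of Unihan.txt, not from UnicodeData.txt.
    print unicodedata.numeric(u'\u4e09')     # expected: 3.0
    # Characters with no numeric property still fall back to the default.
    print unicodedata.numeric(u'a', None)    # expected: None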
Diffstat (limited to 'Tools/unicode')
-rw-r--r--  Tools/unicode/makeunicodedata.py  131
1 file changed, 123 insertions(+), 8 deletions(-)
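
Note: Unihan.txt stores one property per line in the form "U+XXXX<tab>kTag<tab>value". A small standalone sketch of the parsing step the patch adds to UnicodeData.__init__ (the sample line is made-up input in that format, not taken from the real file):

    # Sketch of the Unihan numeric parsing added in the diff below.
    line = "U+4E09\tkPrimaryNumeric\t3\n"        # hypothetical sample line
    if line.startswith('U+'):
        code, tag, value = line.split(None, 3)[:3]
        if tag in ('kAccountingNumeric', 'kPrimaryNumeric', 'kOtherNumeric'):
            value = value.strip().replace(',', '')   # e.g. "1,000,000" -> "1000000"
            i = int(code[2:], 16)                    # "U+4E09" -> 0x4E09
            # the real code then patches field 8 of the record: table[i][8] = value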
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index e3842e5..92268ad 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+UNIHAN = "Unihan%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
 old_versions = ["3.2.0"]
@@ -59,6 +60,7 @@ SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 NODELTA_MASK = 0x100
+NUMERIC_MASK = 0x200
 
 def maketables(trace=0):
 
@@ -68,6 +70,7 @@ def maketables(trace=0):
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
                           EASTASIAN_WIDTH % version,
+                          UNIHAN % version,
                           DERIVEDNORMALIZATION_PROPS % version)
 
     print len(filter(None, unicode.table)), "characters"
@@ -76,7 +79,8 @@ def maketables(trace=0):
         print "--- Reading", UNICODE_DATA % ("-"+version), "..."
         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                   COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version))
+                                  EASTASIAN_WIDTH % ("-"+version),
+                                  UNIHAN % ("-"+version))
         print len(filter(None, old_unicode.table)), "characters"
         merge_old_version(version, unicode, old_unicode)
 
@@ -352,6 +356,9 @@ def makeunicodetype(unicode, trace):
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
+    numeric = {}
+    spaces = []
+    linebreaks = []
 
     for char in unicode.chars:
         record = unicode.table[char]
@@ -367,8 +374,10 @@ def makeunicodetype(unicode, trace):
                 flags |= LOWER_MASK
             if category == "Zl" or bidirectional == "B":
                 flags |= LINEBREAK_MASK
+                linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
                 flags |= SPACE_MASK
+                spaces.append(char)
             if category == "Lt":
                 flags |= TITLE_MASK
             if category == "Lu":
@@ -411,6 +420,9 @@ def makeunicodetype(unicode, trace):
             if record[7]:
                 flags |= DIGIT_MASK
                 digit = int(record[7])
+            if record[8]:
+                flags |= NUMERIC_MASK
+                numeric.setdefault(record[8], []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )
@@ -422,6 +434,9 @@ def makeunicodetype(unicode, trace):
             index[char] = i
 
     print len(table), "unique character type entries"
+    print sum(map(len, numeric.values())), "numeric code points"
+    print len(spaces), "whitespace code points"
+    print len(linebreaks), "linebreak code points"
 
     print "--- Writing", FILE, "..."
 
@@ -443,6 +458,97 @@ def makeunicodetype(unicode, trace):
     Array("index1", index1).dump(fp, trace)
     Array("index2", index2).dump(fp, trace)
 
+    # Generate code for _PyUnicode_ToNumeric()
+    numeric_items = numeric.items()
+    numeric_items.sort()
+    print >>fp, '/* Returns the numeric value as double for Unicode characters'
+    print >>fp, ' * having this property, -1.0 otherwise.'
+    print >>fp, ' */'
+    print >>fp, 'double _PyUnicode_ToNumeric(Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    for value, codepoints in numeric_items:
+        haswide = False
+        hasnonewide = False
+        codepoints.sort()
+        for codepoint in codepoints:
+            if codepoint < 0x10000:
+                hasnonewide = True
+            if codepoint >= 0x10000 and not haswide:
+                print >>fp, '#ifdef Py_UNICODE_WIDE'
+                haswide = True
+            print >>fp, '    case 0x%04X:' % (codepoint,)
+        if haswide and hasnonewide:
+            print >>fp, '#endif'
+        print >>fp, '        return (double) %s;' % (value,)
+        if haswide and not hasnonewide:
+            print >>fp, '#endif'
+    print >>fp,'    }'
+    print >>fp,'    return -1.0;'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsWhitespace()
+    print >>fp, "/* Returns 1 for Unicode characters having the bidirectional"
+    print >>fp, " * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '#ifdef WANT_WCTYPE_FUNCTIONS'
+    print >>fp, '    return iswspace(ch);'
+    print >>fp, '#else'
+    print >>fp, '    switch (ch) {'
+
+    haswide = False
+    hasnonewide = False
+    spaces.sort()
+    for codepoint in spaces:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp, '#endif'
+    print >>fp,'}'
+    print >>fp
+
+    # Generate code for _PyUnicode_IsLinebreak()
+    print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
+    print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+    print >>fp, " */"
+    print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
+    print >>fp, '{'
+    print >>fp, '    switch (ch) {'
+    haswide = False
+    hasnonewide = False
+    linebreaks.sort()
+    for codepoint in linebreaks:
+        if codepoint < 0x10000:
+            hasnonewide = True
+        if codepoint >= 0x10000 and not haswide:
+            print >>fp, '#ifdef Py_UNICODE_WIDE'
+            haswide = True
+        print >>fp, '    case 0x%04X:' % (codepoint,)
+    if haswide and hasnonewide:
+        print >>fp, '#endif'
+    print >>fp, '        return 1;'
+    if haswide and not hasnonewide:
+        print >>fp, '#endif'
+
+    print >>fp,'    }'
+    print >>fp,'    return 0;'
+    print >>fp,'}'
+    print >>fp
+
     fp.close()
 
 # --------------------------------------------------------------------
@@ -660,12 +766,11 @@ def merge_old_version(version, new, old):
                     elif k == 8:
                         # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                         # Since 0 encodes "no change", the old value is better not 0
-                        assert value != "0" and value != "-1"
                         if not value:
                             numeric_changes[i] = -1
                         else:
-                            assert re.match("^[0-9]+$", value)
-                            numeric_changes[i] = int(value)
+                            numeric_changes[i] = float(value)
+                            assert numeric_changes[i] not in (0, -1)
                     elif k == 9:
                         if value == 'Y':
                             mirrored_changes[i] = '1'
@@ -698,11 +803,9 @@ def merge_old_version(version, new, old):
 
 # load a unicode-data file from disk
 
-import sys
-
 class UnicodeData:
 
-    def __init__(self, filename, exclusions, eastasianwidth,
+    def __init__(self, filename, exclusions, eastasianwidth, unihan,
                  derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
@@ -789,6 +892,19 @@ class UnicodeData:
                 if table[i] is not None:
                     table[i].append(quickchecks[i])
 
+        for line in open(unihan):
+            if not line.startswith('U+'):
+                continue
+            code, tag, value = line.split(None, 3)[:3]
+            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
+                           'kOtherNumeric'):
+                continue
+            value = value.strip().replace(',', '')
+            i = int(code[2:], 16)
+            # Patch the numeric field
+            if table[i] is not None:
+                table[i][8] = value
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)
@@ -938,7 +1054,6 @@ def splitbins(t, trace=0):
     you'll get.
     """
 
-    import sys
     if trace:
         def dump(t1, t2, shift, bytes):
             print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (