diff options
author | Hye-Shik Chang <hyeshik@gmail.com> | 2004-06-02 16:49:17 (GMT) |
---|---|---|
committer | Hye-Shik Chang <hyeshik@gmail.com> | 2004-06-02 16:49:17 (GMT) |
commit | 974ed7cfa50b666c9ab91f7a3f8f26049d387107 (patch) | |
tree | d821c74c26231d988f34764d0fdfe3494036ee95 /Tools | |
parent | b6568b91fdf7de1377dba395c6725a7307b818ee (diff) | |
download | cpython-974ed7cfa50b666c9ab91f7a3f8f26049d387107.zip cpython-974ed7cfa50b666c9ab91f7a3f8f26049d387107.tar.gz cpython-974ed7cfa50b666c9ab91f7a3f8f26049d387107.tar.bz2 |
- SF #962502: Add two more methods for unicode type; width() and
iswide() for east asian width manipulation. (Inspired by David
Goodger, Reviewed by Martin v. Loewis)
- Move _PyUnicode_TypeRecord.flags to the end of the struct so that
no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 33 |
1 files changed, 29 insertions, 4 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index c948312..6c29fd1 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -18,6 +18,7 @@ # 2002-10-22 mvl generate NFC tables # 2002-11-24 mvl expand all ranges, sort names version-independently # 2002-11-25 mvl add UNIDATA_VERSION +# 2004-05-29 perky add east asian width information # # written by Fredrik Lundh (fredrik@pythonware.com) # @@ -25,12 +26,13 @@ import sys SCRIPT = sys.argv[0] -VERSION = "2.2" +VERSION = "2.3" # The Unicode Database UNIDATA_VERSION = "3.2.0" UNICODE_DATA = "UnicodeData.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt" +EASTASIAN_WIDTH = "EastAsianWidth.txt" CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", @@ -50,12 +52,14 @@ LINEBREAK_MASK = 0x10 SPACE_MASK = 0x20 TITLE_MASK = 0x40 UPPER_MASK = 0x80 +WIDE_MASK = 0x100 def maketables(trace=0): print "--- Reading", UNICODE_DATA, "..." - unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS) + unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS, + EASTASIAN_WIDTH) print len(filter(None, unicode.table)), "characters" @@ -330,8 +334,10 @@ def makeunicodetype(unicode, trace): if record[7]: flags |= DIGIT_MASK digit = int(record[7]) + if record[15] in ('W', 'F'): # Wide or Full width + flags |= WIDE_MASK item = ( - flags, upper, lower, title, decimal, digit + upper, lower, title, decimal, digit, flags ) # add entry to index and item tables i = cache.get(item) @@ -538,7 +544,7 @@ import sys class UnicodeData: - def __init__(self, filename, exclusions, expand=1): + def __init__(self, filename, exclusions, eastasianwidth, expand=1): file = open(filename) table = [None] * 0x110000 while 1: @@ -581,6 +587,25 @@ class UnicodeData: char = int(s.split()[0],16) self.exclusions[char] = 1 + widths = [None] * 0x110000 + for s in open(eastasianwidth): + s = s.strip() + if not s: + continue + if s[0] == '#': + continue + s = s.split()[0].split(';') + if '..' in s[0]: + first, last = [int(c, 16) for c in s[0].split('..')] + chars = range(first, last+1) + else: + chars = [int(s[0], 16)] + for char in chars: + widths[char] = s[1] + for i in range(0, 0x110000): + if table[i] is not None: + table[i].append(widths[i]) + def uselatin1(self): # restrict character range to ISO Latin 1 self.chars = range(256) |