diff options
author | Florent Xicluna <florent.xicluna@gmail.com> | 2010-03-30 19:34:18 (GMT) |
---|---|---|
committer | Florent Xicluna <florent.xicluna@gmail.com> | 2010-03-30 19:34:18 (GMT) |
commit | 806d8cf0e8056726580e210e1dea960d6e77c910 (patch) | |
tree | ed95ffd06d353ecdffdbdacba271d5dda71f80aa /Tools | |
parent | 364129ef5a806bf919b5d321206cc1b72aed7272 (diff) | |
download | cpython-806d8cf0e8056726580e210e1dea960d6e77c910.zip cpython-806d8cf0e8056726580e210e1dea960d6e77c910.tar.gz cpython-806d8cf0e8056726580e210e1dea960d6e77c910.tar.bz2 |
Merged revisions 79494,79496 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (mar, 30 mar 2010) | 2 lines
#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.
........
r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (mar, 30 mar 2010) | 2 lines
Highlight the change of behavior related to r79494. Now VT and FF are linebreaks.
........
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/unicode/makeunicodedata.py | 29 |
1 file changed, 24 insertions, 5 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 4eda1b9..f38b866 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -38,6 +38,7 @@ EASTASIAN_WIDTH = "EastAsianWidth%s.txt" UNIHAN = "Unihan%s.txt" DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" +LINE_BREAK = "LineBreak%s.txt" old_versions = ["3.2.0"] @@ -52,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] +MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] + # note: should match definitions in Objects/unicodectype.c ALPHA_MASK = 0x01 DECIMAL_MASK = 0x02 @@ -77,7 +80,8 @@ def maketables(trace=0): EASTASIAN_WIDTH % version, UNIHAN % version, DERIVED_CORE_PROPERTIES % version, - DERIVEDNORMALIZATION_PROPS % version) + DERIVEDNORMALIZATION_PROPS % version, + LINE_BREAK % version) print(len(list(filter(None, unicode.table))), "characters") @@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace): flags |= ALPHA_MASK if category == "Ll": flags |= LOWER_MASK - if category == "Zl" or bidirectional == "B": + if 'Line_Break' in properties or bidirectional == "B": flags |= LINEBREAK_MASK linebreaks.append(char) if category == "Zs" or bidirectional in ("WS", "B", "S"): @@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace): print(file=fp) # Generate code for _PyUnicode_IsLinebreak() - print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp) - print(" * 'Zp' or type 'B', 0 otherwise.", file=fp) + print("/* Returns 1 for Unicode characters having the line break", file=fp) + print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp) + print(" * type 'B', 0 otherwise.", file=fp) print(" */", file=fp) print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp) print('{', file=fp) @@ -826,7 +831,8 @@ class UnicodeData: # derived-props] (17) 
def __init__(self, filename, exclusions, eastasianwidth, unihan, - derivedprops, derivednormalizationprops=None, expand=1): + derivedprops, derivednormalizationprops=None, linebreakprops=None, + expand=1): self.changed = [] file = open(filename) table = [None] * 0x110000 @@ -912,6 +918,19 @@ class UnicodeData: # apply to unassigned code points; ignore them table[char][-1].add(p) + if linebreakprops: + for s in open(linebreakprops): + s = s.partition('#')[0] + s = [i.strip() for i in s.split(';')] + if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: + continue + if '..' not in s[0]: + first = last = int(s[0], 16) + else: + first, last = [int(c, 16) for c in s[0].split('..')] + for char in range(first, last+1): + table[char][-1].add('Line_Break') + if derivednormalizationprops: quickchecks = [0] * 0x110000 # default is Yes qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() |