From 806d8cf0e8056726580e210e1dea960d6e77c910 Mon Sep 17 00:00:00 2001 From: Florent Xicluna Date: Tue, 30 Mar 2010 19:34:18 +0000 Subject: Merged revisions 79494,79496 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (mar, 30 mar 2010) | 2 lines #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14. ........ r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (mar, 30 mar 2010) | 2 lines Highlight the change of behavior related to r79494. Now VT and FF are linebreaks. ........ --- Lib/test/test_unicodedata.py | 13 ++++++++++++- Misc/NEWS | 5 +++++ Objects/unicodeobject.c | 8 +++++--- Objects/unicodetype_db.h | 9 ++++++--- Tools/unicode/makeunicodedata.py | 29 ++++++++++++++++++++++++----- 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 9777f73..59e6d39 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -25,7 +25,7 @@ class UnicodeMethodsTest(unittest.TestCase): def test_method_checksum(self): h = hashlib.sha1() - for i in range(65536): + for i in range(0x10000): char = chr(i) data = [ # Predicates (single char) @@ -284,6 +284,17 @@ class UnicodeMiscTest(UnicodeDatabaseTest): self.assertEqual("\u01c5".title(), "\u01c5") self.assertEqual("\u01c6".title(), "\u01c5") + def test_linebreak_7643(self): + for i in range(0x10000): + lines = (chr(i) + 'A').splitlines() + if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85, + 0x1c, 0x1d, 0x1e, 0x2028, 0x2029): + self.assertEqual(len(lines), 2, + r"\u%.4x should be a linebreak" % i) + else: + self.assertEqual(len(lines), 1, + r"\u%.4x should not be a linebreak" % i) + def test_main(): test.support.run_unittest( UnicodeMiscTest, diff --git a/Misc/NEWS b/Misc/NEWS index d6a0292..4da6c23 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -293,6 +293,11 @@ C-API Library ------- +- Backwards incompatible change: Unicode codepoints line tabulation (0x0B) and + form feed (0x0C) are now considered linebreaks, as specified in Unicode + Standard Annex #14. See issue #7643. + http://www.unicode.org/reports/tr14/ + - Comparisons using one of <, <=, >, >= between a complex instance and a Fractions instance now raise TypeError instead of returning True/False. This makes Fraction <=> complex comparisons consistent with diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 28b8c66..a409b22 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -126,9 +126,9 @@ static const char unicode_default_encoding[] = "utf-8"; /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, -/* case 0x0009: * HORIZONTAL TABULATION */ +/* case 0x0009: * CHARACTER TABULATION */ /* case 0x000A: * LINE FEED */ -/* case 0x000B: * VERTICAL TABULATION */ +/* case 0x000B: * LINE TABULATION */ /* case 0x000C: * FORM FEED */ /* case 0x000D: * CARRIAGE RETURN */ 0, 1, 1, 1, 1, 1, 0, 0, @@ -163,8 +163,10 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors, static unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0x000A, * LINE FEED */ +/* 0x000B, * LINE TABULATION */ +/* 0x000C, * FORM FEED */ /* 0x000D, * CARRIAGE RETURN */ - 0, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x001C, * FILE SEPARATOR */ /* 0x001D, * GROUP SEPARATOR */ diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 8c8955c..424a317 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -694,7 +694,7 @@ static unsigned char index1[] = { }; static unsigned char index2[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 5, 5, 5, 5, 5, 5, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, @@ -3395,13 +3395,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch) #endif } -/* Returns 1 for Unicode characters having the category 'Zl', - * 'Zp' or type 'B', 0 otherwise. +/* Returns 1 for Unicode characters having the line break + * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional + * type 'B', 0 otherwise. */ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) { switch (ch) { case 0x000A: + case 0x000B: + case 0x000C: case 0x000D: case 0x001C: case 0x001D: diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 4eda1b9..f38b866 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -38,6 +38,7 @@ EASTASIAN_WIDTH = "EastAsianWidth%s.txt" UNIHAN = "Unihan%s.txt" DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" +LINE_BREAK = "LineBreak%s.txt" old_versions = ["3.2.0"] @@ -52,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] +MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] + # note: should match definitions in Objects/unicodectype.c ALPHA_MASK = 0x01 DECIMAL_MASK = 0x02 @@ -77,7 +80,8 @@ def maketables(trace=0): EASTASIAN_WIDTH % version, UNIHAN % version, DERIVED_CORE_PROPERTIES % version, - DERIVEDNORMALIZATION_PROPS % version) + DERIVEDNORMALIZATION_PROPS % version, + LINE_BREAK % version) print(len(list(filter(None, unicode.table))), "characters") @@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace): flags |= ALPHA_MASK if category == "Ll": flags |= LOWER_MASK - if category == "Zl" or bidirectional == "B": + if 'Line_Break' in properties or bidirectional == "B": flags |= LINEBREAK_MASK linebreaks.append(char) if category == "Zs" or bidirectional in ("WS", "B", "S"): @@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace): print(file=fp) # Generate code for _PyUnicode_IsLinebreak() - print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp) - print(" * 'Zp' or type 'B', 0 otherwise.", file=fp) + print("/* Returns 1 for Unicode characters having the line break", file=fp) + print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp) + print(" * type 'B', 0 otherwise.", file=fp) print(" */", file=fp) print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp) print('{', file=fp) @@ -826,7 +831,8 @@ class UnicodeData: # derived-props] (17) def __init__(self, filename, exclusions, eastasianwidth, unihan, - derivedprops, derivednormalizationprops=None, expand=1): + derivedprops, derivednormalizationprops=None, linebreakprops=None, + expand=1): self.changed = [] file = open(filename) table = [None] * 0x110000 @@ -912,6 +918,19 @@ class UnicodeData: # apply to unassigned code points; ignore them table[char][-1].add(p) + if linebreakprops: + for s in open(linebreakprops): + s = s.partition('#')[0] + s = [i.strip() for i in s.split(';')] + if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: + continue + if '..' not in s[0]: + first = last = int(s[0], 16) + else: + first, last = [int(c, 16) for c in s[0].split('..')] + for char in range(first, last+1): + table[char][-1].add('Line_Break') + if derivednormalizationprops: quickchecks = [0] * 0x110000 # default is Yes qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() -- cgit v0.12