Merged revisions 79494,79496 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (mar, 30 mar 2010) | 2 lines

#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.
........
r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (mar, 30 mar 2010) | 2 lines

Highlight the change of behavior related to r79494. Now VT and FF are linebreaks.
........
author: Florent Xicluna <florent.xicluna@gmail.com> 2010-03-30 19:34:18 (GMT)
committer: Florent Xicluna <florent.xicluna@gmail.com> 2010-03-30 19:34:18 (GMT)
commit: 806d8cf0e8056726580e210e1dea960d6e77c910 (patch)
tree: ed95ffd06d353ecdffdbdacba271d5dda71f80aa /Tools/unicode
parent: 364129ef5a806bf919b5d321206cc1b72aed7272 (diff)
download: cpython-806d8cf0e8056726580e210e1dea960d6e77c910.zip
cpython-806d8cf0e8056726580e210e1dea960d6e77c910.tar.gz
cpython-806d8cf0e8056726580e210e1dea960d6e77c910.tar.bz2
1 files changed, 24 insertions, 5 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 4eda1b9..f38b866 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -38,6 +38,7 @@ EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -52,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
 
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
 
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
@@ -77,7 +80,8 @@ def maketables(trace=0):
                           EASTASIAN_WIDTH % version,
                           UNIHAN % version,
                           DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version)
+                          DERIVEDNORMALIZATION_PROPS % version,
+                          LINE_BREAK % version)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
@@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace):
                 flags |= ALPHA_MASK
             if category == "Ll":
                 flags |= LOWER_MASK
-            if category == "Zl" or bidirectional == "B":
+            if 'Line_Break' in properties or bidirectional == "B":
                 flags |= LINEBREAK_MASK
                 linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace):
     print(file=fp)
 
     # Generate code for _PyUnicode_IsLinebreak()
-    print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
-    print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
+    print("/* Returns 1 for Unicode characters having the line break", file=fp)
+    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
+    print(" * type 'B', 0 otherwise.", file=fp)
     print(" */", file=fp)
     print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
     print('{', file=fp)
@@ -826,7 +831,8 @@ class UnicodeData:
     #  derived-props] (17)
 
     def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, expand=1):
+                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+                 expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000
@@ -912,6 +918,19 @@ class UnicodeData:
                     # apply to unassigned code points; ignore them
                     table[char][-1].add(p)
 
+        if linebreakprops:
+            for s in open(linebreakprops):
+                s = s.partition('#')[0]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                    continue
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    table[char][-1].add('Line_Break')
+
         if derivednormalizationprops:
             quickchecks = [0] * 0x110000 # default is Yes
             qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
author	Florent Xicluna <florent.xicluna@gmail.com>	2010-03-30 19:34:18 (GMT)
committer	Florent Xicluna <florent.xicluna@gmail.com>	2010-03-30 19:34:18 (GMT)
commit	806d8cf0e8056726580e210e1dea960d6e77c910 (patch)
tree	ed95ffd06d353ecdffdbdacba271d5dda71f80aa /Tools/unicode
parent	364129ef5a806bf919b5d321206cc1b72aed7272 (diff)
download	cpython-806d8cf0e8056726580e210e1dea960d6e77c910.zip cpython-806d8cf0e8056726580e210e1dea960d6e77c910.tar.gz cpython-806d8cf0e8056726580e210e1dea960d6e77c910.tar.bz2