#7643: Unicode code points VT (0x0B) and FF (0x0C) are line breaks according to Unicode Standard Annex #14.

author: Florent Xicluna <florent.xicluna@gmail.com> 2010-03-30 08:24:06 (GMT)
committer: Florent Xicluna <florent.xicluna@gmail.com> 2010-03-30 08:24:06 (GMT)
commit: 22b243809eea5f4f504f9a7ac157539adde6b3f7 (patch)
tree: c910ac626faa4b6ec4565e1762b41089364aee5d /Objects
parent: e6410c536c9dca5a3a7899888c071f41a1767291 (diff)
download: cpython-22b243809eea5f4f504f9a7ac157539adde6b3f7.zip
cpython-22b243809eea5f4f504f9a7ac157539adde6b3f7.tar.gz
cpython-22b243809eea5f4f504f9a7ac157539adde6b3f7.tar.bz2
2 files changed, 11 insertions, 6 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4943413..930d58c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -115,9 +115,9 @@ static char unicode_default_encoding[100];
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
-/*     case 0x0009: * HORIZONTAL TABULATION */
+/*     case 0x0009: * CHARACTER TABULATION */
 /*     case 0x000A: * LINE FEED */
-/*     case 0x000B: * VERTICAL TABULATION */
+/*     case 0x000B: * LINE TABULATION */
 /*     case 0x000C: * FORM FEED */
 /*     case 0x000D: * CARRIAGE RETURN */
     0, 1, 1, 1, 1, 1, 0, 0,
@@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = {
 static unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
+/*         0x000B, * LINE TABULATION */
+/*         0x000C, * FORM FEED */
 /*         0x000D, * CARRIAGE RETURN */
-    0, 0, 1, 0, 0, 1, 0, 0,
+    0, 0, 1, 1, 1, 1, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x001C, * FILE SEPARATOR */
 /*         0x001D, * GROUP SEPARATOR */
diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h
index 443693d..d2ec46b 100644
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -661,7 +661,7 @@ static unsigned char index1[] = {
 };
 
 static unsigned char index2[] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
     1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
     4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 
     14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
@@ -3313,13 +3313,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
 #endif
 }
 
-/* Returns 1 for Unicode characters having the category 'Zl',
- * 'Zp' or type 'B', 0 otherwise.
+/* Returns 1 for Unicode characters having the line break
+ * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
+ * type 'B', 0 otherwise.
  */
 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
 {
     switch (ch) {
     case 0x000A:
+    case 0x000B:
+    case 0x000C:
     case 0x000D:
     case 0x001C:
     case 0x001D:
author	Florent Xicluna <florent.xicluna@gmail.com>	2010-03-30 08:24:06 (GMT)
committer	Florent Xicluna <florent.xicluna@gmail.com>	2010-03-30 08:24:06 (GMT)
commit	22b243809eea5f4f504f9a7ac157539adde6b3f7 (patch)
tree	c910ac626faa4b6ec4565e1762b41089364aee5d /Objects
parent	e6410c536c9dca5a3a7899888c071f41a1767291 (diff)
download	cpython-22b243809eea5f4f504f9a7ac157539adde6b3f7.zip cpython-22b243809eea5f4f504f9a7ac157539adde6b3f7.tar.gz cpython-22b243809eea5f4f504f9a7ac157539adde6b3f7.tar.bz2