Issue #4163: Use unicode-friendly word splitting in the textwrap functions when given a unicode string.

author: Antoine Pitrou <solipsis@pitrou.net> 2008-12-13 23:12:30 (GMT)
committer: Antoine Pitrou <solipsis@pitrou.net> 2008-12-13 23:12:30 (GMT)
commit: 74af3bbfbdb925b5af5ec26f75cbc56698331d62 (patch)
tree: 2140ff820d9ab1498ad128606ee216cc681e8f88 /Lib/textwrap.py
parent: 9f35070a6b101d61f45b36679092c26fa4f5532f (diff)
download: cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.zip
cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.tar.gz
cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.tar.bz2
1 file changed, 8 insertions, 6 deletions
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index 53f2f1b..192b43b 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -84,16 +84,16 @@ class TextWrapper:
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
     # (after stripping out empty strings).
-    wordsep_re = re.compile(
+    wordsep_re = (
         r'(\s+|'                                  # any whitespace
-        r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
+        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
         r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
 
     # This less funky little regex just split on recognized spaces. E.g.
     #   "Hello there -- you goof-ball, use the -b option!"
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(\s+)')
+    wordsep_simple_re = r'(\s+)'
 
     # XXX this is not locale- or charset-aware -- string.lowercase
     # is US-ASCII only (and therefore English-only)
@@ -160,10 +160,12 @@ class TextWrapper:
           'use', ' ', 'the', ' ', '-b', ' ', option!'
         otherwise.
         """
-        if self.break_on_hyphens is True:
-            chunks = self.wordsep_re.split(text)
+        flags = re.UNICODE if isinstance(text, unicode) else 0
+        if self.break_on_hyphens:
+            pat = self.wordsep_re
         else:
-            chunks = self.wordsep_simple_re.split(text)
+            pat = self.wordsep_simple_re
+        chunks = re.compile(pat, flags).split(text)
         chunks = filter(None, chunks)  # remove empty chunks
         return chunks
author	Antoine Pitrou <solipsis@pitrou.net>	2008-12-13 23:12:30 (GMT)
committer	Antoine Pitrou <solipsis@pitrou.net>	2008-12-13 23:12:30 (GMT)
commit	74af3bbfbdb925b5af5ec26f75cbc56698331d62 (patch)
tree	2140ff820d9ab1498ad128606ee216cc681e8f88 /Lib/textwrap.py
parent	9f35070a6b101d61f45b36679092c26fa4f5532f (diff)
download	cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.zip cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.tar.gz cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.tar.bz2