diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2008-12-13 23:12:30 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2008-12-13 23:12:30 (GMT) |
commit | 74af3bbfbdb925b5af5ec26f75cbc56698331d62 (patch) | |
tree | 2140ff820d9ab1498ad128606ee216cc681e8f88 /Lib/textwrap.py | |
parent | 9f35070a6b101d61f45b36679092c26fa4f5532f (diff) | |
download | cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.zip cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.tar.gz cpython-74af3bbfbdb925b5af5ec26f75cbc56698331d62.tar.bz2 |
Issue #4163: Use unicode-friendly word splitting in the textwrap functions when given a unicode string.
Diffstat (limited to 'Lib/textwrap.py')
-rw-r--r-- | Lib/textwrap.py | 14 |
1 files changed, 8 insertions, 6 deletions
diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 53f2f1b..192b43b 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -84,16 +84,16 @@ class TextWrapper: # splits into # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! # (after stripping out empty strings). - wordsep_re = re.compile( + wordsep_re = ( r'(\s+|' # any whitespace - r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words + r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash # This less funky little regex just split on recognized spaces. E.g. # "Hello there -- you goof-ball, use the -b option!" # splits into # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ - wordsep_simple_re = re.compile(r'(\s+)') + wordsep_simple_re = r'(\s+)' # XXX this is not locale- or charset-aware -- string.lowercase # is US-ASCII only (and therefore English-only) @@ -160,10 +160,12 @@ class TextWrapper: 'use', ' ', 'the', ' ', '-b', ' ', option!' otherwise. """ - if self.break_on_hyphens is True: - chunks = self.wordsep_re.split(text) + flags = re.UNICODE if isinstance(text, unicode) else 0 + if self.break_on_hyphens: + pat = self.wordsep_re else: - chunks = self.wordsep_simple_re.split(text) + pat = self.wordsep_simple_re + chunks = re.compile(pat, flags).split(text) chunks = filter(None, chunks) # remove empty chunks return chunks |