summaryrefslogtreecommitdiffstats
path: root/Lib/textwrap.py
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2015-03-24 16:32:27 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2015-03-24 16:32:27 (GMT)
commit72bd327db0b26e542a327449bef77bd2bc059da4 (patch)
treeafd02416b97da57618656c464f292cda959453cd /Lib/textwrap.py
parentb365a06a8444c90e4c79d4bff1c58cd9dd793569 (diff)
downloadcpython-72bd327db0b26e542a327449bef77bd2bc059da4.zip
cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.gz
cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.bz2
Issue #22687: Fixed some corner cases in breaking words in textwrap.
Got rid of quadratic complexity in breaking long words.
Diffstat (limited to 'Lib/textwrap.py')
-rw-r--r--Lib/textwrap.py23
1 files changed, 19 insertions, 4 deletions
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index 2489180..49ea9a6 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -79,10 +79,25 @@ class TextWrapper:
# splits into
# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
# (after stripping out empty strings).
- wordsep_re = re.compile(
- r'(\s+|' # any whitespace
- r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words
- r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
+ word_punct = r'[\w!"\'&.,?]'
+ letter = r'[^\d\W]'
+ wordsep_re = re.compile(r'''
+ ( # any whitespace
+ \s+
+ | # em-dash between words
+ (?<=%(wp)s) -{2,} (?=\w)
+ | # word, possibly hyphenated
+ \S+? (?:
+ # hyphenated word
+ -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
+ (?= %(lt)s -? %(lt)s)
+ | # end of word
+ (?=\s|\Z)
+ | # em-dash
+ (?<=%(wp)s) (?=-{2,}\w)
+ )
+ )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
+ del word_punct, letter
# This less funky little regex just split on recognized spaces. E.g.
# "Hello there -- you goof-ball, use the -b option!"