summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2015-03-24 16:32:27 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2015-03-24 16:32:27 (GMT)
commit72bd327db0b26e542a327449bef77bd2bc059da4 (patch)
treeafd02416b97da57618656c464f292cda959453cd
parentb365a06a8444c90e4c79d4bff1c58cd9dd793569 (diff)
downloadcpython-72bd327db0b26e542a327449bef77bd2bc059da4.zip
cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.gz
cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.bz2
Issue #22687: Fixed some corner cases in breaking words in textwrap.
Got rid of quadratic complexity in breaking long words.
-rw-r--r--Lib/test/test_textwrap.py16
-rw-r--r--Lib/textwrap.py23
-rw-r--r--Misc/NEWS3
3 files changed, 38 insertions, 4 deletions
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
index 1bba77e..707aaaa 100644
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -184,6 +184,16 @@ What a mess!
self.check_wrap(text, 42,
["this-is-a-useful-feature-for-reformatting-",
"posts-from-tim-peters'ly"])
+ # This test checks current behavior, which is not a documented part of the API.
+ expect = ("this-|is-|a-|useful-|feature-|for-|"
+ "reformatting-|posts-|from-|tim-|peters'ly").split('|')
+ self.check_wrap(text, 1, expect, break_long_words=False)
+ self.check_split(text, expect)
+
+ self.check_split('e-mail', ['e-mail'])
+ self.check_split('Jelly-O', ['Jelly-O'])
+ # This test checks current behavior, which is not a documented part of the API.
+ self.check_split('half-a-crown', 'half-|a-|crown'.split('|'))
def test_hyphenated_numbers(self):
# Test that hyphenated numbers (eg. dates) are not broken like words.
@@ -195,6 +205,7 @@ What a mess!
'released on 1994-02-15.'])
self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.',
'Python 1.0.1 was released on 1994-02-15.'])
+ self.check_wrap(text, 1, text.split(), break_long_words=False)
text = "I do all my shopping at 7-11."
self.check_wrap(text, 25, ["I do all my shopping at",
@@ -202,6 +213,7 @@ What a mess!
self.check_wrap(text, 27, ["I do all my shopping at",
"7-11."])
self.check_wrap(text, 29, ["I do all my shopping at 7-11."])
+ self.check_wrap(text, 1, text.split(), break_long_words=False)
def test_em_dash(self):
# Test text with em-dashes
@@ -326,6 +338,10 @@ What a mess!
self.check_split("the ['wibble-wobble'] widget",
['the', ' ', "['wibble-", "wobble']", ' ', 'widget'])
+ # This test checks current behavior, which is not a documented part of the API.
+ self.check_split("what-d'you-call-it.",
+ "what-d'you-|call-|it.".split('|'))
+
def test_funky_parens (self):
# Second part of SF bug #596434: long option strings inside
# parentheses.
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index 2489180..49ea9a6 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -79,10 +79,25 @@ class TextWrapper:
# splits into
# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
# (after stripping out empty strings).
- wordsep_re = re.compile(
- r'(\s+|' # any whitespace
- r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words
- r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
+ word_punct = r'[\w!"\'&.,?]'
+ letter = r'[^\d\W]'
+ wordsep_re = re.compile(r'''
+ ( # any whitespace
+ \s+
+ | # em-dash between words
+ (?<=%(wp)s) -{2,} (?=\w)
+ | # word, possibly hyphenated
+ \S+? (?:
+ # hyphenated word
+ -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
+ (?= %(lt)s -? %(lt)s)
+ | # end of word
+ (?=\s|\Z)
+ | # em-dash
+ (?<=%(wp)s) (?=-{2,}\w)
+ )
+ )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
+ del word_punct, letter
# This less funky little regex just split on recognized spaces. E.g.
# "Hello there -- you goof-ball, use the -b option!"
diff --git a/Misc/NEWS b/Misc/NEWS
index 7d888a2..3110aad 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,9 @@ Core and Builtins
Library
-------
+- Issue #22687: Fixed some corner cases in breaking words in textwrap.
+ Got rid of quadratic complexity in breaking long words.
+
- Issue #20289: The copy module now uses pickle protocol 4 (PEP 3154) and
supports copying of instances of classes whose __new__ method takes
keyword-only arguments.