Issue #22687: Fixed some corner cases in breaking words in textwrap.

Got rid of quadratic complexity in breaking long words.
author: Serhiy Storchaka <storchaka@gmail.com> 2015-03-24 16:32:27 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2015-03-24 16:32:27 (GMT)
commit: 72bd327db0b26e542a327449bef77bd2bc059da4 (patch)
tree: afd02416b97da57618656c464f292cda959453cd /Lib
parent: b365a06a8444c90e4c79d4bff1c58cd9dd793569 (diff)
download: cpython-72bd327db0b26e542a327449bef77bd2bc059da4.zip
cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.gz
cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.bz2
2 files changed, 35 insertions, 4 deletions
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
index 1bba77e..707aaaa 100644
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -184,6 +184,16 @@ What a mess!
         self.check_wrap(text, 42,
                         ["this-is-a-useful-feature-for-reformatting-",
                          "posts-from-tim-peters'ly"])
+        # This test pins current behavior; that behavior is not part of the API.
+        expect = ("this-|is-|a-|useful-|feature-|for-|"
+                  "reformatting-|posts-|from-|tim-|peters'ly").split('|')
+        self.check_wrap(text, 1, expect, break_long_words=False)
+        self.check_split(text, expect)
+
+        self.check_split('e-mail', ['e-mail'])
+        self.check_split('Jelly-O', ['Jelly-O'])
+        # This test pins current behavior; that behavior is not part of the API.
+        self.check_split('half-a-crown', 'half-|a-|crown'.split('|'))
 
     def test_hyphenated_numbers(self):
         # Test that hyphenated numbers (eg. dates) are not broken like words.
@@ -195,6 +205,7 @@ What a mess!
                                    'released on 1994-02-15.'])
         self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.',
                                    'Python 1.0.1 was released on 1994-02-15.'])
+        self.check_wrap(text, 1, text.split(), break_long_words=False)
 
         text = "I do all my shopping at 7-11."
         self.check_wrap(text, 25, ["I do all my shopping at",
@@ -202,6 +213,7 @@ What a mess!
         self.check_wrap(text, 27, ["I do all my shopping at",
                                    "7-11."])
         self.check_wrap(text, 29, ["I do all my shopping at 7-11."])
+        self.check_wrap(text, 1, text.split(), break_long_words=False)
 
     def test_em_dash(self):
         # Test text with em-dashes
@@ -326,6 +338,10 @@ What a mess!
         self.check_split("the ['wibble-wobble'] widget",
                          ['the', ' ', "['wibble-", "wobble']", ' ', 'widget'])
 
+        # This test pins current behavior; that behavior is not part of the API.
+        self.check_split("what-d'you-call-it.",
+                         "what-d'you-|call-|it.".split('|'))
+
     def test_funky_parens (self):
         # Second part of SF bug #596434: long option strings inside
         # parentheses.
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index 2489180..49ea9a6 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -79,10 +79,25 @@ class TextWrapper:
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
     # (after stripping out empty strings).
-    wordsep_re = re.compile(
-        r'(\s+|'                                  # any whitespace
-        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
-        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
+    word_punct = r'[\w!"\'&.,?]'
+    letter = r'[^\d\W]'
+    wordsep_re = re.compile(r'''
+        ( # any whitespace
+          \s+
+        | # em-dash between words
+          (?<=%(wp)s) -{2,} (?=\w)
+        | # word, possibly hyphenated
+          \S+? (?:
+            # hyphenated word
+              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
+              (?= %(lt)s -? %(lt)s)
+            | # end of word
+              (?=\s|\Z)
+            | # em-dash
+              (?<=%(wp)s) (?=-{2,}\w)
+            )
+        )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE)
+    del word_punct, letter
 
     # This less funky little regex just split on recognized spaces. E.g.
     #   "Hello there -- you goof-ball, use the -b option!"
author	Serhiy Storchaka <storchaka@gmail.com>	2015-03-24 16:32:27 (GMT)
committer	Serhiy Storchaka <storchaka@gmail.com>	2015-03-24 16:32:27 (GMT)
commit	72bd327db0b26e542a327449bef77bd2bc059da4 (patch)
tree	afd02416b97da57618656c464f292cda959453cd /Lib
parent	b365a06a8444c90e4c79d4bff1c58cd9dd793569 (diff)
download	cpython-72bd327db0b26e542a327449bef77bd2bc059da4.zip cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.gz cpython-72bd327db0b26e542a327449bef77bd2bc059da4.tar.bz2