diff options
Diffstat (limited to 'Lib/test/test_re.py')
-rw-r--r-- | Lib/test/test_re.py | 566 |
1 files changed, 425 insertions, 141 deletions
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 7348af3..7a74141 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -38,6 +38,24 @@ class ReTests(unittest.TestCase): self.assertIs(type(actual), type(expect), msg) recurse(actual, expect) + def checkPatternError(self, pattern, errmsg, pos=None): + with self.assertRaises(re.error) as cm: + re.compile(pattern) + with self.subTest(pattern=pattern): + err = cm.exception + self.assertEqual(err.msg, errmsg) + if pos is not None: + self.assertEqual(err.pos, pos) + + def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): + with self.assertRaises(re.error) as cm: + re.sub(pattern, repl, string) + with self.subTest(pattern=pattern, repl=repl): + err = cm.exception + self.assertEqual(err.msg, errmsg) + if pos is not None: + self.assertEqual(err.pos, pos) + def test_keep_buffer(self): # See bug 14212 b = bytearray(b'x') @@ -84,7 +102,7 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), '9.3 -3 24x100y') - self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), + self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3), '9.3 -3 23x99y') self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') @@ -100,11 +118,14 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') - self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), - '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') - self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), - (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) + self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') + self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), + (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) + for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + with self.subTest(c): + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') @@ -145,6 +166,7 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') self.assertEqual(re.sub('x', r'\111', 'x'), '\111') self.assertEqual(re.sub('x', r'\117', 'x'), '\117') + self.assertEqual(re.sub('x', r'\377', 'x'), '\377') self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') @@ -155,21 +177,25 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') - self.assertEqual(re.sub('x', r'\400', 'x'), '\0') - self.assertEqual(re.sub('x', r'\777', 'x'), '\377') - - self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' - self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') - self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' - self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' + self.checkTemplateError('x', r'\400', 'x', + r'octal escape value \400 outside of ' + r'range 0-0o377', 0) + self.checkTemplateError('x', r'\777', 'x', + r'octal escape value \777 outside of ' + r'range 0-0o377', 0) + + self.checkTemplateError('x', r'\1', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\8', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\9', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\11', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\18', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\90', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\99', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8' + self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference') + self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1' + self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0' # in python2.3 (etc), these loop endlessly in sre_parser.py self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') @@ -180,7 +206,7 @@ class ReTests(unittest.TestCase): def test_qualified_re_sub(self): self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') - self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') + self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') def test_bug_114660(self): self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), @@ -194,75 +220,105 @@ class ReTests(unittest.TestCase): def test_symbolic_groups(self): re.compile('(?P<a>x)(?P=a)(?(a)y)') re.compile('(?P<a1>x)(?P=a1)(?(a1)y)') - self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)') - self.assertRaises(re.error, re.compile, '(?Px)') - self.assertRaises(re.error, re.compile, '(?P=)') - self.assertRaises(re.error, re.compile, '(?P=1)') - self.assertRaises(re.error, re.compile, '(?P=a)') - self.assertRaises(re.error, re.compile, '(?P=a1)') - self.assertRaises(re.error, re.compile, '(?P=a.)') - self.assertRaises(re.error, re.compile, '(?P<)') - self.assertRaises(re.error, re.compile, '(?P<>)') - self.assertRaises(re.error, re.compile, '(?P<1>)') - self.assertRaises(re.error, re.compile, '(?P<a.>)') - self.assertRaises(re.error, re.compile, '(?())') - self.assertRaises(re.error, re.compile, '(?(a))') - self.assertRaises(re.error, re.compile, '(?(1a))') - self.assertRaises(re.error, re.compile, '(?(a.))') + re.compile('(?P<a1>x)\1(?(1)y)') + self.checkPatternError('(?P<a>)(?P<a>)', + "redefinition of group name 'a' as group 2; " + "was group 1") + self.checkPatternError('(?P<a>(?P=a))', + "cannot refer to an open group", 10) + self.checkPatternError('(?Pxy)', 'unknown extension ?Px') + self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11) + self.checkPatternError('(?P=', 'missing group name', 4) + self.checkPatternError('(?P=)', 'missing group name', 4) + self.checkPatternError('(?P=1)', "bad character in group name '1'", 4) + self.checkPatternError('(?P=a)', "unknown group name 'a'") + self.checkPatternError('(?P=a1)', "unknown group name 'a1'") + self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4) + self.checkPatternError('(?P<)', 'missing >, unterminated name', 4) + self.checkPatternError('(?P<a', 'missing >, unterminated name', 4) + self.checkPatternError('(?P<', 'missing group name', 4) + self.checkPatternError('(?P<>)', 'missing group name', 4) + self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4) + self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4) + self.checkPatternError(r'(?(', 'missing group name', 3) + self.checkPatternError(r'(?())', 'missing group name', 3) + self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3) + self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) + self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) + self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) # New valid/invalid identifiers in Python 3 re.compile('(?P<ยต>x)(?P=ยต)(?(ยต)y)') re.compile('(?P<๐๐ซ๐ฆ๐ ๐ฌ๐ก๐ข>x)(?P=๐๐ซ๐ฆ๐ ๐ฌ๐ก๐ข)(?(๐๐ซ๐ฆ๐ ๐ฌ๐ก๐ข)y)') - self.assertRaises(re.error, re.compile, '(?P<ยฉ>x)') + self.checkPatternError('(?P<ยฉ>x)', "bad character in group name 'ยฉ'", 4) + # Support > 100 groups. + pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) + pat = '(?:%s)(?(200)z|t)' % pat + self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) def test_symbolic_refs(self): - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx') - self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx') + self.checkTemplateError('(?P<a>x)', '\g<a', 'xx', + 'missing >, unterminated name', 3) + self.checkTemplateError('(?P<a>x)', '\g<', 'xx', + 'missing group name', 3) + self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2) + self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx', + "bad character in group name 'a a'", 3) + self.checkTemplateError('(?P<a>x)', '\g<>', 'xx', + 'missing group name', 3) + self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx', + "bad character in group name '1a1'", 3) + self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx', + 'invalid group reference') + self.checkTemplateError('(?P<a>x)', r'\2', 'xx', + 'invalid group reference') + with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): + re.sub('(?P<a>x)', '\g<ab>', 'xx') + self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') + self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') + self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx', + "bad character in group name '-1'", 3) # New valid/invalid identifiers in Python 3 self.assertEqual(re.sub('(?P<ยต>x)', r'\g<ยต>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<๐๐ซ๐ฆ๐ ๐ฌ๐ก๐ข>x)', r'\g<๐๐ซ๐ฆ๐ ๐ฌ๐ก๐ข>', 'xx'), 'xx') - self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<ยฉ>', 'xx') + self.checkTemplateError('(?P<a>x)', '\g<ยฉ>', 'xx', + "bad character in group name 'ยฉ'", 3) + # Support > 100 groups. + pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) + self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8') def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) - self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) + self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2)) def test_re_split(self): for string in ":a:b::c", S(":a:b::c"): self.assertTypedEqual(re.split(":", string), ['', 'a', 'b', '', 'c']) - self.assertTypedEqual(re.split(":*", string), + self.assertTypedEqual(re.split(":+", string), ['', 'a', 'b', 'c']) - self.assertTypedEqual(re.split("(:*)", string), + self.assertTypedEqual(re.split("(:+)", string), ['', ':', 'a', ':', 'b', '::', 'c']) for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), memoryview(b":a:b::c")): self.assertTypedEqual(re.split(b":", string), [b'', b'a', b'b', b'', b'c']) - self.assertTypedEqual(re.split(b":*", string), + self.assertTypedEqual(re.split(b":+", string), [b'', b'a', b'b', b'c']) - self.assertTypedEqual(re.split(b"(:*)", string), + self.assertTypedEqual(re.split(b"(:+)", string), [b'', b':', b'a', b':', b'b', b'::', b'c']) for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"): string = ":%s:%s::%s" % (a, b, c) self.assertEqual(re.split(":", string), ['', a, b, '', c]) - self.assertEqual(re.split(":*", string), ['', a, b, c]) - self.assertEqual(re.split("(:*)", string), + self.assertEqual(re.split(":+", string), ['', a, b, c]) + self.assertEqual(re.split("(:+)", string), ['', ':', a, ':', b, '::', c]) - self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) - self.assertEqual(re.split("(:)*", ":a:b::c"), + self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:)+", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c']) self.assertEqual(re.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c']) @@ -272,13 +328,34 @@ class ReTests(unittest.TestCase): self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c']) + for sep, expected in [ + (':*', ['', 'a', 'b', 'c']), + ('(?::*)', ['', 'a', 'b', 'c']), + ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), + ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), + ]: + with self.subTest(sep=sep), self.assertWarns(FutureWarning): + self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) + + for sep, expected in [ + ('', [':a:b::c']), + (r'\b', [':a:b::c']), + (r'(?=:)', [':a:b::c']), + (r'(?<=:)', [':a:b::c']), + ]: + with self.subTest(sep=sep), self.assertRaises(ValueError): + self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) + def test_qualified_re_split(self): - self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) - self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) - self.assertEqual(re.split("(:)", ":a:b::c", 2), + self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) + self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) + self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) - self.assertEqual(re.split("(:*)", ":a:b::c", 2), + self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), ['', ':', 'a', ':', 'b::c']) + with self.assertWarns(FutureWarning): + self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), + ['', ':', 'a', ':', 'b::c']) def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) @@ -405,6 +482,23 @@ class ReTests(unittest.TestCase): self.assertIsNone(p.match('abd')) self.assertIsNone(p.match('ac')) + # Support > 100 groups. + pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) + pat = '(?:%s)(?(200)z)' % pat + self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) + + self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10) + self.checkPatternError(r'()(?(1)a|b', + 'missing ), unterminated subpattern', 2) + self.checkPatternError(r'()(?(1)a|b|c)', + 'conditional backref with more than ' + 'two branches', 10) + + def test_re_groupref_overflow(self): + self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx', + 'invalid group reference', 3) + self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS, + 'invalid group reference', 10) def test_re_groupref(self): self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), @@ -418,6 +512,8 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), (None, None)) + self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4) + def test_groupdict(self): self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 'first second').groupdict(), @@ -428,6 +524,10 @@ class ReTests(unittest.TestCase): "first second") .expand(r"\2 \1 \g<second> \g<first>"), "second first second first") + self.assertEqual(re.match("(?P<first>first)|(?P<second>second)", + "first") + .expand(r"\2 \g<second>"), + " ") def test_repeat_minmax(self): self.assertIsNone(re.match("^(\w){1}$", "abc")) @@ -451,6 +551,7 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match("^x{3}$", "xxx")) self.assertTrue(re.match("^x{1,3}$", "xxx")) + self.assertTrue(re.match("^x{3,3}$", "xxx")) self.assertTrue(re.match("^x{1,4}$", "xxx")) self.assertTrue(re.match("^x{3,4}?$", "xxx")) self.assertTrue(re.match("^x{3}?$", "xxx")) @@ -461,6 +562,9 @@ class ReTests(unittest.TestCase): self.assertIsNone(re.match("^x{}$", "xxx")) self.assertTrue(re.match("^x{}$", "x{}")) + self.checkPatternError(r'x{2,1}', + 'min repeat greater than max repeat', 2) + def test_getattr(self): self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) @@ -475,6 +579,14 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) self.assertTrue(re.match("(a)", "a").re) + # Issue 14260. groupindex should be non-modifiable mapping. + p = re.compile(r'(?i)(?P<first>a)(?P<other>b)') + self.assertEqual(sorted(p.groupindex), ['first', 'other']) + self.assertEqual(p.groupindex['other'], 2) + with self.assertRaises(TypeError): + p.groupindex['other'] = 0 + self.assertEqual(p.groupindex['other'], 2) + def test_special_escapes(self): self.assertEqual(re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1), "bx") @@ -484,10 +596,6 @@ class ReTests(unittest.TestCase): "abcd abc bcd bx", re.ASCII).group(1), "bx") self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.ASCII).group(1), "bx") - self.assertEqual(re.search(r"\b(b.)\b", - "abcd abc bcd bx", re.LOCALE).group(1), "bx") - self.assertEqual(re.search(r"\B(b.)\B", - "abc bcd bc abxd", re.LOCALE).group(1), "bx") self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) @@ -508,11 +616,32 @@ class ReTests(unittest.TestCase): b"1aa! a").group(0), b"1aa! a") self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.ASCII).group(0), "1aa! a") - self.assertEqual(re.search(r"\d\D\w\W\s\S", - "1aa! a", re.LOCALE).group(0), "1aa! a") self.assertEqual(re.search(br"\d\D\w\W\s\S", b"1aa! a", re.LOCALE).group(0), b"1aa! a") + def test_other_escapes(self): + self.checkPatternError("\\", 'bad escape (end of pattern)', 0) + self.assertEqual(re.match(r"\(", '(').group(), '(') + self.assertIsNone(re.match(r"\(", ')')) + self.assertEqual(re.match(r"\\", '\\').group(), '\\') + self.assertEqual(re.match(r"[\]]", ']').group(), ']') + self.assertIsNone(re.match(r"[\]]", '[')) + self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') + self.assertIsNone(re.match(r"[a\-c]", 'b')) + self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') + self.assertIsNone(re.match(r"[\^a]+", 'b')) + re.purge() # for warnings + for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': + with self.subTest(c): + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c) + self.assertIsNone(re.match('\\%c' % c, 'a')) + for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': + with self.subTest(c): + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c) + self.assertIsNone(re.match('[\\%c]' % c, 'a')) + def test_string_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), @@ -574,9 +703,6 @@ class ReTests(unittest.TestCase): # Group reference. self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba')) self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac')) - # Named group reference. - self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba')) - self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac')) # Conditional group reference. self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc')) @@ -594,13 +720,25 @@ class ReTests(unittest.TestCase): self.assertIsNone(re.match(r'ab(?<!b)c', 'abc')) self.assertTrue(re.match(r'ab(?<!c)c', 'abc')) # Group reference. - self.assertWarns(RuntimeWarning, re.compile, r'(a)a(?<=\1)c') - # Named group reference. - self.assertWarns(RuntimeWarning, re.compile, r'(?P<g>a)a(?<=(?P=g))c') + self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac')) + self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa')) + self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac')) + self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa')) # Conditional group reference. - self.assertWarns(RuntimeWarning, re.compile, r'(a)b(?<=(?(1)b|x))c') + self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc')) + self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc')) + self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc')) + self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc')) + self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc')) # Group used before defined. - self.assertWarns(RuntimeWarning, re.compile, r'(a)b(?<=(?(2)b|x))(c)') + self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)') + self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc')) + self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc')) + # Group defined in the same lookbehind pattern + self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)') + self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)') + self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') + self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') def test_ignore_case(self): self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") @@ -692,9 +830,12 @@ class ReTests(unittest.TestCase): self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) + self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a')) self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") + self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC") + self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC") def test_not_literal(self): self.assertEqual(re.search("\s([^a])", " b").group(1), "b") @@ -779,8 +920,10 @@ class ReTests(unittest.TestCase): self.assertEqual(re.X, re.VERBOSE) def test_flags(self): - for flag in [re.I, re.M, re.X, re.S, re.L]: + for flag in [re.I, re.M, re.X, re.S, re.A, re.U]: self.assertTrue(re.compile('^pattern$', flag)) + for flag in [re.I, re.M, re.X, re.S, re.A, re.L]: + self.assertTrue(re.compile(b'^pattern$', flag)) def test_sre_character_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: @@ -802,15 +945,17 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r"\08", "\0008")) self.assertTrue(re.match(r"\01", "\001")) self.assertTrue(re.match(r"\018", "\0018")) - self.assertTrue(re.match(r"\567", chr(0o167))) - self.assertRaises(re.error, re.match, r"\911", "") - self.assertRaises(re.error, re.match, r"\x1", "") - self.assertRaises(re.error, re.match, r"\x1z", "") - self.assertRaises(re.error, re.match, r"\u123", "") - self.assertRaises(re.error, re.match, r"\u123z", "") - self.assertRaises(re.error, re.match, r"\U0001234", "") - self.assertRaises(re.error, re.match, r"\U0001234z", "") - self.assertRaises(re.error, re.match, r"\U00110000", "") + self.checkPatternError(r"\567", + r'octal escape value \567 outside of ' + r'range 0-0o377', 0) + self.checkPatternError(r"\911", 'invalid group reference', 0) + self.checkPatternError(r"\x1", r'incomplete escape \x1', 0) + self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0) + self.checkPatternError(r"\u123", r'incomplete escape \u123', 0) + self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0) + self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0) + self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0) + self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0) def test_sre_character_class_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: @@ -830,12 +975,15 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r"[\U%08x]" % i, chr(i))) self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0")) self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z")) + self.checkPatternError(r"[\567]", + r'octal escape value \567 outside of ' + r'range 0-0o377', 1) + self.checkPatternError(r"[\911]", r'bad escape \9', 1) + self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1) + self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1) + self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1) + self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1) self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e")) - self.assertRaises(re.error, re.match, r"[\911]", "") - self.assertRaises(re.error, re.match, r"[\x1z]", "") - self.assertRaises(re.error, re.match, r"[\u123z]", "") - self.assertRaises(re.error, re.match, r"[\U0001234z]", "") - self.assertRaises(re.error, re.match, r"[\U00110000]", "") def test_sre_byte_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255]: @@ -845,16 +993,20 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0")) self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z")) - self.assertTrue(re.match(br"\u", b'u')) - self.assertTrue(re.match(br"\U", b'U')) + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"\u1234", b'u1234')) + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"\U00012345", b'U00012345')) self.assertTrue(re.match(br"\0", b"\000")) self.assertTrue(re.match(br"\08", b"\0008")) self.assertTrue(re.match(br"\01", b"\001")) self.assertTrue(re.match(br"\018", b"\0018")) - self.assertTrue(re.match(br"\567", bytes([0o167]))) - self.assertRaises(re.error, re.match, br"\911", b"") - self.assertRaises(re.error, re.match, br"\x1", b"") - self.assertRaises(re.error, re.match, br"\x1z", b"") + self.checkPatternError(br"\567", + r'octal escape value \567 outside of ' + r'range 0-0o377', 0) + self.checkPatternError(br"\911", 'invalid group reference', 0) + self.checkPatternError(br"\x1", r'incomplete escape \x1', 0) + self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0) def test_sre_byte_class_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255]: @@ -866,10 +1018,26 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i]))) self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) - self.assertTrue(re.match(br"[\u]", b'u')) - self.assertTrue(re.match(br"[\U]", b'U')) - self.assertRaises(re.error, re.match, br"[\911]", b"") - self.assertRaises(re.error, re.match, br"[\x1z]", b"") + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"[\u1234]", b'u')) + with self.assertWarns(DeprecationWarning): + self.assertTrue(re.match(br"[\U00012345]", b'U')) + self.checkPatternError(br"[\567]", + r'octal escape value \567 outside of ' + r'range 0-0o377', 1) + self.checkPatternError(br"[\911]", r'bad escape \9', 1) + self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1) + + def test_character_set_errors(self): + self.checkPatternError(r'[', 'unterminated character set', 0) + self.checkPatternError(r'[^', 'unterminated character set', 0) + self.checkPatternError(r'[a', 'unterminated character set', 0) + # bug 545855 -- This pattern failed to cause a compile error as it + # should, instead provoking a TypeError. + self.checkPatternError(r"[a-", 'unterminated character set', 0) + self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1) + self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1) + self.checkPatternError(r"[b-a]", 'bad character range b-a', 1) def test_bug_113254(self): self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) @@ -884,11 +1052,6 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a') self.assertEqual(re.match("((a))", "a").lastindex, 1) - def test_bug_545855(self): - # bug 545855 -- This pattern failed to cause a compile error as it - # should, instead provoking a TypeError. - self.assertRaises(re.error, re.compile, 'foo[a-') - def test_bug_418626(self): # bugs 418626 at al. -- Testing Greg Chapman's addition of op code # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of @@ -912,6 +1075,24 @@ class ReTests(unittest.TestCase): self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') + def test_nothing_to_repeat(self): + for reps in '*', '+', '?', '{1,2}': + for mod in '', '?': + self.checkPatternError('%s%s' % (reps, mod), + 'nothing to repeat', 0) + self.checkPatternError('(?:%s%s)' % (reps, mod), + 'nothing to repeat', 3) + + def test_multiple_repeat(self): + for outer_reps in '*', '+', '{1,2}': + for outer_mod in '', '?': + outer_op = outer_reps + outer_mod + for inner_reps in '*', '+', '?', '{1,2}': + for inner_mod in '', '?': + inner_op = inner_reps + inner_mod + self.checkPatternError(r'x%s%s' % (inner_op, outer_op), + 'multiple repeat', 1 + len(inner_op)) + def test_unlimited_zero_width_repeat(self): # Issue #9669 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) @@ -1062,8 +1243,8 @@ class ReTests(unittest.TestCase): def test_inline_flags(self): # Bug #1700 - upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow - lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow + upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below + lower_char = '\u1ea1' # Latin Small Letter A with Dot Below p = re.compile(upper_char, re.I | re.U) q = p.match(lower_char) @@ -1143,6 +1324,52 @@ class ReTests(unittest.TestCase): self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE) self.assertRaises(ValueError, re.compile, '(?au)\w') + def test_locale_flag(self): + import locale + _, enc = locale.getlocale(locale.LC_CTYPE) + # Search non-ASCII letter + for i in range(128, 256): + try: + c = bytes([i]).decode(enc) + sletter = c.lower() + if sletter == c: continue + bletter = sletter.encode(enc) + if len(bletter) != 1: continue + if bletter.decode(enc) != sletter: continue + bpat = re.escape(bytes([i])) + break + except (UnicodeError, TypeError): + pass + else: + bletter = None + bpat = b'A' + # Bytes patterns + pat = re.compile(bpat, re.LOCALE | re.IGNORECASE) + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(b'(?L)' + bpat, re.IGNORECASE) + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(bpat, re.IGNORECASE) + if bletter: + self.assertIsNone(pat.match(bletter)) + pat = re.compile(b'\w', re.LOCALE) + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(b'(?L)\w') + if bletter: + self.assertTrue(pat.match(bletter)) + pat = re.compile(b'\w') + if bletter: + self.assertIsNone(pat.match(bletter)) + # Incompatibilities + self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE) + self.assertWarns(DeprecationWarning, re.compile, '(?L)') + self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII) + self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII) + self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE) + self.assertWarns(DeprecationWarning, re.compile, b'(?aL)') + def test_bug_6509(self): # Replacement strings of both types must parse properly. # all strings @@ -1170,8 +1397,10 @@ class ReTests(unittest.TestCase): # a RuntimeError is raised instead of OverflowError. long_overflow = 2**128 self.assertRaises(TypeError, re.finditer, "a", {}) - self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) - self.assertRaises(TypeError, _sre.compile, {}, 0, []) + with self.assertRaises(OverflowError): + _sre.compile("abc", 0, [long_overflow], 0, [], []) + with self.assertRaises(TypeError): + _sre.compile({}, 0, [], 0, [], []) def test_search_dot_unicode(self): self.assertTrue(re.search("123.*-", '123abc-')) @@ -1193,8 +1422,9 @@ class ReTests(unittest.TestCase): def test_bug_13899(self): # Issue #13899: re pattern r"[\A]" should work like "A" but matches # nothing. Ditto B and Z. - self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), - ['A', 'B', '\b', 'C', 'Z']) + with self.assertWarns(DeprecationWarning): + self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), + ['A', 'B', '\b', 'C', 'Z']) @bigmemtest(size=_2G, memuse=1) def test_large_search(self, size): @@ -1253,13 +1483,13 @@ class ReTests(unittest.TestCase): def test_backref_group_name_in_exception(self): # Issue 17341: Poor error message when compiling invalid regex - with self.assertRaisesRegex(sre_constants.error, '<foo>'): - re.compile('(?P=<foo>)') + self.checkPatternError('(?P=<foo>)', + "bad character in group name '<foo>'", 4) def test_group_name_in_exception(self): # Issue 17341: Poor error message when compiling invalid regex - with self.assertRaisesRegex(sre_constants.error, '\?foo'): - re.compile('(?P<?foo>)') + self.checkPatternError('(?P<?foo>)', + "bad character in group name '?foo'", 4) def test_issue17998(self): for reps in '*', '+', '?', '{1}': @@ -1309,22 +1539,22 @@ class ReTests(unittest.TestCase): with captured_stdout() as out: re.compile(pat, re.DEBUG) dump = '''\ -subpattern 1 - literal 46 -subpattern None - branch - in - literal 99 - literal 104 - or - literal 112 - literal 121 -subpattern None - groupref_exists 1 - at at_end - else - literal 58 - literal 32 +SUBPATTERN 1 + LITERAL 46 +SUBPATTERN None + BRANCH + IN + LITERAL 99 + LITERAL 104 + OR + LITERAL 112 + LITERAL 121 +SUBPATTERN None + GROUPREF_EXISTS 1 + AT AT_END + ELSE + LITERAL 58 + LITERAL 32 ''' self.assertEqual(out.getvalue(), dump) # Debug output is output again even a second time (bypassing @@ -1392,6 +1622,55 @@ subpattern None self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) + def test_error(self): + with self.assertRaises(re.error) as cm: + re.compile('(\u20ac))') + err = cm.exception + self.assertIsInstance(err.pattern, str) + self.assertEqual(err.pattern, '(\u20ac))') + self.assertEqual(err.pos, 3) + self.assertEqual(err.lineno, 1) + self.assertEqual(err.colno, 4) + self.assertIn(err.msg, str(err)) + self.assertIn(' at position 3', str(err)) + self.assertNotIn(' at position 3', err.msg) + # Bytes pattern + with self.assertRaises(re.error) as cm: + re.compile(b'(\xa4))') + err = cm.exception + self.assertIsInstance(err.pattern, bytes) + self.assertEqual(err.pattern, b'(\xa4))') + self.assertEqual(err.pos, 3) + # Multiline pattern + with self.assertRaises(re.error) as cm: + re.compile(""" + ( + abc + ) + ) + ( + """, re.VERBOSE) + err = cm.exception + self.assertEqual(err.pos, 77) + self.assertEqual(err.lineno, 5) + self.assertEqual(err.colno, 17) + self.assertIn(err.msg, str(err)) + self.assertIn(' at position 77', str(err)) + self.assertIn('(line 5, column 17)', str(err)) + + def test_misc_errors(self): + self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0) + self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0) + self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5) + self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) + self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) + self.checkPatternError(r'(?iz)', 'unknown flag', 3) + self.checkPatternError(r'(?i', 'missing )', 3) + self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) + self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) + self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) + self.checkPatternError(r'(?', 'unexpected end of pattern', 2) + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): @@ -1436,6 +1715,10 @@ class PatternReprTests(unittest.TestCase): self.check_flags(b'bytes pattern', re.A, "re.compile(b'bytes pattern', re.ASCII)") + def test_locale(self): + self.check_flags(b'bytes pattern', re.L, + "re.compile(b'bytes pattern', re.LOCALE)") + def test_quotes(self): self.check('random "double quoted" pattern', '''re.compile('random "double quoted" pattern')''') @@ -1549,8 +1832,16 @@ class ExternalTests(unittest.TestCase): pass else: with self.subTest('bytes pattern match'): - bpat = re.compile(bpat) - self.assertTrue(bpat.search(bs)) + obj = re.compile(bpat) + self.assertTrue(obj.search(bs)) + + # Try the match with LOCALE enabled, and check that it + # still succeeds. + with self.subTest('locale-sensitive match'): + obj = re.compile(bpat, re.LOCALE) + result = obj.search(bs) + if result is None: + print('=== Fails on locale-sensitive match', t) # Try the match with the search area limited to the extent # of the match and see if it still succeeds. \B will @@ -1568,13 +1859,6 @@ class ExternalTests(unittest.TestCase): obj = re.compile(pattern, re.IGNORECASE) self.assertTrue(obj.search(s)) - # Try the match with LOCALE enabled, and check that it - # still succeeds. - if '(?u)' not in pattern: - with self.subTest('locale-sensitive match'): - obj = re.compile(pattern, re.LOCALE) - self.assertTrue(obj.search(s)) - # Try the match with UNICODE locale enabled, and check # that it still succeeds. with self.subTest('unicode-sensitive match'): |