summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Gustavo Niemeyer <gustavo@niemeyer.net> 2004-09-03 17:06:10 (GMT)
committer: Gustavo Niemeyer <gustavo@niemeyer.net> 2004-09-03 17:06:10 (GMT)
commit: a01a2ee933238dbd3e79bc3b07cfb703d40807a8 (patch)
tree: 4cdf29e0e0ffdb9eb72e2a4ab0ddd4656354fe19 /Lib
parent: ab9351bf369c6dfc4b29eff18236a1fcaefe94b6 (diff)
download: cpython-a01a2ee933238dbd3e79bc3b07cfb703d40807a8.zip
cpython-a01a2ee933238dbd3e79bc3b07cfb703d40807a8.tar.gz
cpython-a01a2ee933238dbd3e79bc3b07cfb703d40807a8.tar.bz2
Applying modified version of patch #1018386, which fixes
some escaping bugs in SRE.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/sre_parse.py | 78
-rw-r--r-- Lib/test/test_re.py | 53
2 files changed, 89 insertions, 42 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 5c4298a..3e27145 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -217,21 +217,11 @@ def isname(name):
# check that group name is a valid string
if not isident(name[0]):
return False
- for char in name:
+ for char in name[1:]:
if not isident(char) and not isdigit(char):
return False
return True
-def _group(escape, groups):
- # check if the escape string represents a valid group
- try:
- gid = int(escape[1:])
- if gid and gid < groups:
- return gid
- except ValueError:
- pass
- return None # not a valid group
-
def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
@@ -241,7 +231,8 @@ def _class_escape(source, escape):
if code:
return code
try:
- if escape[1:2] == "x":
+ c = escape[1:2]
+ if c == "x":
# hexadecimal escape (exactly two digits)
while source.next in HEXDIGITS and len(escape) < 4:
escape = escape + source.get()
@@ -249,12 +240,14 @@ def _class_escape(source, escape):
if len(escape) != 2:
raise error, "bogus escape: %s" % repr("\\" + escape)
return LITERAL, int(escape, 16) & 0xff
- elif escape[1:2] in OCTDIGITS:
+ elif c in OCTDIGITS:
# octal escape (up to three digits)
- while source.next in OCTDIGITS and len(escape) < 5:
+ while source.next in OCTDIGITS and len(escape) < 4:
escape = escape + source.get()
escape = escape[1:]
return LITERAL, int(escape, 8) & 0xff
+ elif c in DIGITS:
+ raise error, "bogus escape: %s" % repr(escape)
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
@@ -270,19 +263,20 @@ def _escape(source, escape, state):
if code:
return code
try:
- if escape[1:2] == "x":
+ c = escape[1:2]
+ if c == "x":
# hexadecimal escape
while source.next in HEXDIGITS and len(escape) < 4:
escape = escape + source.get()
if len(escape) != 4:
raise ValueError
return LITERAL, int(escape[2:], 16) & 0xff
- elif escape[1:2] == "0":
+ elif c == "0":
# octal escape
while source.next in OCTDIGITS and len(escape) < 4:
escape = escape + source.get()
return LITERAL, int(escape[1:], 8) & 0xff
- elif escape[1:2] in DIGITS:
+ elif c in DIGITS:
# octal escape *or* decimal group reference (sigh)
if source.next in DIGITS:
escape = escape + source.get()
@@ -291,9 +285,9 @@ def _escape(source, escape, state):
# got three octal digits; this is an octal escape
escape = escape + source.get()
return LITERAL, int(escape[1:], 8) & 0xff
- # got at least one decimal digit; this is a group reference
- group = _group(escape, state.groups)
- if group:
+ # not an octal escape, so this is a group reference
+ group = int(escape[1:])
+ if group < state.groups:
if not state.checkgroup(group):
raise error, "cannot refer to open group"
return GROUPREF, group
@@ -709,7 +703,8 @@ def parse_template(source, pattern):
break # end of replacement string
if this and this[0] == "\\":
# group
- if this == "\\g":
+ c = this[1:2]
+ if c == "g":
name = ""
if s.match("<"):
while 1:
@@ -723,6 +718,8 @@ def parse_template(source, pattern):
raise error, "bad group name"
try:
index = int(name)
+ if index < 0:
+ raise error, "negative group number"
except ValueError:
if not isname(name):
raise error, "bad character in group name"
@@ -731,26 +728,23 @@ def parse_template(source, pattern):
except KeyError:
raise IndexError, "unknown group name"
a((MARK, index))
- elif len(this) > 1 and this[1] in DIGITS:
- code = None
- while 1:
- group = _group(this, pattern.groups+1)
- if group:
- if (s.next not in DIGITS or
- not _group(this + s.next, pattern.groups+1)):
- code = MARK, group
- break
- elif s.next in OCTDIGITS:
+ elif c == "0":
+ if s.next in OCTDIGITS:
+ this = this + sget()
+ if s.next in OCTDIGITS:
this = this + sget()
- else:
- break
- if not code:
- this = this[1:]
- code = LITERAL, makechar(int(this[-6:], 8) & 0xff)
- if code[0] is LITERAL:
- literal(code[1])
- else:
- a(code)
+ literal(makechar(int(this[1:], 8) & 0xff))
+ elif c in DIGITS:
+ isoctal = False
+ if s.next in DIGITS:
+ this = this + sget()
+ if (c in OCTDIGITS and s.next in OCTDIGITS and
+ this[2] in OCTDIGITS):
+ this = this + sget()
+ isoctal = True
+ literal(makechar(int(this[1:], 8) & 0xff))
+ if not isoctal:
+ a((MARK, int(this[1:])))
else:
try:
this = makechar(ESCAPES[this][1])
@@ -782,7 +776,7 @@ def expand_template(template, match):
for index, group in groups:
literals[index] = s = g(group)
if s is None:
- raise IndexError
+ raise error, "unmatched group"
except IndexError:
- raise error, "empty group"
+ raise error, "invalid group reference"
return sep.join(literals)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index c7afdc5..8f66ae9 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -83,6 +83,48 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
'abc\ndef\n')
+ def test_sub_template_numeric_escape(self):
+ # bug 776311 and friends
+ self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
+ self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
+ self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
+ self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
+ self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
+ self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
+ self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
+
+ self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
+ self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
+
+ self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
+ self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
+ self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
+ self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
+ self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
+
+ self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
+ self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
+
+ self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
+ self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
+ self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
+ self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
+
+ # in python2.3 (etc), these loop endlessly in sre_parse.py
+ self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
+ self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
+ 'xz8')
+ self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
+ 'xza')
+
def test_qualified_re_sub(self):
self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
@@ -105,6 +147,7 @@ class ReTests(unittest.TestCase):
self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
+ self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -386,6 +429,16 @@ class ReTests(unittest.TestCase):
self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
self.assertRaises(re.error, re.match, "\911", "")
+ def test_sre_character_class_literals(self):
+ for i in [0, 8, 16, 32, 64, 127, 128, 255]:
+ self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
+ self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
+ self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
+ self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
+ self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
+ self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
+ self.assertRaises(re.error, re.match, "[\911]", "")
+
def test_bug_113254(self):
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)