summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/whatsnew/3.5.rst 6
-rw-r--r-- Lib/sre_compile.py 6
-rw-r--r-- Lib/sre_constants.py 2
-rw-r--r-- Lib/sre_parse.py 10
-rw-r--r-- Lib/test/test_re.py 18
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Modules/_sre.c 51
-rw-r--r-- Modules/sre.h 7
8 files changed, 76 insertions, 27 deletions
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
index 5c2be47..7fb0fd5 100644
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -217,6 +217,12 @@ os
* :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes`
attribute on Windows (contributed by Ben Hoyt in :issue:`21719`).
+re
+--
+
+* The number of capturing groups in a regular expression is no longer limited
+ to 100. (Contributed by Serhiy Storchaka in :issue:`22437`.)
+
shutil
------
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index c6860b5..d4d129b 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -470,12 +470,6 @@ def compile(p, flags=0):
# print code
- # XXX: <fl> get rid of this limitation!
- if p.pattern.groups > 100:
- raise AssertionError(
- "sorry, but this version only supports 100 named groups"
- )
-
# map in either direction
groupindex = p.pattern.groupdict
indexgroup = [None] * p.pattern.groups
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 23e3516..8815d1d 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -15,7 +15,7 @@
MAGIC = 20031017
-from _sre import MAXREPEAT
+from _sre import MAXREPEAT, MAXGROUPS
# SRE standard exception (access as sre.error)
# should this really be here?
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 7fd145b..b9a1852 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -72,6 +72,8 @@ class Pattern:
def opengroup(self, name=None):
gid = self.groups
self.groups = gid + 1
+ if self.groups > MAXGROUPS:
+ raise error("groups number is too large")
if name is not None:
ogid = self.groupdict.get(name, None)
if ogid is not None:
@@ -695,8 +697,14 @@ def _parse(source, state):
else:
try:
condgroup = int(condname)
+ if condgroup < 0:
+ raise ValueError
except ValueError:
raise error("bad character in group name")
+ if not condgroup:
+ raise error("bad group number")
+ if condgroup >= MAXGROUPS:
+ raise error("the group number is too large")
else:
# flags
if not source.next in FLAGS:
@@ -822,6 +830,8 @@ def parse_template(source, pattern):
index = int(name)
if index < 0:
raise error("negative group number")
+ if index >= MAXGROUPS:
+ raise error("the group number is too large")
except ValueError:
if not name.isidentifier():
raise error("bad character in group name")
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index d85b767..e5ad6cb 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -193,6 +193,7 @@ class ReTests(unittest.TestCase):
def test_symbolic_groups(self):
re.compile('(?P<a>x)(?P=a)(?(a)y)')
re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
+ re.compile('(?P<a1>x)\1(?(1)y)')
self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
self.assertRaises(re.error, re.compile, '(?Px)')
self.assertRaises(re.error, re.compile, '(?P=)')
@@ -212,6 +213,10 @@ class ReTests(unittest.TestCase):
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
self.assertRaises(re.error, re.compile, '(?P<©>x)')
+ # Support > 100 groups.
+ pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
+ pat = '(?:%s)(?(200)z|t)' % pat
+ self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_symbolic_refs(self):
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
@@ -228,6 +233,9 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
+ # Support > 100 groups.
+ pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
+ self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -404,6 +412,10 @@ class ReTests(unittest.TestCase):
self.assertIsNone(p.match('abd'))
self.assertIsNone(p.match('ac'))
+ # Support > 100 groups.
+ pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
+ pat = '(?:%s)(?(200)z)' % pat
+ self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_re_groupref(self):
self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
@@ -1070,8 +1082,10 @@ class ReTests(unittest.TestCase):
# a RuntimeError is raised instead of OverflowError.
long_overflow = 2**128
self.assertRaises(TypeError, re.finditer, "a", {})
- self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
- self.assertRaises(TypeError, _sre.compile, {}, 0, [])
+ with self.assertRaises(OverflowError):
+ _sre.compile("abc", 0, [long_overflow], 0, [], [])
+ with self.assertRaises(TypeError):
+ _sre.compile({}, 0, [], 0, [], [])
def test_search_dot_unicode(self):
self.assertTrue(re.search("123.*-", '123abc-'))
diff --git a/Misc/NEWS b/Misc/NEWS
index 63942a9..77a2725 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -145,6 +145,9 @@ Core and Builtins
Library
-------
+- Issue #22437: The number of capturing groups in a regular expression is no
+ longer limited to 100.
+
- Issue #17442: InteractiveInterpreter now displays the full chained traceback
in its showtraceback method, to match the built in interactive interpreter.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 13479ba..5c3d105 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -357,6 +357,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
memset(state, 0, sizeof(SRE_STATE));
+ state->mark = PyMem_New(void *, pattern->groups * 2);
+ if (!state->mark) {
+ PyErr_NoMemory();
+ goto err;
+ }
state->lastmark = -1;
state->lastindex = -1;
@@ -409,6 +414,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
return string;
err:
+ PyMem_Del(state->mark);
+ state->mark = NULL;
if (state->buffer.buf)
PyBuffer_Release(&state->buffer);
return NULL;
@@ -421,6 +428,8 @@ state_fini(SRE_STATE* state)
PyBuffer_Release(&state->buffer);
Py_XDECREF(state->string);
data_stack_dealloc(state);
+ PyMem_Del(state->mark);
+ state->mark = NULL;
}
/* calculate offset from start of string */
@@ -560,6 +569,7 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
PyObject *pattern = NULL;
SRE_STATE state;
Py_ssize_t status;
+ PyObject *match;
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"|Onn$O:match", _keywords,
@@ -579,12 +589,14 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
status = sre_match(&state, PatternObject_GetCode(self), 0);
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
- if (PyErr_Occurred())
+ if (PyErr_Occurred()) {
+ state_fini(&state);
return NULL;
+ }
+ match = pattern_new_match(self, &state, status);
state_fini(&state);
-
- return (PyObject *)pattern_new_match(self, &state, status);
+ return match;
}
static PyObject*
@@ -592,6 +604,7 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
{
SRE_STATE state;
Py_ssize_t status;
+ PyObject *match;
PyObject *string = NULL, *string2 = NULL;
Py_ssize_t start = 0;
@@ -616,12 +629,14 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
status = sre_match(&state, PatternObject_GetCode(self), 1);
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
- if (PyErr_Occurred())
+ if (PyErr_Occurred()) {
+ state_fini(&state);
return NULL;
+ }
+ match = pattern_new_match(self, &state, status);
state_fini(&state);
-
- return pattern_new_match(self, &state, status);
+ return match;
}
static PyObject*
@@ -629,6 +644,7 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
{
SRE_STATE state;
Py_ssize_t status;
+ PyObject *match;
PyObject *string = NULL, *string2 = NULL;
Py_ssize_t start = 0;
@@ -652,12 +668,14 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
- state_fini(&state);
-
- if (PyErr_Occurred())
+ if (PyErr_Occurred()) {
+ state_fini(&state);
return NULL;
+ }
- return pattern_new_match(self, &state, status);
+ match = pattern_new_match(self, &state, status);
+ state_fini(&state);
+ return match;
}
static PyObject*
@@ -1417,7 +1435,7 @@ _compile(PyObject* self_, PyObject* args)
PyObject* groupindex = NULL;
PyObject* indexgroup = NULL;
- if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
+ if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags,
&PyList_Type, &code, &groups,
&groupindex, &indexgroup))
return NULL;
@@ -1933,10 +1951,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
static int
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{
- if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
+ if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
+ code >= end || end[-1] != SRE_OP_SUCCESS)
FAIL;
- if (groups == 0) /* fix for simplejson */
- groups = 100; /* 100 groups should always be safe */
return _validate_inner(code, end-1, groups);
}
@@ -2747,6 +2764,12 @@ PyMODINIT_FUNC PyInit__sre(void)
Py_DECREF(x);
}
+ x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
+ if (x) {
+ PyDict_SetItemString(d, "MAXGROUPS", x);
+ Py_DECREF(x);
+ }
+
x = PyUnicode_FromString(copyright);
if (x) {
PyDict_SetItemString(d, "copyright", x);
diff --git a/Modules/sre.h b/Modules/sre.h
index 42fe28d..35d198f 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -18,8 +18,10 @@
#define SRE_CODE Py_UCS4
#if SIZEOF_SIZE_T > 4
# define SRE_MAXREPEAT (~(SRE_CODE)0)
+# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
#else
# define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
+# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
#endif
typedef struct {
@@ -52,9 +54,6 @@ typedef struct {
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
-/* FIXME: <fl> shouldn't be a constant, really... */
-#define SRE_MARK_SIZE 200
-
typedef struct SRE_REPEAT_T {
Py_ssize_t count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */
@@ -76,7 +75,7 @@ typedef struct {
/* registers */
Py_ssize_t lastindex;
Py_ssize_t lastmark;
- void* mark[SRE_MARK_SIZE];
+ void** mark;
/* dynamically allocated stuff */
char* data_stack;
size_t data_stack_size;