summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/library/re.rst 2
-rw-r--r-- Lib/test/test_re.py 34
-rw-r--r-- Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst 4
-rw-r--r-- Modules/_sre.c 2
-rw-r--r-- Modules/sre_lib.h 8
5 files changed, 49 insertions, 1 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index ac6455a..b512830 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -371,6 +371,8 @@ The special characters are:
``(?#...)``
A comment; the contents of the parentheses are simply ignored.
+.. index:: single: (?=; in regular expressions
+
``(?=...)``
Matches if ``...`` matches next, but doesn't consume any of the string. This is
called a :dfn:`lookahead assertion`. For example, ``Isaac (?=Asimov)`` will match
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index ab1d985..797d85d 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2067,6 +2067,40 @@ ELSE
self.assertEqual(m.group(), b'xyz')
self.assertEqual(m2.group(), b'')
+ def test_bug_34294(self):
+ # Issue 34294: wrong capturing groups
+
+ # exists since Python 2
+ s = "a\tx"
+ p = r"\b(?=(\t)|(x))x"
+ self.assertEqual(re.search(p, s).groups(), (None, 'x'))
+
+ # introduced in Python 3.7.0
+ s = "ab"
+ p = r"(?=(.)(.)?)"
+ self.assertEqual(re.findall(p, s),
+ [('a', 'b'), ('b', '')])
+ self.assertEqual([m.groups() for m in re.finditer(p, s)],
+ [('a', 'b'), ('b', None)])
+
+ # test-cases provided by issue34294, introduced in Python 3.7.0
+ p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
+ s = "<test><foo2/></test>"
+ self.assertEqual(re.findall(p, s),
+ [('test', '<foo2/>'), ('foo2', '')])
+ self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+ [{'tag': 'test', 'text': '<foo2/>'},
+ {'tag': 'foo2', 'text': None}])
+ s = "<test>Hello</test><foo/>"
+ self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+ [{'tag': 'test', 'text': 'Hello'},
+ {'tag': 'foo', 'text': None}])
+ s = "<test>Hello</test><foo/><foo/>"
+ self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+ [{'tag': 'test', 'text': 'Hello'},
+ {'tag': 'foo', 'text': None},
+ {'tag': 'foo', 'text': None}])
+
class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):
diff --git a/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst b/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
new file mode 100644
index 0000000..e1ae2ea
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
@@ -0,0 +1,4 @@
+Fix wrong capturing groups in rare cases in the :mod:`re` module.
+:func:`re.search`, :func:`re.findall`, :func:`re.sub` and other functions
+that scan through a string looking for a match now reset capturing groups
+between two match attempts. Patch by Ma Lin.
\ No newline at end of file
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 75f030c..21c41b5 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -340,7 +340,7 @@ _sre_unicode_tolower_impl(PyObject *module, int character)
LOCAL(void)
state_reset(SRE_STATE* state)
{
- /* FIXME: dynamic! */
+ /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
/*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
state->lastmark = -1;
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 44948e2..437ab43 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -1363,6 +1363,10 @@ exit:
return ret; /* should never get here */
}
+/* Capturing groups must be reset between two successive SRE(match) calls in loops. */
+#define RESET_CAPTURE_GROUP() \
+ do { state->lastmark = state->lastindex = -1; } while (0)
+
LOCAL(Py_ssize_t)
SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
{
@@ -1440,6 +1444,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
return status;
++ptr;
+ RESET_CAPTURE_GROUP();
}
return 0;
}
@@ -1487,6 +1492,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
/* close but no cigar -- try again */
if (++ptr >= end)
return 0;
+ RESET_CAPTURE_GROUP();
}
i = overlap[i];
} while (i != 0);
@@ -1510,6 +1516,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0)
break;
ptr++;
+ RESET_CAPTURE_GROUP();
}
} else {
/* general case */
@@ -1520,6 +1527,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
state->must_advance = 0;
while (status == 0 && ptr < end) {
ptr++;
+ RESET_CAPTURE_GROUP();
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
state->start = state->ptr = ptr;
status = SRE(match)(state, pattern, 0);