summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2014-10-31 10:36:56 (GMT)
committer: Serhiy Storchaka <storchaka@gmail.com> 2014-10-31 10:36:56 (GMT)
commit: 4b8f8949b43715f1b0f0ef77e15e19c180ccc195 (patch)
tree: bebc1eda94d11692278f03c41c683b5b8ca815dd /Lib
parent: 455de40a6e99ad7548e6061733f9c5dae2327e83 (diff)
download: cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.zip
cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.gz
cpython-4b8f8949b43715f1b0f0ef77e15e19c180ccc195.tar.bz2
Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
Added new opcode RANGE_IGNORE.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/sre_compile.py 35
-rw-r--r-- Lib/sre_constants.py 9
-rw-r--r-- Lib/test/test_re.py 19
3 files changed, 46 insertions, 17 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index d4d129b..1b3e9f8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -22,9 +22,6 @@ if _sre.CODESIZE == 2:
else:
MAXCODE = 0xFFFFFFFF
-def _identityfunction(x):
- return x
-
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
@@ -53,7 +50,7 @@ def _compile(code, pattern, flags):
return _sre.getlower(literal, flags)
else:
emit(OPCODES[op])
- fixup = _identityfunction
+ fixup = None
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup)
code[skip] = _len(code) - skip
@@ -172,17 +169,15 @@ def _compile(code, pattern, flags):
def _compile_charset(charset, flags, code, fixup=None):
# compile charset subprogram
emit = code.append
- if fixup is None:
- fixup = _identityfunction
for op, av in _optimize_charset(charset, fixup):
emit(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
- emit(fixup(av))
- elif op is RANGE:
- emit(fixup(av[0]))
- emit(fixup(av[1]))
+ emit(av)
+ elif op is RANGE or op is RANGE_IGNORE:
+ emit(av[0])
+ emit(av[1])
elif op is CHARSET:
code.extend(av)
elif op is BIGCHARSET:
@@ -207,9 +202,14 @@ def _optimize_charset(charset, fixup):
while True:
try:
if op is LITERAL:
- charmap[fixup(av)] = 1
+ if fixup:
+ av = fixup(av)
+ charmap[av] = 1
elif op is RANGE:
- for i in range(fixup(av[0]), fixup(av[1])+1):
+ r = range(av[0], av[1]+1)
+ if fixup:
+ r = map(fixup, r)
+ for i in r:
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
@@ -220,7 +220,12 @@ def _optimize_charset(charset, fixup):
# character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00
continue
- # character set contains non-BMP character codes
+ # Character set contains non-BMP character codes.
+ # There are only two ranges of cased non-BMP characters:
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+ # and for both ranges RANGE_IGNORE works.
+ if fixup and op is RANGE:
+ op = RANGE_IGNORE
tail.append((op, av))
break
@@ -247,8 +252,10 @@ def _optimize_charset(charset, fixup):
else:
out.append((RANGE, (p, q - 1)))
out += tail
- if len(out) < len(charset):
+ # if the case was changed or new representation is more compact
+ if fixup or len(out) < len(charset):
return out
+ # else original character set is good enough
return charset
# use bitmap
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 8815d1d..8296ecd 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20031017
+MAGIC = 20140917
from _sre import MAXREPEAT, MAXGROUPS
@@ -56,6 +56,7 @@ NEGATE = "negate"
NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore"
RANGE = "range"
+RANGE_IGNORE = "range_ignore"
REPEAT = "repeat"
REPEAT_ONE = "repeat_one"
SUBPATTERN = "subpattern"
@@ -121,7 +122,8 @@ OPCODES = [
REPEAT,
REPEAT_ONE,
SUBPATTERN,
- MIN_REPEAT_ONE
+ MIN_REPEAT_ONE,
+ RANGE_IGNORE,
]
@@ -159,7 +161,8 @@ OP_IGNORE = {
GROUPREF: GROUPREF_IGNORE,
IN: IN_IGNORE,
LITERAL: LITERAL_IGNORE,
- NOT_LITERAL: NOT_LITERAL_IGNORE
+ NOT_LITERAL: NOT_LITERAL_IGNORE,
+ RANGE: RANGE_IGNORE,
}
AT_MULTILINE = {
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 4029561..e09aa2b 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -601,6 +601,25 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
+ def test_ignore_case_range(self):
+ # Issues #3511, #17381.
+ self.assertTrue(re.match(r'[9-a]', '_', re.I))
+ self.assertIsNone(re.match(r'[9-A]', '_', re.I))
+ self.assertTrue(re.match(br'[9-a]', b'_', re.I))
+ self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
+ self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
+ self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
+ self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
+ self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
+ self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
+ self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
+ self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
+ self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
+ self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
+ self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
+ self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
+ self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
+
def test_category(self):
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")