summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2014-10-31 11:53:21 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2014-10-31 11:53:21 (GMT)
commite9e54ae2228e195e046e6325de018374b3247d2d (patch)
tree6264af62dab14d1e36f490004948748d4dace997 /Lib
parentc04fcd40bd8fd2c9e427faded617214f8ae18402 (diff)
downloadcpython-e9e54ae2228e195e046e6325de018374b3247d2d.zip
cpython-e9e54ae2228e195e046e6325de018374b3247d2d.tar.gz
cpython-e9e54ae2228e195e046e6325de018374b3247d2d.tar.bz2
Issue #17381: Fixed ranges handling in case-insensitive regular expressions.
Diffstat (limited to 'Lib')
-rw-r--r--Lib/sre_compile.py64
-rw-r--r--Lib/test/test_re.py37
2 files changed, 86 insertions, 15 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index e121845..febbde1 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -21,9 +21,6 @@ if _sre.CODESIZE == 2:
else:
MAXCODE = 0xFFFFFFFFL
-def _identityfunction(x):
- return x
-
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
@@ -52,7 +49,7 @@ def _compile(code, pattern, flags):
return _sre.getlower(literal, flags)
else:
emit(OPCODES[op])
- fixup = _identityfunction
+ fixup = None
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup)
code[skip] = _len(code) - skip
@@ -178,17 +175,15 @@ def _compile(code, pattern, flags):
def _compile_charset(charset, flags, code, fixup=None):
# compile charset subprogram
emit = code.append
- if fixup is None:
- fixup = _identityfunction
- for op, av in _optimize_charset(charset, fixup):
+ for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE):
emit(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
- emit(fixup(av))
+ emit(av)
elif op is RANGE:
- emit(fixup(av[0]))
- emit(fixup(av[1]))
+ emit(av[0])
+ emit(av[1])
elif op is CHARSET:
code.extend(av)
elif op is BIGCHARSET:
@@ -204,7 +199,7 @@ def _compile_charset(charset, flags, code, fixup=None):
raise error, "internal: unsupported set operator"
emit(OPCODES[FAILURE])
-def _optimize_charset(charset, fixup):
+def _optimize_charset(charset, fixup, isunicode):
# internal: optimize character set
out = []
tail = []
@@ -213,9 +208,15 @@ def _optimize_charset(charset, fixup):
while True:
try:
if op is LITERAL:
- charmap[fixup(av)] = 1
+ i = av
+ if fixup:
+ i = fixup(i)
+ charmap[i] = 1
elif op is RANGE:
- for i in range(fixup(av[0]), fixup(av[1])+1):
+ r = range(av[0], av[1]+1)
+ if fixup:
+ r = map(fixup, r)
+ for i in r:
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
@@ -227,7 +228,20 @@ def _optimize_charset(charset, fixup):
charmap += b'\0' * 0xff00
continue
# character set contains non-BMP character codes
- tail.append((op, av))
+ if fixup and isunicode and op is RANGE:
+ lo, hi = av
+ ranges = [av]
+ # There are only two ranges of cased astral characters:
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
+ _fixup_range(max(0x10000, lo), min(0x11fff, hi),
+ ranges, fixup)
+ for lo, hi in ranges:
+ if lo == hi:
+ tail.append((LITERAL, hi))
+ else:
+ tail.append((RANGE, (lo, hi)))
+ else:
+ tail.append((op, av))
break
# compress character map
@@ -253,8 +267,10 @@ def _optimize_charset(charset, fixup):
else:
out.append((RANGE, (p, q - 1)))
out += tail
- if len(out) < len(charset):
+ # if the case was changed or new representation is more compact
+ if fixup or len(out) < len(charset):
return out
+ # else original character set is good enough
return charset
# use bitmap
@@ -307,6 +323,24 @@ def _optimize_charset(charset, fixup):
out += tail
return out
+def _fixup_range(lo, hi, ranges, fixup):
+ for i in map(fixup, range(lo, hi+1)):
+ for k, (lo, hi) in enumerate(ranges):
+ if i < lo:
+ if l == lo - 1:
+ ranges[k] = (i, hi)
+ else:
+ ranges.insert(k, (i, i))
+ break
+ elif i > hi:
+ if i == hi + 1:
+ ranges[k] = (lo, i)
+ break
+ else:
+ break
+ else:
+ ranges.append((i, i))
+
_CODEBITS = _sre.CODESIZE * 8
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 7921c4a..7a25dbe 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -474,6 +474,43 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
+ def test_ignore_case_range(self):
+ # Issues #3511, #17381.
+ self.assertTrue(re.match(r'[9-a]', '_', re.I))
+ self.assertIsNone(re.match(r'[9-A]', '_', re.I))
+ self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
+ self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
+ self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
+ self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
+ if have_unicode:
+ self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
+ self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\xc0-\xde]'),
+ u(r'\xd7'), re.U | re.I))
+ self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
+ u(r'\xf7'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
+ u(r'\xf7'), re.U | re.I))
+ self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
+ u(r'\xd7'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
+ u(r'\u0450'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
+ u(r'\u0400'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
+ u(r'\u0450'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
+ u(r'\u0400'), re.U | re.I))
+ if sys.maxunicode > 0xffff:
+ self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
+ u(r'\U00010428'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
+ u(r'\U00010400'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
+ u(r'\U00010428'), re.U | re.I))
+ self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
+ u(r'\U00010400'), re.U | re.I))
+
def test_category(self):
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")