summary | refs | log | tree | commit | diff | stats
path: root/Lib/re/_compiler.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/re/_compiler.py')
-rw-r--r-- Lib/re/_compiler.py 59
1 files changed, 2 insertions, 57 deletions
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index 065f6fb..f621d04 100644
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@@ -13,6 +13,7 @@
import _sre
from . import _parser
from ._constants import *
+from ._casefix import _EXTRA_CASES
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@@ -27,62 +28,6 @@ _REPEATING_CODES = {
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
-# Sets of lowercase characters which have the same uppercase.
-_equivalences = (
- # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
- (0x69, 0x131), # iı
- # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
- (0x73, 0x17f), # sſ
- # MICRO SIGN, GREEK SMALL LETTER MU
- (0xb5, 0x3bc), # µμ
- # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
- (0x345, 0x3b9, 0x1fbe), # \u0345ιι
- # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
- (0x390, 0x1fd3), # ΐΐ
- # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
- (0x3b0, 0x1fe3), # ΰΰ
- # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
- (0x3b2, 0x3d0), # βϐ
- # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
- (0x3b5, 0x3f5), # εϵ
- # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
- (0x3b8, 0x3d1), # θϑ
- # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
- (0x3ba, 0x3f0), # κϰ
- # GREEK SMALL LETTER PI, GREEK PI SYMBOL
- (0x3c0, 0x3d6), # πϖ
- # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
- (0x3c1, 0x3f1), # ρϱ
- # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
- (0x3c2, 0x3c3), # ςσ
- # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
- (0x3c6, 0x3d5), # φϕ
- # CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE
- (0x432, 0x1c80), # вᲀ
- # CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE
- (0x434, 0x1c81), # дᲁ
- # CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O
- (0x43e, 0x1c82), # оᲂ
- # CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES
- (0x441, 0x1c83), # сᲃ
- # CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
- (0x442, 0x1c84, 0x1c85), # тᲄᲅ
- # CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN
- (0x44a, 0x1c86), # ъᲆ
- # CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT
- (0x463, 0x1c87), # ѣᲇ
- # CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK
- (0x1c88, 0xa64b), # ᲈꙋ
- # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
- (0x1e61, 0x1e9b), # ṡẛ
- # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
- (0xfb05, 0xfb06), # ﬅﬆ
-)
-
-# Maps the lowercase code to lowercase codes which have the same uppercase.
-_ignorecase_fixes = {i: tuple(j for j in t if i != j)
- for t in _equivalences for i in t}
-
class _CompileData:
__slots__ = ('code', 'repeat_count')
def __init__(self):
@@ -111,7 +56,7 @@ def _compile(data, pattern, flags):
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
- fixes = _ignorecase_fixes
+ fixes = _EXTRA_CASES
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower