diff options
Diffstat (limited to 'Lib/re/_compiler.py')
-rw-r--r-- | Lib/re/_compiler.py | 59 |
1 files changed, 2 insertions, 57 deletions
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 065f6fb..f621d04 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -13,6 +13,7 @@ import _sre from . import _parser from ._constants import * +from ._casefix import _EXTRA_CASES assert _sre.MAGIC == MAGIC, "SRE module mismatch" @@ -27,62 +28,6 @@ _REPEATING_CODES = { POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), } -# Sets of lowercase characters which have the same uppercase. -_equivalences = ( - # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I - (0x69, 0x131), # iı - # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S - (0x73, 0x17f), # sſ - # MICRO SIGN, GREEK SMALL LETTER MU - (0xb5, 0x3bc), # µμ - # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI - (0x345, 0x3b9, 0x1fbe), # \u0345ιι - # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA - (0x390, 0x1fd3), # ΐΐ - # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA - (0x3b0, 0x1fe3), # ΰΰ - # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL - (0x3b2, 0x3d0), # βϐ - # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL - (0x3b5, 0x3f5), # εϵ - # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL - (0x3b8, 0x3d1), # θϑ - # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL - (0x3ba, 0x3f0), # κϰ - # GREEK SMALL LETTER PI, GREEK PI SYMBOL - (0x3c0, 0x3d6), # πϖ - # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL - (0x3c1, 0x3f1), # ρϱ - # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA - (0x3c2, 0x3c3), # ςσ - # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL - (0x3c6, 0x3d5), # φϕ - # CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE - (0x432, 0x1c80), # вᲀ - # CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE - (0x434, 0x1c81), # дᲁ - # CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O - (0x43e, 0x1c82), # оᲂ - # CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES - (0x441, 
0x1c83), # сᲃ - # CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE - (0x442, 0x1c84, 0x1c85), # тᲄᲅ - # CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN - (0x44a, 0x1c86), # ъᲆ - # CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT - (0x463, 0x1c87), # ѣᲇ - # CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK - (0x1c88, 0xa64b), # ᲈꙋ - # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE - (0x1e61, 0x1e9b), # ṡẛ - # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST - (0xfb05, 0xfb06), # ﬅﬆ -) - -# Maps the lowercase code to lowercase codes which have the same uppercase. -_ignorecase_fixes = {i: tuple(j for j in t if i != j) - for t in _equivalences for i in t} - class _CompileData: __slots__ = ('code', 'repeat_count') def __init__(self): @@ -111,7 +56,7 @@ def _compile(data, pattern, flags): if flags & SRE_FLAG_UNICODE: iscased = _sre.unicode_iscased tolower = _sre.unicode_tolower - fixes = _ignorecase_fixes + fixes = _EXTRA_CASES else: iscased = _sre.ascii_iscased tolower = _sre.ascii_tolower |