diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2022-04-22 18:37:46 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-22 18:37:46 (GMT) |
commit | f912cc0e413f667a8cc257a41775272bc641b0d8 (patch) | |
tree | af40aa0af1bd3e3669a05f3a909e227afaea2121 /Tools/scripts | |
parent | 48ec61a89a959071206549819448405c2cea61b0 (diff) | |
download | cpython-f912cc0e413f667a8cc257a41775272bc641b0d8.zip cpython-f912cc0e413f667a8cc257a41775272bc641b0d8.tar.gz cpython-f912cc0e413f667a8cc257a41775272bc641b0d8.tar.bz2 |
gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)
Also test that all extra cases are in BMP.
Diffstat (limited to 'Tools/scripts')
-rwxr-xr-x | Tools/scripts/generate_re_casefix.py | 95 |
1 files changed, 95 insertions, 0 deletions
#! /usr/bin/env python3
# This script generates Lib/re/_casefix.py.

import collections
import re
import sys
import unicodedata


def update_file(file, content):
    """Write *content* to *file* unless the file already holds it.

    Returns False when the existing file content equals *content* (the file
    is left untouched), and True after the file has been (re)written.
    """
    try:
        with open(file, 'r', encoding='utf-8') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        # The file is missing, unreadable or not valid UTF-8
        # (UnicodeDecodeError is a ValueError): fall through and rewrite it.
        pass
    with open(file, 'w', encoding='utf-8') as fobj:
        fobj.write(content)
    return True


# Skeleton of the generated module; %s is replaced by the dict entries.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""


def uname(i):
    """Return the Unicode name of code point *i*, or 'U+XXXX' if unnamed."""
    return unicodedata.name(chr(i), 'U+%04X' % i)


class hexint(int):
    """An int whose repr() is zero-padded hexadecimal, e.g. 0x0069."""

    def __repr__(self):
        # Width 6 includes the '0x' prefix, i.e. at least four hex digits.
        return '%#06x' % self


def alpha(i):
    """Return a printable form of code point *i* for use in comments.

    Alphabetic characters are returned as is; anything else is returned as
    its ascii() escape without the surrounding quotes.
    """
    c = chr(i)
    return c if c.isalpha() else ascii(c)[1:-1]


def main(outfile='Lib/re/_casefix.py'):
    """Generate the _EXTRA_CASES table and write it to *outfile*.

    Scans every code point up to sys.maxunicode, finds lowercase characters
    that differ from each other but share the same uppercase, and writes a
    mapping from each such code to the codes of its siblings.  If any of
    the involved codes is above U+FFFF, the offending characters are
    reported on stderr and the script exits with status 1 without writing
    the file.
    """
    # Group all characters by their uppercase form.
    same_upper = collections.defaultdict(list)
    for c in map(chr, range(sys.maxunicode + 1)):
        same_upper[c.upper()].append(c)

    # List of codes of lowercased characters which have the same uppercase.
    # Only groups with more than one distinct lowercase code are of interest.
    equivalent_lower_codes = []
    for chars in same_upper.values():
        if len(chars) < 2:
            continue
        lower_codes = {ord(c.lower()) for c in chars}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Collect every code from groups that contain a code above U+FFFF.
    # A set is used so that each code is reported only once.
    bad_codes = set()
    for group in equivalent_lower_codes:
        for i in group:
            if i > 0xffff:
                bad_codes.update(group)
                try:
                    bad_codes.add(ord(chr(i).upper()))
                except (ValueError, TypeError):
                    # The uppercase is not a single character, so it has
                    # no single code to report.
                    pass
                break
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for i in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
                  file=sys.stderr)
        sys.exit(1)

    # Map each code to the other codes of its group.
    mapping = {i: tuple(j for j in group if i != j)
               for group in equivalent_lower_codes
               for i in group}

    # Two lines per entry: a comment with the character names, then the
    # dict item itself followed by the characters in printable form.
    items = []
    for i, others in sorted(mapping.items()):
        items.append('    # %s: %s' % (
            uname(i),
            ', '.join(map(uname, others)),
        ))
        items.append("    %r: %r, # '%s': '%s'" % (
            hexint(i),
            tuple(map(hexint, others)),
            alpha(i),
            ''.join(map(alpha, others)),
        ))

    update_file(outfile, re_casefix_template % '\n'.join(items))


if __name__ == '__main__':
    main(*sys.argv[1:])