summary | refs | log | tree | commit | diff | stats
path: root/Tools/scripts
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2022-04-22 18:37:46 (GMT)
committer: GitHub <noreply@github.com>, 2022-04-22 18:37:46 (GMT)
commit: f912cc0e413f667a8cc257a41775272bc641b0d8 (patch)
tree: af40aa0af1bd3e3669a05f3a909e227afaea2121 /Tools/scripts
parent: 48ec61a89a959071206549819448405c2cea61b0 (diff)
download: cpython-f912cc0e413f667a8cc257a41775272bc641b0d8.zip
cpython-f912cc0e413f667a8cc257a41775272bc641b0d8.tar.gz
cpython-f912cc0e413f667a8cc257a41775272bc641b0d8.tar.bz2
gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)
Also test that all extra cases are in BMP.
Diffstat (limited to 'Tools/scripts')
-rwxr-xr-x Tools/scripts/generate_re_casefix.py 95
1 files changed, 95 insertions, 0 deletions
diff --git a/Tools/scripts/generate_re_casefix.py b/Tools/scripts/generate_re_casefix.py
new file mode 100755
index 0000000..00b048b
--- /dev/null
+++ b/Tools/scripts/generate_re_casefix.py
@@ -0,0 +1,95 @@
+#! /usr/bin/env python3
+# This script generates Lib/re/_casefix.py.
+
+import collections
+import re
+import sys
+import unicodedata
+
def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that text.

    Returns False when the existing file, read as UTF-8, equals *content*
    (nothing is written), and True after the file has been (re)written.
    A file that cannot be opened or decoded is treated as differing.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            up_to_date = existing.read() == content
    except (OSError, ValueError):
        # Missing, unreadable or undecodable file: fall through and rewrite it.
        up_to_date = False

    if up_to_date:
        return False

    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
+
# Skeleton of the generated module.  The single %s slot receives the
# already-formatted dictionary entries, joined by newlines.
re_casefix_template = (
    "# Auto-generated by Tools/scripts/generate_re_casefix.py.\n"
    "\n"
    "# Maps the code of lowercased character to codes of different lowercased\n"
    "# characters which have the same uppercase.\n"
    "_EXTRA_CASES = {\n"
    "%s\n"
    "}\n"
)
+
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name in the Unicode database are rendered as
    ``U+XXXX`` (uppercase hex, at least four digits).
    """
    fallback = r'U+%04X' % i
    return unicodedata.name(chr(i), fallback)
+
class hexint(int):
    """An int whose repr is ``0x``-prefixed hex, zero-padded to six characters."""

    def __repr__(self):
        # '#06x': "0x" prefix, zero fill, minimum total width of 6.
        return format(int(self), '#06x')
+
def alpha(i):
    """Return a printable form of code point *i*.

    Alphabetic characters are returned as themselves; anything else is
    returned as its ``ascii()`` representation with the surrounding quotes
    stripped.
    """
    char = chr(i)
    if char.isalpha():
        return char
    quoted = ascii(char)
    return quoted[1:-1]
+
+
def main(outfile='Lib/re/_casefix.py'):
    """Build the extra-cases table and write it to *outfile*.

    Every code point up to ``sys.maxunicode`` is grouped by its uppercase
    form.  Groups whose members lowercase to more than one distinct code
    point yield table entries: each lowercase code maps to the other
    lowercase codes of its group.  If any such group contains a code above
    0xffff, the affected codes are listed on stderr and the process exits
    with status 1 instead of writing anything.
    """
    # Bucket every character under its uppercase form.
    by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        by_upper[char.upper()] += char

    # For each bucket holding several characters, collect the distinct codes
    # of their lowercase forms; keep the bucket only if more than one remains.
    lower_groups = []
    for chars in by_upper.values():
        if len(chars) <= 1:
            continue
        lowered = {ord(char.lower()) for char in chars}
        if len(lowered) > 1:
            lower_groups.append(sorted(lowered))

    # Collect the groups that contain a code above 0xffff.
    bad_codes = []
    for group in lower_groups:
        offender = next((code for code in group if code > 0xffff), None)
        if offender is None:
            continue
        bad_codes.extend(group)
        try:
            # Also report the shared uppercase when it is a single character.
            bad_codes.append(ord(chr(offender).upper()))
        except (ValueError, TypeError):
            pass

    # NOTE(review): leading whitespace inside the format strings below is
    # copied as displayed in this view, which appears to collapse runs of
    # spaces -- confirm against the repository copy.
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(code), code, uname(code)),
                  file=sys.stderr)
        sys.exit(1)

    # Each lowercase code maps to the remaining codes of its group.
    mapping = {}
    for group in lower_groups:
        for code in group:
            mapping[code] = tuple(other for other in group if other != code)

    # Two output lines per entry: a comment with the names, then the entry.
    lines = []
    for code, others in sorted(mapping.items()):
        other_names = ', '.join(uname(other) for other in others)
        lines.append(' # %s: %s' % (uname(code), other_names))

        other_codes = tuple(hexint(other) for other in others)
        other_chars = ''.join(alpha(other) for other in others)
        lines.append(" %r: %r, # '%s': '%s'" % (
            hexint(code), other_codes, alpha(code), other_chars))

    update_file(outfile, re_casefix_template % '\n'.join(lines))
+
+
if __name__ == '__main__':
    import sys
    # Forward command-line arguments positionally; the first one, when
    # given, replaces main()'s default output path.
    cli_args = sys.argv[1:]
    main(*cli_args)