"""Generate 10,000 unique examples for the Levenshtein short-circuit tests.""" import argparse from functools import cache import json import os.path from random import choices, randrange # This should be in sync with Lib/traceback.py. It's not importing those values # because this script is being executed by PYTHON_FOR_REGEN and not by the in-tree # build of Python. _MOVE_COST = 2 _CASE_COST = 1 def _substitution_cost(ch_a, ch_b): if ch_a == ch_b: return 0 if ch_a.lower() == ch_b.lower(): return _CASE_COST return _MOVE_COST @cache def levenshtein(a, b): if not a or not b: return (len(a) + len(b)) * _MOVE_COST option1 = levenshtein(a[:-1], b[:-1]) + _substitution_cost(a[-1], b[-1]) option2 = levenshtein(a[:-1], b) + _MOVE_COST option3 = levenshtein(a, b[:-1]) + _MOVE_COST return min(option1, option2, option3) def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('output_path', metavar='FILE', type=str) parser.add_argument('--overwrite', dest='overwrite', action='store_const', const=True, default=False, help='overwrite an existing test file') args = parser.parse_args() output_path = os.path.realpath(args.output_path) if not args.overwrite and os.path.isfile(output_path): print(f"{output_path} already exists, skipping regeneration.") print( "To force, add --overwrite to the invocation of this tool or" " delete the existing file." ) return examples = set() # Create a lot of non-empty examples, which should end up with a Gauss-like # distribution for even costs (moves) and odd costs (case substitutions). while len(examples) < 9990: a = ''.join(choices("abcABC", k=randrange(1, 10))) b = ''.join(choices("abcABC", k=randrange(1, 10))) expected = levenshtein(a, b) examples.add((a, b, expected)) # Create one empty case each for strings between 0 and 9 in length. for i in range(10): b = ''.join(choices("abcABC", k=i)) expected = levenshtein("", b) examples.add(("", b, expected)) with open(output_path, "w") as f: json.dump(sorted(examples), f, indent=2) if __name__ == "__main__": main()