diff options
Diffstat (limited to 'Tools/scripts/parse_html5_entities.py')
-rwxr-xr-x | Tools/scripts/parse_html5_entities.py | 114 |
1 files changed, 0 insertions, 114 deletions
#!/usr/bin/env python3
# Recovered from the deletion diff of Tools/scripts/parse_html5_entities.py
# (whitespace was collapsed in the diff view; the layout is reconstructed).
"""
Utility for parsing HTML5 entity definitions available from:

    https://html.spec.whatwg.org/entities.json
    https://html.spec.whatwg.org/multipage/named-characters.html

The page now contains the following note:

    "This list is static and will not be expanded or changed in the future."

Written by Ezio Melotti and Iuliia Proskurnia.
"""

import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5

PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
HTML5_SECTION_START = '# HTML5 named character references'
# File rewritten by --patch (path is relative to the current directory).
ENTITIES_FILE = 'Lib/html/entities.py'


def get_json(url):
    """Download the json file from the url and return a decoded object."""
    with urlopen(url) as f:
        data = f.read().decode('utf-8')
    return json.loads(data)


def create_dict(entities):
    """Create the html5 dict from the decoded json object.

    *entities* maps names such as '&amp;' to dicts that have a
    'characters' key.  The leading '&' is stripped from every name.
    """
    return {name.lstrip('&'): value['characters']
            for name, value in entities.items()}


def compare_dicts(old, new):
    """Compare the old and new dicts and print the differences."""
    added = new.keys() - old.keys()
    if added:
        print('{} entitie(s) have been added:'.format(len(added)))
        for name in sorted(added):
            print(' {!r}: {!r}'.format(name, new[name]))
    removed = old.keys() - new.keys()
    if removed:
        print('{} entitie(s) have been removed:'.format(len(removed)))
        for name in sorted(removed):
            print(' {!r}: {!r}'.format(name, old[name]))
    # Keys present on both sides whose value differs.
    changed = {(name, old[name], new[name])
               for name in (old.keys() & new.keys())
               if old[name] != new[name]}
    if changed:
        print('{} entitie(s) have been modified:'.format(len(changed)))
        for item in sorted(changed):
            print(' {!r}: {!r} -> {!r}'.format(*item))


def write_items(entities, file=None):
    """Write the items of the dictionary in the specified file.

    *file* defaults to None, which print() resolves to the current
    sys.stdout at call time, so later redirections of sys.stdout are
    honoured (a ``file=sys.stdout`` default is frozen at definition time).
    """
    # The keys in the generated dictionary should be sorted
    # in a case-insensitive way, however, when two keys are equal,
    # the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # To do this we first sort in a case-sensitive way (so all the
    # uppercase chars come first) and then sort with key=str.lower.
    # Since the sorting is stable the uppercase keys will eventually
    # be before their equivalent lowercase version.
    keys = sorted(entities.keys())
    keys = sorted(keys, key=str.lower)
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {sys.argv[0]!r}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        # !a keeps the generated source ASCII-only.
        # NOTE(review): the 4-space indent is reconstructed (the diff view
        # collapsed runs of spaces) -- confirm against Lib/html/entities.py.
        print(f'    {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)


def _patch_file(fname, entities):
    """Replace the html5 section of *fname* with freshly generated items.

    The section runs from the line starting with HTML5_SECTION_START up to
    and including the next line starting with '}'.  The result is written
    to a temporary sibling file and moved over *fname* with os.replace(),
    so the target never goes missing.  Return True if the section marker
    was found; otherwise *fname* is left untouched and False is returned.
    """
    temp_fname = fname + '.temp'
    found = False
    skip = False
    try:
        with open(fname, encoding='utf-8') as src, \
                open(temp_fname, 'w', encoding='utf-8') as dst:
            for line in src:
                if line.startswith(HTML5_SECTION_START):
                    write_items(entities, file=dst)
                    found = True
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                dst.write(line)
        if found:
            os.replace(temp_fname, fname)
    finally:
        # Only still present when nothing was replaced or an error occurred.
        if os.path.exists(temp_fname):
            os.remove(temp_fname)
    return found


def _main(argv):
    """Run the command-line interface.

    without args print a diff between html.entities.html5 and new_html5
    with --create print the new html5 dict
    with --patch patch the Lib/html/entities.py file
    """
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in argv:
        write_items(new_html5)
    elif '--patch' in argv:
        if not _patch_file(ENTITIES_FILE, new_html5):
            sys.exit(f'{ENTITIES_FILE}: {HTML5_SECTION_START!r} marker '
                     f'not found; file left unchanged.')
    elif html5 == new_html5:
        print('The current dictionary is updated.')
    else:
        compare_dicts(html5, new_html5)
        print('Run "./python {0} --patch" to update {1} '
              'or "./python {0} --create" to see the generated '
              'dictionary.'.format(__file__, ENTITIES_FILE))


if __name__ == '__main__':
    _main(sys.argv)