summary refs log tree commit diff stats
path: root/Tools/scripts/parse_html5_entities.py
diff options
context:
space:
mode:
Diffstat (limited to 'Tools/scripts/parse_html5_entities.py')
-rwxr-xr-x Tools/scripts/parse_html5_entities.py 114
1 files changed, 0 insertions, 114 deletions
diff --git a/Tools/scripts/parse_html5_entities.py b/Tools/scripts/parse_html5_entities.py
deleted file mode 100755
index 1e5bdad..0000000
--- a/Tools/scripts/parse_html5_entities.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-"""
-Utility for parsing HTML5 entity definitions available from:
-
- https://html.spec.whatwg.org/entities.json
- https://html.spec.whatwg.org/multipage/named-characters.html
-
-The page now contains the following note:
-
- "This list is static and will not be expanded or changed in the future."
-
-Written by Ezio Melotti and Iuliia Proskurnia.
-"""
-
-import os
-import sys
-import json
-from urllib.request import urlopen
-from html.entities import html5
-
-PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
-ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
-HTML5_SECTION_START = '# HTML5 named character references'
-
def get_json(url):
    """Fetch *url* and return its body parsed as JSON.

    The response bytes are decoded as UTF-8 before being handed to the
    JSON parser.
    """
    with urlopen(url) as response:
        payload = response.read()
    text = payload.decode('utf-8')
    return json.loads(text)
-
def create_dict(entities):
    """Build the html5 mapping from the decoded entities JSON object.

    Every key of *entities* has its leading '&' characters stripped and
    is mapped to the 'characters' field of the corresponding value.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
-
def compare_dicts(old, new):
    """Print a report of how *new* differs from *old*.

    Up to three sections are printed, each only when it is non-empty:
    names found only in *new*, names found only in *old*, and names
    found in both whose values differ.  Entries inside a section are
    listed in sorted order.
    """
    old_names = old.keys()
    new_names = new.keys()

    only_in_new = sorted(new_names - old_names)
    if only_in_new:
        print('{} entitie(s) have been added:'.format(len(only_in_new)))
        for entity in only_in_new:
            print(' {!r}: {!r}'.format(entity, new[entity]))

    only_in_old = sorted(old_names - new_names)
    if only_in_old:
        print('{} entitie(s) have been removed:'.format(len(only_in_old)))
        for entity in only_in_old:
            print(' {!r}: {!r}'.format(entity, old[entity]))

    # (name, old value, new value) for every shared name whose value differs.
    differing = {
        (entity, old[entity], new[entity])
        for entity in old_names & new_names
        if old[entity] != new[entity]
    }
    if differing:
        print('{} entitie(s) have been modified:'.format(len(differing)))
        for entity, before, after in sorted(differing):
            print(' {!r}: {!r} -> {!r}'.format(entity, before, after))
-
def write_items(entities, file=None):
    """Write *entities* to *file* as the source code of the html5 dict.

    The output starts with the HTML5_SECTION_START marker line, followed
    by a comment header naming the generating script and the source
    URLs, and then an ``html5 = {...}`` literal with one entry per line.
    Names are written with repr() and values with ascii().

    *file* is any object accepted by print(); when it is None (the
    default) the output goes to whatever sys.stdout is at call time.
    """
    # The keys are ordered case-insensitively; when two keys differ only
    # by case the uppercase version comes first, so that the result looks
    # like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Using the key itself as a tie-breaker gives that order, because
    # uppercase characters sort before lowercase ones.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {sys.argv[0]!r}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        print(f' {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)
-
-
def _patch_entities_file(fname, entities):
    """Rewrite the html5 section of the file *fname* from *entities*.

    Lines are copied unchanged until one starts with HTML5_SECTION_START.
    At that point the section generated by write_items() is emitted, and
    the old lines up to and including the first line starting with '}'
    are dropped.  The result is written to a temporary sibling file that
    then replaces *fname*.
    """
    temp_fname = fname + '.temp'
    # The target is Python source code, so read and write it as UTF-8
    # rather than with the locale-dependent default encoding.
    with open(fname, encoding='utf-8') as source, \
            open(temp_fname, 'w', encoding='utf-8') as target:
        skip = False
        for line in source:
            if line.startswith(HTML5_SECTION_START):
                write_items(entities, file=target)
                skip = True
                continue
            if skip:
                # skip the old items until the }
                if line.startswith('}'):
                    skip = False
                continue
            target.write(line)
    # os.replace() overwrites the destination in a single step, so the
    # target file is never missing between a remove and a rename.
    os.replace(temp_fname, fname)


def main():
    """Download the entity definitions and act on the command line.

    without args print a diff between html.entities.html5 and new_html5
    with --create print the new html5 dict
    with --patch patch the Lib/html/entities.py file (the path is
    relative to the current working directory)
    """
    fname = 'Lib/html/entities.py'
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        _patch_entities_file(fname, new_html5)
    elif html5 == new_html5:
        print('The current dictionary is updated.')
    else:
        compare_dicts(html5, new_html5)
        print('Run "./python {0} --patch" to update Lib/html/entities.py '
              'or "./python {0} --create" to see the generated '
              'dictionary.'.format(__file__))


if __name__ == '__main__':
    main()