summaryrefslogtreecommitdiffstats
path: root/Tools/scripts/parse_html5_entities.py
diff options
context:
space:
mode:
authorEzio Melotti <ezio.melotti@gmail.com>2012-10-23 13:46:33 (GMT)
committerEzio Melotti <ezio.melotti@gmail.com>2012-10-23 13:46:33 (GMT)
commitccc9e617f57e315719a62cd9ca812b3f21ad8966 (patch)
tree7023395e57db80b0ce9762aeb20705b13c141dac /Tools/scripts/parse_html5_entities.py
parentd25b3982c760289f7c5eeefdc81a5659a9703aa7 (diff)
downloadcpython-ccc9e617f57e315719a62cd9ca812b3f21ad8966.zip
cpython-ccc9e617f57e315719a62cd9ca812b3f21ad8966.tar.gz
cpython-ccc9e617f57e315719a62cd9ca812b3f21ad8966.tar.bz2
#16245: add a script to generate the html.entities.html5 dict.
Diffstat (limited to 'Tools/scripts/parse_html5_entities.py')
-rw-r--r--Tools/scripts/parse_html5_entities.py105
1 files changed, 105 insertions, 0 deletions
diff --git a/Tools/scripts/parse_html5_entities.py b/Tools/scripts/parse_html5_entities.py
new file mode 100644
index 0000000..c011328
--- /dev/null
+++ b/Tools/scripts/parse_html5_entities.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Utility for parsing HTML5 entity definitions available from:
+
+ http://dev.w3.org/html5/spec/entities.json
+
+Written by Ezio Melotti and Iuliia Proskurnia.
+
+"""
+
+import os
+import sys
+import json
+from urllib.request import urlopen
+from html.entities import html5
+
+entities_url = 'http://dev.w3.org/html5/spec/entities.json'
+
+def get_json(url):
+ """Download the json file from the url and returns a decoded object."""
+ with urlopen(url) as f:
+ data = f.read().decode('utf-8')
+ return json.loads(data)
+
+def create_dict(entities):
+ """Create the html5 dict from the decoded json object."""
+ new_html5 = {}
+ for name, value in entities.items():
+ new_html5[name.lstrip('&')] = value['characters']
+ return new_html5
+
+def compare_dicts(old, new):
+ """Compare the old and new dicts and print the differences."""
+ added = new.keys() - old.keys()
+ if added:
+ print('{} entitie(s) have been added:'.format(len(added)))
+ for name in sorted(added):
+ print(' {!r}: {!r}'.format(name, new[name]))
+ removed = old.keys() - new.keys()
+ if removed:
+ print('{} entitie(s) have been removed:'.format(len(removed)))
+ for name in sorted(removed):
+ print(' {!r}: {!r}'.format(name, old[name]))
+ changed = set()
+ for name in (old.keys() & new.keys()):
+ if old[name] != new[name]:
+ changed.add((name, old[name], new[name]))
+ if changed:
+ print('{} entitie(s) have been modified:'.format(len(changed)))
+ for item in sorted(changed):
+ print(' {!r}: {!r} -> {!r}'.format(*item))
+
+def write_items(entities, file=sys.stdout):
+ """Write the items of the dictionary in the specified file."""
+ # The keys in the generated dictionary should be sorted
+ # in a case-insensitive way, however, when two keys are equal,
+ # the uppercase version should come first so that the result
+ # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
+ # To do this we first sort in a case-sensitive way (so all the
+ # uppercase chars come first) and then sort with key=str.lower.
+ # Since the sorting is stable the uppercase keys will eventually
+ # be before their equivalent lowercase version.
+ keys = sorted(entities.keys())
+ keys = sorted(keys, key=str.lower)
+ print('html5 = {', file=file)
+ for name in keys:
+ print(' {!r}: {!a},'.format(name, entities[name]), file=file)
+ print('}', file=file)
+
+
+if __name__ == '__main__':
+ # without args print a diff between html.entities.html5 and new_html5
+ # with --create print the new html5 dict
+ # with --patch patch the Lib/html/entities.py file
+ new_html5 = create_dict(get_json(entities_url))
+ if '--create' in sys.argv:
+ print('# map the HTML5 named character references to the '
+ 'equivalent Unicode character(s)')
+ print('# Generated by {}. Do not edit manually.'.format(__file__))
+ write_items(new_html5)
+ elif '--patch' in sys.argv:
+ fname = 'Lib/html/entities.py'
+ temp_fname = fname + '.temp'
+ with open(fname) as f1, open(temp_fname, 'w') as f2:
+ skip = False
+ for line in f1:
+ if line.startswith('html5 = {'):
+ write_items(new_html5, file=f2)
+ skip = True
+ continue
+ if skip:
+ # skip the old items until the }
+ if line.startswith('}'):
+ skip = False
+ continue
+ f2.write(line)
+ os.remove(fname)
+ os.rename(temp_fname, fname)
+ else:
+ if html5 == new_html5:
+ print('The current dictionary is updated.')
+ else:
+ compare_dicts(html5, new_html5)
+ print('Run "./python {0} --patch" to update Lib/html/entities.html '
+ 'or "./python {0} --create" to see the generated ' 'dictionary.'.format(__file__))