From 1f7fffb308390d10a2c6a4ec624f18cfeef97aeb Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Fri, 15 Oct 2010 15:57:45 +0000 Subject: #2830: add html.escape() helper and move cgi.escape() uses in the standard library to it. It defaults to quote=True and also escapes single quotes, which makes casual use safer. The cgi.escape() interface is not touched, but emits a (silent) PendingDeprecationWarning. --- Doc/howto/webservers.rst | 2 +- Doc/library/cgi.rst | 14 +++++++------- Doc/library/html.rst | 18 ++++++++++++++++++ Doc/library/markup.rst | 1 + Lib/cgi.py | 25 +++++++++++++------------ Lib/html/__init__.py | 21 ++++++++++++++++++++- Lib/http/server.py | 6 +++--- Lib/lib2to3/tests/test_util.py | 4 ++-- Lib/test/test_html.py | 24 ++++++++++++++++++++++++ Lib/test/test_xml_etree.py | 4 ++-- Misc/NEWS | 3 +++ 11 files changed, 94 insertions(+), 28 deletions(-) create mode 100644 Doc/library/html.rst create mode 100644 Lib/test/test_html.py diff --git a/Doc/howto/webservers.rst b/Doc/howto/webservers.rst index 7f68b3b..049fe1b 100644 --- a/Doc/howto/webservers.rst +++ b/Doc/howto/webservers.rst @@ -293,7 +293,7 @@ following WSGI-application:: # -*- coding: UTF-8 -*- import sys, os - from cgi import escape + from html import escape from flup.server.fcgi import WSGIServer def app(environ, start_response): diff --git a/Doc/library/cgi.rst b/Doc/library/cgi.rst index 49d1488..8c75517 100644 --- a/Doc/library/cgi.rst +++ b/Doc/library/cgi.rst @@ -328,9 +328,9 @@ algorithms implemented in this module in other circumstances. attribute value delimited by double quotes, as in ````. Note that single quotes are never translated. - If the value to be quoted might include single- or double-quote characters, - or both, consider using the :func:`~xml.sax.saxutils.quoteattr` function in the - :mod:`xml.sax.saxutils` module instead. + .. deprecated:: 3.2 + This function is unsafe because *quote* is false by default, and therefore + deprecated. Use :func:`html.escape` instead. .. 
_cgi-security: @@ -508,8 +508,8 @@ Common problems and solutions .. rubric:: Footnotes -.. [#] Note that some recent versions of the HTML specification do state what order the - field values should be supplied in, but knowing whether a request was - received from a conforming browser, or even from a browser at all, is tedious - and error-prone. +.. [#] Note that some recent versions of the HTML specification do state what + order the field values should be supplied in, but knowing whether a request + was received from a conforming browser, or even from a browser at all, is + tedious and error-prone. diff --git a/Doc/library/html.rst b/Doc/library/html.rst new file mode 100644 index 0000000..2c42cf8 --- /dev/null +++ b/Doc/library/html.rst @@ -0,0 +1,18 @@ +:mod:`html` --- HyperText Markup Language support +================================================= + +.. module:: html + :synopsis: Helpers for manipulating HTML. + +.. versionadded:: 3.2 + + +This module defines utilities to manipulate HTML. + +.. function:: escape(s, quote=True) + + Convert the characters ``&``, ``<`` and ``>`` in string *s* to HTML-safe + sequences. Use this if you need to display text that might contain such + characters in HTML. If the optional flag *quote* is true, the characters + (``"``) and (``'``) are also translated; this helps for inclusion in an HTML + attribute value delimited by quotes, as in ``<a href="...">``. diff --git a/Doc/library/markup.rst b/Doc/library/markup.rst index ae97b69..49794ef 100644 --- a/Doc/library/markup.rst +++ b/Doc/library/markup.rst @@ -20,6 +20,7 @@ definition of the Python bindings for the DOM and SAX interfaces. ..
toctree:: + html.rst html.parser.rst html.entities.rst pyexpat.rst diff --git a/Lib/cgi.py b/Lib/cgi.py index 7da2b23..8786e58 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -31,13 +31,13 @@ __version__ = "2.6" # Imports # ======= -from operator import attrgetter from io import StringIO import sys import os import urllib.parse import email.parser from warnings import warn +import html __all__ = ["MiniFieldStorage", "FieldStorage", "parse", "parse_qs", "parse_qsl", "parse_multipart", @@ -800,8 +800,8 @@ def print_exception(type=None, value=None, tb=None, limit=None): list = traceback.format_tb(tb, limit) + \ traceback.format_exception_only(type, value) print("
%s%s
" % ( - escape("".join(list[:-1])), - escape(list[-1]), + html.escape("".join(list[:-1])), + html.escape(list[-1]), )) del tb @@ -812,7 +812,7 @@ def print_environ(environ=os.environ): print("

Shell Environment:

") print("
") for key in keys: - print("
", escape(key), "
", escape(environ[key])) + print("
", html.escape(key), "
", html.escape(environ[key])) print("
") print() @@ -825,10 +825,10 @@ def print_form(form): print("

No form fields.") print("

") for key in keys: - print("
" + escape(key) + ":", end=' ') + print("
" + html.escape(key) + ":", end=' ') value = form[key] - print("" + escape(repr(type(value))) + "") - print("
" + escape(repr(value))) + print("" + html.escape(repr(type(value))) + "") + print("
" + html.escape(repr(value))) print("
") print() @@ -839,9 +839,9 @@ def print_directory(): try: pwd = os.getcwd() except os.error as msg: - print("os.error:", escape(str(msg))) + print("os.error:", html.escape(str(msg))) else: - print(escape(pwd)) + print(html.escape(pwd)) print() def print_arguments(): @@ -899,9 +899,9 @@ environment as well. Here are some common variable names: # ========= def escape(s, quote=None): - '''Replace special characters "&", "<" and ">" to HTML-safe sequences. - If the optional flag quote is true, the quotation mark character (") - is also translated.''' + """Deprecated API.""" + warn("cgi.escape is deprecated, use html.escape instead", + PendingDeprecationWarning, stacklevel=2) s = s.replace("&", "&amp;") # Must be done first! s = s.replace("<", "&lt;") s = s.replace(">", "&gt;") @@ -909,6 +909,7 @@ def escape(s, quote=None): s = s.replace('"', "&quot;") return s + def valid_boundary(s, _vb_pattern="^[ -~]{0,200}[!-~]$"): import re return re.match(_vb_pattern, s) diff --git a/Lib/html/__init__.py b/Lib/html/__init__.py index 196d378..335d214 100644 --- a/Lib/html/__init__.py +++ b/Lib/html/__init__.py @@ -1 +1,20 @@ -# This directory is a Python package. +""" +General functions for HTML manipulation. +""" + + +_escape_map = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;'} +_escape_map_full = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;', + ord('"'): '&quot;', ord('\''): '&#x27;'} + +# NB: this is a candidate for a bytes/string polymorphic interface + +def escape(s, quote=True): + """ + Replace special characters "&", "<" and ">" to HTML-safe sequences. + If the optional flag quote is true (the default), the quotation mark + character (") is also translated.
+ """ + if quote: + return s.translate(_escape_map_full) + return s.translate(_escape_map) diff --git a/Lib/http/server.py b/Lib/http/server.py index 894342a..f6d0db4 100644 --- a/Lib/http/server.py +++ b/Lib/http/server.py @@ -84,7 +84,7 @@ __version__ = "0.6" __all__ = ["HTTPServer", "BaseHTTPRequestHandler"] -import cgi +import html import email.message import email.parser import http.client @@ -705,7 +705,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): return None list.sort(key=lambda a: a.lower()) r = [] - displaypath = cgi.escape(urllib.parse.unquote(self.path)) + displaypath = html.escape(urllib.parse.unquote(self.path)) r.append('') r.append("\nDirectory listing for %s\n" % displaypath) r.append("\n

Directory listing for %s

\n" % displaypath) @@ -721,7 +721,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): displayname = name + "@" # Note: a link to a directory displays with @ and links with / r.append('
  • %s\n' - % (urllib.parse.quote(linkname), cgi.escape(displayname))) + % (urllib.parse.quote(linkname), html.escape(displayname))) r.append("\n
    \n\n\n") enc = sys.getfilesystemencoding() encoded = ''.join(r).encode(enc) diff --git a/Lib/lib2to3/tests/test_util.py b/Lib/lib2to3/tests/test_util.py index 0ab7537..d2be82c 100644 --- a/Lib/lib2to3/tests/test_util.py +++ b/Lib/lib2to3/tests/test_util.py @@ -568,8 +568,8 @@ class Test_touch_import(support.TestCase): def test_from_import(self): node = parse('bar()') - fixer_util.touch_import("cgi", "escape", node) - self.assertEqual(str(node), 'from cgi import escape\nbar()\n\n') + fixer_util.touch_import("html", "escape", node) + self.assertEqual(str(node), 'from html import escape\nbar()\n\n') def test_name_import(self): node = parse('bar()') diff --git a/Lib/test/test_html.py b/Lib/test/test_html.py new file mode 100644 index 0000000..30dac58 --- /dev/null +++ b/Lib/test/test_html.py @@ -0,0 +1,24 @@ +""" +Tests for the html module functions. +""" + +import html +import unittest +from test.support import run_unittest + + +class HtmlTests(unittest.TestCase): + def test_escape(self): + self.assertEqual( + html.escape('\'<script>"&foo;"</script>\''), + '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;') + self.assertEqual( + html.escape('\'<script>"&foo;"</script>\'', False), + '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'') + + +def test_main(): + run_unittest(HtmlTests) + +if __name__ == '__main__': + test_main() diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 7914d1f..e802359 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -12,7 +12,7 @@ # except if the test is specific to the Python implementation. import sys -import cgi +import html import unittest from test import support @@ -1328,7 +1328,7 @@ XINCLUDE["default.xml"] = """\

    Example.

    -""".format(cgi.escape(SIMPLE_XMLFILE, True)) +""".format(html.escape(SIMPLE_XMLFILE, True)) def xinclude_loader(href, parse="xml", encoding=None): try: diff --git a/Misc/NEWS b/Misc/NEWS index a87dacf..abba90d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -24,6 +24,9 @@ Core and Builtins Library ------- +- Issue #2830: Add the ``html.escape()`` function, which quotes all problematic + characters by default. Deprecate ``cgi.escape()``. + - Issue 9409: Fix the regex to match all kind of filenames, for interactive debugging in doctests. -- cgit v0.12