From af8c3d7a26d605099f5b3406a8d33ecddb77e8fb Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 1 Jul 2024 10:30:33 +0200 Subject: gh-121188: Sanitize invalid XML characters in regrtest (#121195) When creating the JUnit XML file, regrtest now escapes characters which are invalid in XML, such as the chr(27) control character used in ANSI escape sequences. --- Lib/test/libregrtest/testresult.py | 12 +++-- Lib/test/libregrtest/utils.py | 22 ++++++++ Lib/test/test_regrtest.py | 59 ++++++++++++++++++++++ .../2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst | 3 ++ 4 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst diff --git a/Lib/test/libregrtest/testresult.py b/Lib/test/libregrtest/testresult.py index de23fdd..1820f35 100644 --- a/Lib/test/libregrtest/testresult.py +++ b/Lib/test/libregrtest/testresult.py @@ -9,6 +9,7 @@ import time import traceback import unittest from test import support +from test.libregrtest.utils import sanitize_xml class RegressionTestResult(unittest.TextTestResult): USE_XML = False @@ -65,23 +66,24 @@ class RegressionTestResult(unittest.TextTestResult): if capture: if self._stdout_buffer is not None: stdout = self._stdout_buffer.getvalue().rstrip() - ET.SubElement(e, 'system-out').text = stdout + ET.SubElement(e, 'system-out').text = sanitize_xml(stdout) if self._stderr_buffer is not None: stderr = self._stderr_buffer.getvalue().rstrip() - ET.SubElement(e, 'system-err').text = stderr + ET.SubElement(e, 'system-err').text = sanitize_xml(stderr) for k, v in args.items(): if not k or not v: continue + e2 = ET.SubElement(e, k) if hasattr(v, 'items'): for k2, v2 in v.items(): if k2: - e2.set(k2, str(v2)) + e2.set(k2, sanitize_xml(str(v2))) else: - e2.text = str(v2) + e2.text = sanitize_xml(str(v2)) else: - e2.text = str(v) + e2.text = sanitize_xml(str(v)) @classmethod def __makeErrorDict(cls, err_type, err_value, err_tb): diff --git 
a/Lib/test/libregrtest/utils.py b/Lib/test/libregrtest/utils.py index 8253d33..0167742 100644 --- a/Lib/test/libregrtest/utils.py +++ b/Lib/test/libregrtest/utils.py @@ -5,6 +5,7 @@ import math import os.path import platform import random +import re import shlex import signal import subprocess @@ -712,3 +713,24 @@ def get_signal_name(exitcode): pass return None + + +ILLEGAL_XML_CHARS_RE = re.compile( + '[' + # Control characters; newline (\x0A and \x0D) and TAB (\x09) are legal + '\x00-\x08\x0B\x0C\x0E-\x1F' + # Surrogate characters + '\uD800-\uDFFF' + # Special Unicode characters + '\uFFFE' + '\uFFFF' + # Match multiple sequential invalid characters for better efficiency + ']+') + +def _sanitize_xml_replace(regs): + text = regs[0] + return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1] + for ch in text) + +def sanitize_xml(text): + return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text) diff --git a/Lib/test/test_regrtest.py b/Lib/test/test_regrtest.py index 0a15170..44fd11b 100644 --- a/Lib/test/test_regrtest.py +++ b/Lib/test/test_regrtest.py @@ -21,6 +21,8 @@ import sysconfig import tempfile import textwrap import unittest +from xml.etree import ElementTree + from test import support from test.support import import_helper from test.support import os_helper @@ -2254,6 +2256,44 @@ class ArgsTestCase(BaseTestCase): self.check_executed_tests(output, testname, stats=1, parallel=True) self.assertNotIn('SPAM SPAM SPAM', output) + def test_xml(self): + code = textwrap.dedent(r""" + import unittest + from test import support + + class VerboseTests(unittest.TestCase): + def test_failed(self): + print("abc \x1b def") + self.fail() + """) + testname = self.create_test(code=code) + + # Run sequentially + filename = os_helper.TESTFN + self.addCleanup(os_helper.unlink, filename) + + output = self.run_tests(testname, "--junit-xml", filename, + exitcode=EXITCODE_BAD_TEST) + self.check_executed_tests(output, testname, + failed=testname, + stats=TestStats(1, 
1, 0)) + + # Test generated XML + with open(filename, encoding="utf8") as fp: + content = fp.read() + + testsuite = ElementTree.fromstring(content) + self.assertEqual(int(testsuite.get('tests')), 1) + self.assertEqual(int(testsuite.get('errors')), 0) + self.assertEqual(int(testsuite.get('failures')), 1) + + testcase = testsuite[0][0] + self.assertEqual(testcase.get('status'), 'run') + self.assertEqual(testcase.get('result'), 'completed') + self.assertGreater(float(testcase.get('time')), 0) + for out in testcase.iter('system-out'): + self.assertEqual(out.text, r"abc \x1b def") + class TestUtils(unittest.TestCase): def test_format_duration(self): @@ -2437,6 +2477,25 @@ class TestUtils(unittest.TestCase): self.assertTrue(match_test(test_chdir)) self.assertFalse(match_test(test_copy)) + def test_sanitize_xml(self): + sanitize_xml = utils.sanitize_xml + + # escape invalid XML characters + self.assertEqual(sanitize_xml('abc \x1b\x1f def'), + r'abc \x1b\x1f def') + self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'), + r'nul:\x00, bell:\x07') + self.assertEqual(sanitize_xml('surrogate:\uDC80'), + r'surrogate:\udc80') + self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'), + r'illegal \ufffe and \uffff') + + # no escape for valid XML characters + self.assertEqual(sanitize_xml('a\n\tb'), + 'a\n\tb') + self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'), + 'valid t\xe9xt \u20ac') + if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst b/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst new file mode 100644 index 0000000..c92002d --- /dev/null +++ b/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst @@ -0,0 +1,3 @@ +When creating the JUnit XML file, regrtest now escapes characters which are +invalid in XML, such as the chr(27) control character used in ANSI escape +sequences. Patch by Victor Stinner. -- cgit v0.12