summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Victor Stinner <vstinner@python.org> 2024-07-01 08:30:33 (GMT)
committer: GitHub <noreply@github.com> 2024-07-01 08:30:33 (GMT)
commit: af8c3d7a26d605099f5b3406a8d33ecddb77e8fb (patch)
tree: ec814ccede37647ad55f4518d8a6ee26ee3d9abe
parent: f80376b129ad947263a6b03a6c3a874e9f8706e6 (diff)
download: cpython-af8c3d7a26d605099f5b3406a8d33ecddb77e8fb.zip
download: cpython-af8c3d7a26d605099f5b3406a8d33ecddb77e8fb.tar.gz
download: cpython-af8c3d7a26d605099f5b3406a8d33ecddb77e8fb.tar.bz2
gh-121188: Sanitize invalid XML characters in regrtest (#121195)
When creating the JUnit XML file, regrtest now escapes characters which are invalid in XML, such as the chr(27) control character used in ANSI escape sequences.
-rw-r--r-- Lib/test/libregrtest/testresult.py | 12
-rw-r--r-- Lib/test/libregrtest/utils.py | 22
-rw-r--r-- Lib/test/test_regrtest.py | 59
-rw-r--r-- Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst | 3
4 files changed, 91 insertions, 5 deletions
diff --git a/Lib/test/libregrtest/testresult.py b/Lib/test/libregrtest/testresult.py
index de23fdd..1820f35 100644
--- a/Lib/test/libregrtest/testresult.py
+++ b/Lib/test/libregrtest/testresult.py
@@ -9,6 +9,7 @@ import time
import traceback
import unittest
from test import support
+from test.libregrtest.utils import sanitize_xml
class RegressionTestResult(unittest.TextTestResult):
USE_XML = False
@@ -65,23 +66,24 @@ class RegressionTestResult(unittest.TextTestResult):
if capture:
if self._stdout_buffer is not None:
stdout = self._stdout_buffer.getvalue().rstrip()
- ET.SubElement(e, 'system-out').text = stdout
+ ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
if self._stderr_buffer is not None:
stderr = self._stderr_buffer.getvalue().rstrip()
- ET.SubElement(e, 'system-err').text = stderr
+ ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
for k, v in args.items():
if not k or not v:
continue
+
e2 = ET.SubElement(e, k)
if hasattr(v, 'items'):
for k2, v2 in v.items():
if k2:
- e2.set(k2, str(v2))
+ e2.set(k2, sanitize_xml(str(v2)))
else:
- e2.text = str(v2)
+ e2.text = sanitize_xml(str(v2))
else:
- e2.text = str(v)
+ e2.text = sanitize_xml(str(v))
@classmethod
def __makeErrorDict(cls, err_type, err_value, err_tb):
diff --git a/Lib/test/libregrtest/utils.py b/Lib/test/libregrtest/utils.py
index 8253d33..0167742 100644
--- a/Lib/test/libregrtest/utils.py
+++ b/Lib/test/libregrtest/utils.py
@@ -5,6 +5,7 @@ import math
import os.path
import platform
import random
+import re
import shlex
import signal
import subprocess
@@ -712,3 +713,24 @@ def get_signal_name(exitcode):
pass
return None
+
+
+ILLEGAL_XML_CHARS_RE = re.compile(
+ '['
+ # Control characters; line feed (\x0A), carriage return (\x0D) and TAB (\x09) are legal
+ '\x00-\x08\x0B\x0C\x0E-\x1F'
+ # Surrogate characters
+ '\uD800-\uDFFF'
+ # Special Unicode characters
+ '\uFFFE'
+ '\uFFFF'
+ # Match multiple sequential invalid characters for better efficiency
+ ']+')
+
+def _sanitize_xml_replace(regs):
+ text = regs[0]
+ return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
+ for ch in text)
+
+def sanitize_xml(text):
+ return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)
diff --git a/Lib/test/test_regrtest.py b/Lib/test/test_regrtest.py
index 0a15170..44fd11b 100644
--- a/Lib/test/test_regrtest.py
+++ b/Lib/test/test_regrtest.py
@@ -21,6 +21,8 @@ import sysconfig
import tempfile
import textwrap
import unittest
+from xml.etree import ElementTree
+
from test import support
from test.support import import_helper
from test.support import os_helper
@@ -2254,6 +2256,44 @@ class ArgsTestCase(BaseTestCase):
self.check_executed_tests(output, testname, stats=1, parallel=True)
self.assertNotIn('SPAM SPAM SPAM', output)
+ def test_xml(self):
+ code = textwrap.dedent(r"""
+ import unittest
+ from test import support
+
+ class VerboseTests(unittest.TestCase):
+ def test_failed(self):
+ print("abc \x1b def")
+ self.fail()
+ """)
+ testname = self.create_test(code=code)
+
+ # Run sequentially
+ filename = os_helper.TESTFN
+ self.addCleanup(os_helper.unlink, filename)
+
+ output = self.run_tests(testname, "--junit-xml", filename,
+ exitcode=EXITCODE_BAD_TEST)
+ self.check_executed_tests(output, testname,
+ failed=testname,
+ stats=TestStats(1, 1, 0))
+
+ # Test generated XML
+ with open(filename, encoding="utf8") as fp:
+ content = fp.read()
+
+ testsuite = ElementTree.fromstring(content)
+ self.assertEqual(int(testsuite.get('tests')), 1)
+ self.assertEqual(int(testsuite.get('errors')), 0)
+ self.assertEqual(int(testsuite.get('failures')), 1)
+
+ testcase = testsuite[0][0]
+ self.assertEqual(testcase.get('status'), 'run')
+ self.assertEqual(testcase.get('result'), 'completed')
+ self.assertGreater(float(testcase.get('time')), 0)
+ for out in testcase.iter('system-out'):
+ self.assertEqual(out.text, r"abc \x1b def")
+
class TestUtils(unittest.TestCase):
def test_format_duration(self):
@@ -2437,6 +2477,25 @@ class TestUtils(unittest.TestCase):
self.assertTrue(match_test(test_chdir))
self.assertFalse(match_test(test_copy))
+ def test_sanitize_xml(self):
+ sanitize_xml = utils.sanitize_xml
+
+ # escape invalid XML characters
+ self.assertEqual(sanitize_xml('abc \x1b\x1f def'),
+ r'abc \x1b\x1f def')
+ self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'),
+ r'nul:\x00, bell:\x07')
+ self.assertEqual(sanitize_xml('surrogate:\uDC80'),
+ r'surrogate:\udc80')
+ self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'),
+ r'illegal \ufffe and \uffff')
+
+ # no escape for valid XML characters
+ self.assertEqual(sanitize_xml('a\n\tb'),
+ 'a\n\tb')
+ self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'),
+ 'valid t\xe9xt \u20ac')
+
if __name__ == '__main__':
unittest.main()
diff --git a/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst b/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst
new file mode 100644
index 0000000..c92002d
--- /dev/null
+++ b/Misc/NEWS.d/next/Tests/2024-07-01-09-04-32.gh-issue-121188.XbuTVa.rst
@@ -0,0 +1,3 @@
+When creating the JUnit XML file, regrtest now escapes characters which are
+invalid in XML, such as the chr(27) control character used in ANSI escape
+sequences. Patch by Victor Stinner.