summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com>, 2024-11-21 11:15:12 (GMT)
committer: GitHub <noreply@github.com>, 2024-11-21 11:15:12 (GMT)
commit: eaf217108226633c03cc5c4c90f0b6e4587c8803 (patch)
tree: 85aebbbea34fc7303c5a8f2f8213c5a35948d38f
parent: ff2278e2bf660155ca8f7c0529190ca59a41c13a (diff)
download: cpython-eaf217108226633c03cc5c4c90f0b6e4587c8803.zip
cpython-eaf217108226633c03cc5c4c90f0b6e4587c8803.tar.gz
cpython-eaf217108226633c03cc5c4c90f0b6e4587c8803.tar.bz2
gh-126997: Fix support of non-ASCII strings in pickletools (GH-127062)
* Fix support of STRING and GLOBAL opcodes with non-ASCII arguments.
* dis() now outputs non-ASCII bytes in STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (\xXX).
-rw-r--r-- Lib/pickletools.py 11
-rw-r--r-- Lib/test/test_pickletools.py 82
-rw-r--r-- Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst 3
3 files changed, 92 insertions, 4 deletions
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index c462d26..d9c4fb1 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -312,7 +312,7 @@ uint8 = ArgumentDescriptor(
doc="Eight-byte unsigned integer, little-endian.")
-def read_stringnl(f, decode=True, stripquotes=True):
+def read_stringnl(f, decode=True, stripquotes=True, *, encoding='latin-1'):
r"""
>>> import io
>>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
@@ -356,7 +356,7 @@ def read_stringnl(f, decode=True, stripquotes=True):
raise ValueError("no string quotes around %r" % data)
if decode:
- data = codecs.escape_decode(data)[0].decode("ascii")
+ data = codecs.escape_decode(data)[0].decode(encoding)
return data
stringnl = ArgumentDescriptor(
@@ -370,7 +370,7 @@ stringnl = ArgumentDescriptor(
""")
def read_stringnl_noescape(f):
- return read_stringnl(f, stripquotes=False)
+ return read_stringnl(f, stripquotes=False, encoding='utf-8')
stringnl_noescape = ArgumentDescriptor(
name='stringnl_noescape',
@@ -2509,7 +2509,10 @@ def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
# make a mild effort to align arguments
line += ' ' * (10 - len(opcode.name))
if arg is not None:
- line += ' ' + repr(arg)
+ if opcode.name in ("STRING", "BINSTRING", "SHORT_BINSTRING"):
+ line += ' ' + ascii(arg)
+ else:
+ line += ' ' + repr(arg)
if markmsg:
line += ' ' + markmsg
if annotate:
diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py
index d8ff7a2..265dc49 100644
--- a/Lib/test/test_pickletools.py
+++ b/Lib/test/test_pickletools.py
@@ -361,6 +361,88 @@ highest protocol among opcodes = 0
highest protocol among opcodes = 0
''', annotate=20)
+ def test_string(self):
+ self.check_dis(b"S'abc'\n.", '''\
+ 0: S STRING 'abc'
+ 7: . STOP
+highest protocol among opcodes = 0
+''')
+ self.check_dis(b'S"abc"\n.', '''\
+ 0: S STRING 'abc'
+ 7: . STOP
+highest protocol among opcodes = 0
+''')
+ self.check_dis(b"S'\xc3\xb5'\n.", '''\
+ 0: S STRING '\\xc3\\xb5'
+ 6: . STOP
+highest protocol among opcodes = 0
+''')
+
+ def test_string_without_quotes(self):
+ self.check_dis_error(b"Sabc'\n.", '',
+ 'no string quotes around b"abc\'"')
+ self.check_dis_error(b'Sabc"\n.', '',
+ "no string quotes around b'abc\"'")
+ self.check_dis_error(b"S'abc\n.", '',
+ '''strinq quote b"'" not found at both ends of b"'abc"''')
+ self.check_dis_error(b'S"abc\n.', '',
+ r"""strinq quote b'"' not found at both ends of b'"abc'""")
+ self.check_dis_error(b"S'abc\"\n.", '',
+ r"""strinq quote b"'" not found at both ends of b'\\'abc"'""")
+ self.check_dis_error(b"S\"abc'\n.", '',
+ r"""strinq quote b'"' not found at both ends of b'"abc\\''""")
+
+ def test_binstring(self):
+ self.check_dis(b"T\x03\x00\x00\x00abc.", '''\
+ 0: T BINSTRING 'abc'
+ 8: . STOP
+highest protocol among opcodes = 1
+''')
+ self.check_dis(b"T\x02\x00\x00\x00\xc3\xb5.", '''\
+ 0: T BINSTRING '\\xc3\\xb5'
+ 7: . STOP
+highest protocol among opcodes = 1
+''')
+
+ def test_short_binstring(self):
+ self.check_dis(b"U\x03abc.", '''\
+ 0: U SHORT_BINSTRING 'abc'
+ 5: . STOP
+highest protocol among opcodes = 1
+''')
+ self.check_dis(b"U\x02\xc3\xb5.", '''\
+ 0: U SHORT_BINSTRING '\\xc3\\xb5'
+ 4: . STOP
+highest protocol among opcodes = 1
+''')
+
+ def test_global(self):
+ self.check_dis(b"cmodule\nname\n.", '''\
+ 0: c GLOBAL 'module name'
+ 13: . STOP
+highest protocol among opcodes = 0
+''')
+ self.check_dis(b"cm\xc3\xb6dule\nn\xc3\xa4me\n.", '''\
+ 0: c GLOBAL 'm\xf6dule n\xe4me'
+ 15: . STOP
+highest protocol among opcodes = 0
+''')
+
+ def test_inst(self):
+ self.check_dis(b"(imodule\nname\n.", '''\
+ 0: ( MARK
+ 1: i INST 'module name' (MARK at 0)
+ 14: . STOP
+highest protocol among opcodes = 0
+''')
+
+ def test_persid(self):
+ self.check_dis(b"Pabc\n.", '''\
+ 0: P PERSID 'abc'
+ 5: . STOP
+highest protocol among opcodes = 0
+''')
+
class MiscTestCase(unittest.TestCase):
def test__all__(self):
diff --git a/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst b/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst
new file mode 100644
index 0000000..b85c51e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst
@@ -0,0 +1,3 @@
+Fix support of STRING and GLOBAL opcodes with non-ASCII arguments in
+:mod:`pickletools`. :func:`pickletools.dis` now outputs non-ASCII bytes in
+STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (``\xXX``).