From a25a985535ccbb7df8caddc0017550ff4eae5855 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 22 Mar 2022 11:52:55 +0200 Subject: bpo-28080: Add support for the fallback encoding in ZIP files (GH-32007) * Add the metadata_encoding parameter in the zipfile.ZipFile constructor. * Add the --metadata-encoding option in the zipfile CLI. Co-authored-by: Stephen J. Turnbull --- Doc/library/zipfile.rst | 41 +++++- Doc/whatsnew/3.11.rst | 6 + Lib/test/test_zipfile.py | 140 ++++++++++++++++++++- Lib/zipfile.py | 31 +++-- .../2022-03-20-15-54-41.bpo-28080.kn35Vk.rst | 4 + 5 files changed, 211 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 9d0d894..bfcc883 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -139,7 +139,8 @@ ZipFile Objects .. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, \ - compresslevel=None, *, strict_timestamps=True) + compresslevel=None, *, strict_timestamps=True, + metadata_encoding=None) Open a ZIP file, where *file* can be a path to a file (a string), a file-like object or a :term:`path-like object`. @@ -183,6 +184,10 @@ ZipFile Objects Similar behavior occurs with files newer than 2107-12-31, the timestamp is also set to the limit. + When mode is ``'r'``, *metadata_encoding* may be set to the name of a codec, + which will be used to decode metadata such as the names of members and ZIP + comments. + If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then :meth:`closed ` without adding any files to the archive, the appropriate ZIP structures for an empty archive will be written to the file. @@ -194,6 +199,19 @@ ZipFile Objects with ZipFile('spam.zip', 'w') as myzip: myzip.write('eggs.txt') + .. note:: + + *metadata_encoding* is an instance-wide setting for the ZipFile. + It is not currently possible to set this on a per-member basis. + + This attribute is a workaround for legacy implementations which produce + archives with names in the current locale encoding or code page (mostly + on Windows). According to the .ZIP standard, the encoding of metadata + may be specified to be either IBM code page (default) or UTF-8 by a flag + in the archive header. + That flag takes precedence over *metadata_encoding*, which is + a Python-specific extension. + .. versionadded:: 3.2 Added the ability to use :class:`ZipFile` as a context manager. @@ -220,6 +238,10 @@ ZipFile Objects .. versionadded:: 3.8 The *strict_timestamps* keyword-only argument + .. versionchanged:: 3.11 + Added support for specifying member name encoding for reading + metadata in the zipfile's directory and file headers. + .. method:: ZipFile.close() @@ -397,6 +419,15 @@ ZipFile Objects .. note:: + The ZIP file standard historically did not specify a metadata encoding, + but strongly recommended CP437 (the original IBM PC encoding) for + interoperability. Recent versions allow use of UTF-8 (only). In this + module, UTF-8 will automatically be used to write the member names if + they contain any non-ASCII characters. It is not possible to write + member names in any encoding other than ASCII or UTF-8. + + .. note:: + Archive names should be relative to the archive root, that is, they should not start with a path separator. @@ -868,6 +899,14 @@ Command-line options Test whether the zipfile is valid or not. +.. cmdoption:: --metadata-encoding + + Specify encoding of member names for :option:`-l`, :option:`-e` and + :option:`-t`. + + .. versionadded:: 3.11 + + Decompression pitfalls ---------------------- diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 96db3a9..938a573 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -432,6 +432,12 @@ venv Third party code that also creates new virtual environments should do the same. (Contributed by Miro HronĨok in :issue:`45413`.) +zipfile +------- + +* Added support for specifying member name encoding for reading + metadata in the zipfile's directory and file headers. + (Contributed by Stephen J. Turnbull and Serhiy Storchaka in :issue:`28080`.) fcntl ----- diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 759a4ab..26c4045 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -21,8 +21,10 @@ from tempfile import TemporaryFile from random import randint, random, randbytes from test.support import script_helper -from test.support import (findfile, requires_zlib, requires_bz2, - requires_lzma, captured_stdout, requires_subprocess) +from test.support import ( + findfile, requires_zlib, requires_bz2, requires_lzma, + captured_stdout, captured_stderr, requires_subprocess +) from test.support.os_helper import ( TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count ) @@ -3210,5 +3212,139 @@ class TestPath(unittest.TestCase): assert isinstance(file, cls) +class EncodedMetadataTests(unittest.TestCase): + file_names = ['\u4e00', '\u4e8c', '\u4e09'] # Han 'one', 'two', 'three' + file_content = [ + "This is pure ASCII.\n".encode('ascii'), + # This is modern Japanese. (UTF-8) + "\u3053\u308c\u306f\u73fe\u4ee3\u7684\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('utf-8'), + # This is obsolete Japanese. (Shift JIS) + "\u3053\u308c\u306f\u53e4\u3044\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('shift_jis'), + ] + + def setUp(self): + self.addCleanup(unlink, TESTFN) + # Create .zip of 3 members with Han names encoded in Shift JIS. + # Each name is 1 Han character encoding to 2 bytes in Shift JIS. + # The ASCII names are arbitrary as long as they are length 2 and + # not otherwise contained in the zip file. + # Data elements are encoded bytes (ascii, utf-8, shift_jis). + placeholders = ["n1", "n2"] + self.file_names[2:] + with zipfile.ZipFile(TESTFN, mode="w") as tf: + for temp, content in zip(placeholders, self.file_content): + tf.writestr(temp, content, zipfile.ZIP_STORED) + # Hack in the Shift JIS names with flag bit 11 (UTF-8) unset. + with open(TESTFN, "rb") as tf: + data = tf.read() + for name, temp in zip(self.file_names, placeholders[:2]): + data = data.replace(temp.encode('ascii'), + name.encode('shift_jis')) + with open(TESTFN, "wb") as tf: + tf.write(data) + + def _test_read(self, zipfp, expected_names, expected_content): + # Check the namelist + names = zipfp.namelist() + self.assertEqual(sorted(names), sorted(expected_names)) + + # Check infolist + infos = zipfp.infolist() + names = [zi.filename for zi in infos] + self.assertEqual(sorted(names), sorted(expected_names)) + + # check getinfo + for name, content in zip(expected_names, expected_content): + info = zipfp.getinfo(name) + self.assertEqual(info.filename, name) + self.assertEqual(info.file_size, len(content)) + self.assertEqual(zipfp.read(name), content) + + def test_read_with_metadata_encoding(self): + # Read the ZIP archive with correct metadata_encoding + with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: + self._test_read(zipfp, self.file_names, self.file_content) + + def test_read_without_metadata_encoding(self): + # Read the ZIP archive without metadata_encoding + expected_names = [name.encode('shift_jis').decode('cp437') + for name in self.file_names[:2]] + self.file_names[2:] + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self._test_read(zipfp, expected_names, self.file_content) + + def test_read_with_incorrect_metadata_encoding(self): + # Read the ZIP archive with incorrect metadata_encoding + expected_names = [name.encode('shift_jis').decode('koi8-u') + for name in self.file_names[:2]] + self.file_names[2:] + with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp: + self._test_read(zipfp, expected_names, self.file_content) + + def test_read_with_unsuitable_metadata_encoding(self): + # Read the ZIP archive with metadata_encoding unsuitable for + # decoding metadata + with self.assertRaises(UnicodeDecodeError): + zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii') + with self.assertRaises(UnicodeDecodeError): + zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8') + + def test_read_after_append(self): + newname = '\u56db' # Han 'four' + expected_names = [name.encode('shift_jis').decode('cp437') + for name in self.file_names[:2]] + self.file_names[2:] + expected_names.append(newname) + expected_content = (*self.file_content, b"newcontent") + + with zipfile.ZipFile(TESTFN, "a") as zipfp: + zipfp.writestr(newname, "newcontent") + self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names)) + + with zipfile.ZipFile(TESTFN, "r") as zipfp: + self._test_read(zipfp, expected_names, expected_content) + + with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp: + self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names)) + for i, (name, content) in enumerate(zip(expected_names, expected_content)): + info = zipfp.getinfo(name) + self.assertEqual(info.filename, name) + self.assertEqual(info.file_size, len(content)) + if i < 2: + with self.assertRaises(zipfile.BadZipFile): + zipfp.read(name) + else: + self.assertEqual(zipfp.read(name), content) + + def test_write_with_metadata_encoding(self): + ZF = zipfile.ZipFile + for mode in ("w", "x", "a"): + with self.assertRaisesRegex(ValueError, + "^metadata_encoding is only"): + ZF("nonesuch.zip", mode, metadata_encoding="shift_jis") + + def test_cli_with_metadata_encoding(self): + errmsg = "Non-conforming encodings not supported with -c." + args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"] + with captured_stdout() as stdout: + with captured_stderr() as stderr: + self.assertRaises(SystemExit, zipfile.main, args) + self.assertEqual(stdout.getvalue(), "") + self.assertIn(errmsg, stderr.getvalue()) + + with captured_stdout() as stdout: + zipfile.main(["--metadata-encoding=shift_jis", "-t", TESTFN]) + listing = stdout.getvalue() + + with captured_stdout() as stdout: + zipfile.main(["--metadata-encoding=shift_jis", "-l", TESTFN]) + listing = stdout.getvalue() + for name in self.file_names: + self.assertIn(name, listing) + + os.mkdir(TESTFN2) + self.addCleanup(rmtree, TESTFN2) + zipfile.main(["--metadata-encoding=shift_jis", "-e", TESTFN, TESTFN2]) + listing = os.listdir(TESTFN2) + for name in self.file_names: + self.assertIn(name, listing) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 385adc8..721834a 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -480,7 +480,7 @@ class ZipInfo (object): def _encodeFilenameFlags(self): try: - return self.filename.encode('ascii'), self.flag_bits + return self.filename.encode('ascii'), self.flag_bits & ~_MASK_UTF_FILENAME except UnicodeEncodeError: return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME @@ -1240,7 +1240,7 @@ class ZipFile: _windows_illegal_name_trans_table = None def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, - compresslevel=None, *, strict_timestamps=True): + compresslevel=None, *, strict_timestamps=True, metadata_encoding=None): """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', or append 'a'.""" if mode not in ('r', 'w', 'x', 'a'): @@ -1259,6 +1259,12 @@ class ZipFile: self.pwd = None self._comment = b'' self._strict_timestamps = strict_timestamps + self.metadata_encoding = metadata_encoding + + # Check that we don't try to write with nonconforming codecs + if self.metadata_encoding and mode != 'r': + raise ValueError( + "metadata_encoding is only supported for reading files") # Check if we were passed a file-like object if isinstance(file, os.PathLike): @@ -1389,13 +1395,13 @@ class ZipFile: if self.debug > 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) - flags = centdir[5] + flags = centdir[_CD_FLAG_BITS] if flags & _MASK_UTF_FILENAME: # UTF-8 file names extension filename = filename.decode('utf-8') else: # Historical ZIP filename encoding - filename = filename.decode('cp437') + filename = filename.decode(self.metadata_encoding or 'cp437') # Create ZipInfo instance to store file information x = ZipInfo(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) @@ -1572,7 +1578,7 @@ class ZipFile: # UTF-8 filename fname_str = fname.decode("utf-8") else: - fname_str = fname.decode("cp437") + fname_str = fname.decode(self.metadata_encoding or "cp437") if fname_str != zinfo.orig_filename: raise BadZipFile( @@ -2461,11 +2467,15 @@ def main(args=None): help='Create zipfile from sources') group.add_argument('-t', '--test', metavar='', help='Test if a zipfile is valid') + parser.add_argument('--metadata-encoding', metavar='', + help='Specify encoding of member names for -l, -e and -t') args = parser.parse_args(args) + encoding = args.metadata_encoding + if args.test is not None: src = args.test - with ZipFile(src, 'r') as zf: + with ZipFile(src, 'r', metadata_encoding=encoding) as zf: badfile = zf.testzip() if badfile: print("The following enclosed file is corrupted: {!r}".format(badfile)) @@ -2473,15 +2483,20 @@ def main(args=None): elif args.list is not None: src = args.list - with ZipFile(src, 'r') as zf: + with ZipFile(src, 'r', metadata_encoding=encoding) as zf: zf.printdir() elif args.extract is not None: src, curdir = args.extract - with ZipFile(src, 'r') as zf: + with ZipFile(src, 'r', metadata_encoding=encoding) as zf: zf.extractall(curdir) elif args.create is not None: + if encoding: + print("Non-conforming encodings not supported with -c.", + file=sys.stderr) + sys.exit(1) + zip_name = args.create.pop(0) files = args.create diff --git a/Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst b/Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst new file mode 100644 index 0000000..08428e6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst @@ -0,0 +1,4 @@ +Add the *metadata_encoding* parameter in the :class:`zipfile.ZipFile` +constructor and the ``--metadata-encoding`` option in the :mod:`zipfile` +CLI to allow reading zipfiles using non-standard codecs to encode the +filenames within the archive. -- cgit v0.12