summary refs log tree commit diff stats
path: root/Lib
diff options
context:
space:
mode:
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/test/test_zipfile.py 140
-rw-r--r-- Lib/zipfile.py 31
2 files changed, 161 insertions, 10 deletions
diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py
index 759a4ab..26c4045 100644
--- a/Lib/test/test_zipfile.py
+++ b/Lib/test/test_zipfile.py
@@ -21,8 +21,10 @@ from tempfile import TemporaryFile
from random import randint, random, randbytes
from test.support import script_helper
-from test.support import (findfile, requires_zlib, requires_bz2,
- requires_lzma, captured_stdout, requires_subprocess)
+from test.support import (
+ findfile, requires_zlib, requires_bz2, requires_lzma,
+ captured_stdout, captured_stderr, requires_subprocess
+)
from test.support.os_helper import (
TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count
)
@@ -3210,5 +3212,139 @@ class TestPath(unittest.TestCase):
assert isinstance(file, cls)
+class EncodedMetadataTests(unittest.TestCase):
+ file_names = ['\u4e00', '\u4e8c', '\u4e09'] # Han 'one', 'two', 'three'
+ file_content = [
+ "This is pure ASCII.\n".encode('ascii'),
+ # This is modern Japanese. (UTF-8)
+ "\u3053\u308c\u306f\u73fe\u4ee3\u7684\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('utf-8'),
+ # This is obsolete Japanese. (Shift JIS)
+ "\u3053\u308c\u306f\u53e4\u3044\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('shift_jis'),
+ ]
+
+ def setUp(self):
+ self.addCleanup(unlink, TESTFN)
+ # Create .zip of 3 members with Han names encoded in Shift JIS.
+ # Each name is 1 Han character encoding to 2 bytes in Shift JIS.
+ # The ASCII names are arbitrary as long as they are length 2 and
+ # not otherwise contained in the zip file.
+ # Data elements are encoded bytes (ascii, utf-8, shift_jis).
+ placeholders = ["n1", "n2"] + self.file_names[2:]
+ with zipfile.ZipFile(TESTFN, mode="w") as tf:
+ for temp, content in zip(placeholders, self.file_content):
+ tf.writestr(temp, content, zipfile.ZIP_STORED)
+ # Hack in the Shift JIS names with flag bit 11 (UTF-8) unset.
+ with open(TESTFN, "rb") as tf:
+ data = tf.read()
+ for name, temp in zip(self.file_names, placeholders[:2]):
+ data = data.replace(temp.encode('ascii'),
+ name.encode('shift_jis'))
+ with open(TESTFN, "wb") as tf:
+ tf.write(data)
+
+ def _test_read(self, zipfp, expected_names, expected_content):
+ # Check the namelist
+ names = zipfp.namelist()
+ self.assertEqual(sorted(names), sorted(expected_names))
+
+ # Check infolist
+ infos = zipfp.infolist()
+ names = [zi.filename for zi in infos]
+ self.assertEqual(sorted(names), sorted(expected_names))
+
+ # check getinfo
+ for name, content in zip(expected_names, expected_content):
+ info = zipfp.getinfo(name)
+ self.assertEqual(info.filename, name)
+ self.assertEqual(info.file_size, len(content))
+ self.assertEqual(zipfp.read(name), content)
+
+ def test_read_with_metadata_encoding(self):
+ # Read the ZIP archive with correct metadata_encoding
+ with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
+ self._test_read(zipfp, self.file_names, self.file_content)
+
+ def test_read_without_metadata_encoding(self):
+ # Read the ZIP archive without metadata_encoding
+ expected_names = [name.encode('shift_jis').decode('cp437')
+ for name in self.file_names[:2]] + self.file_names[2:]
+ with zipfile.ZipFile(TESTFN, "r") as zipfp:
+ self._test_read(zipfp, expected_names, self.file_content)
+
+ def test_read_with_incorrect_metadata_encoding(self):
+ # Read the ZIP archive with incorrect metadata_encoding
+ expected_names = [name.encode('shift_jis').decode('koi8-u')
+ for name in self.file_names[:2]] + self.file_names[2:]
+ with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp:
+ self._test_read(zipfp, expected_names, self.file_content)
+
+ def test_read_with_unsuitable_metadata_encoding(self):
+ # Read the ZIP archive with metadata_encoding unsuitable for
+ # decoding metadata
+ with self.assertRaises(UnicodeDecodeError):
+ zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii')
+ with self.assertRaises(UnicodeDecodeError):
+ zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8')
+
+ def test_read_after_append(self):
+ newname = '\u56db' # Han 'four'
+ expected_names = [name.encode('shift_jis').decode('cp437')
+ for name in self.file_names[:2]] + self.file_names[2:]
+ expected_names.append(newname)
+ expected_content = (*self.file_content, b"newcontent")
+
+ with zipfile.ZipFile(TESTFN, "a") as zipfp:
+ zipfp.writestr(newname, "newcontent")
+ self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
+
+ with zipfile.ZipFile(TESTFN, "r") as zipfp:
+ self._test_read(zipfp, expected_names, expected_content)
+
+ with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
+ self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
+ for i, (name, content) in enumerate(zip(expected_names, expected_content)):
+ info = zipfp.getinfo(name)
+ self.assertEqual(info.filename, name)
+ self.assertEqual(info.file_size, len(content))
+ if i < 2:
+ with self.assertRaises(zipfile.BadZipFile):
+ zipfp.read(name)
+ else:
+ self.assertEqual(zipfp.read(name), content)
+
+ def test_write_with_metadata_encoding(self):
+ ZF = zipfile.ZipFile
+ for mode in ("w", "x", "a"):
+ with self.assertRaisesRegex(ValueError,
+ "^metadata_encoding is only"):
+ ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
+
+ def test_cli_with_metadata_encoding(self):
+ errmsg = "Non-conforming encodings not supported with -c."
+ args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"]
+ with captured_stdout() as stdout:
+ with captured_stderr() as stderr:
+ self.assertRaises(SystemExit, zipfile.main, args)
+ self.assertEqual(stdout.getvalue(), "")
+ self.assertIn(errmsg, stderr.getvalue())
+
+ with captured_stdout() as stdout:
+ zipfile.main(["--metadata-encoding=shift_jis", "-t", TESTFN])
+ listing = stdout.getvalue()
+
+ with captured_stdout() as stdout:
+ zipfile.main(["--metadata-encoding=shift_jis", "-l", TESTFN])
+ listing = stdout.getvalue()
+ for name in self.file_names:
+ self.assertIn(name, listing)
+
+ os.mkdir(TESTFN2)
+ self.addCleanup(rmtree, TESTFN2)
+ zipfile.main(["--metadata-encoding=shift_jis", "-e", TESTFN, TESTFN2])
+ listing = os.listdir(TESTFN2)
+ for name in self.file_names:
+ self.assertIn(name, listing)
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 385adc8..721834a 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -480,7 +480,7 @@ class ZipInfo (object):
def _encodeFilenameFlags(self):
try:
- return self.filename.encode('ascii'), self.flag_bits
+ return self.filename.encode('ascii'), self.flag_bits & ~_MASK_UTF_FILENAME
except UnicodeEncodeError:
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
@@ -1240,7 +1240,7 @@ class ZipFile:
_windows_illegal_name_trans_table = None
def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
- compresslevel=None, *, strict_timestamps=True):
+ compresslevel=None, *, strict_timestamps=True, metadata_encoding=None):
"""Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
or append 'a'."""
if mode not in ('r', 'w', 'x', 'a'):
@@ -1259,6 +1259,12 @@ class ZipFile:
self.pwd = None
self._comment = b''
self._strict_timestamps = strict_timestamps
+ self.metadata_encoding = metadata_encoding
+
+ # Check that we don't try to write with nonconforming codecs
+ if self.metadata_encoding and mode != 'r':
+ raise ValueError(
+ "metadata_encoding is only supported for reading files")
# Check if we were passed a file-like object
if isinstance(file, os.PathLike):
@@ -1389,13 +1395,13 @@ class ZipFile:
if self.debug > 2:
print(centdir)
filename = fp.read(centdir[_CD_FILENAME_LENGTH])
- flags = centdir[5]
+ flags = centdir[_CD_FLAG_BITS]
if flags & _MASK_UTF_FILENAME:
# UTF-8 file names extension
filename = filename.decode('utf-8')
else:
# Historical ZIP filename encoding
- filename = filename.decode('cp437')
+ filename = filename.decode(self.metadata_encoding or 'cp437')
# Create ZipInfo instance to store file information
x = ZipInfo(filename)
x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
@@ -1572,7 +1578,7 @@ class ZipFile:
# UTF-8 filename
fname_str = fname.decode("utf-8")
else:
- fname_str = fname.decode("cp437")
+ fname_str = fname.decode(self.metadata_encoding or "cp437")
if fname_str != zinfo.orig_filename:
raise BadZipFile(
@@ -2461,11 +2467,15 @@ def main(args=None):
help='Create zipfile from sources')
group.add_argument('-t', '--test', metavar='<zipfile>',
help='Test if a zipfile is valid')
+ parser.add_argument('--metadata-encoding', metavar='<encoding>',
+ help='Specify encoding of member names for -l, -e and -t')
args = parser.parse_args(args)
+ encoding = args.metadata_encoding
+
if args.test is not None:
src = args.test
- with ZipFile(src, 'r') as zf:
+ with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
badfile = zf.testzip()
if badfile:
print("The following enclosed file is corrupted: {!r}".format(badfile))
@@ -2473,15 +2483,20 @@ def main(args=None):
elif args.list is not None:
src = args.list
- with ZipFile(src, 'r') as zf:
+ with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
zf.printdir()
elif args.extract is not None:
src, curdir = args.extract
- with ZipFile(src, 'r') as zf:
+ with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
zf.extractall(curdir)
elif args.create is not None:
+ if encoding:
+ print("Non-conforming encodings not supported with -c.",
+ file=sys.stderr)
+ sys.exit(1)
+
zip_name = args.create.pop(0)
files = args.create