summaryrefslogtreecommitdiffstats
path: root/generic/tclZipfs.c
diff options
context:
space:
mode:
authordkf <donal.k.fellows@manchester.ac.uk>2021-03-18 21:35:09 (GMT)
committerdkf <donal.k.fellows@manchester.ac.uk>2021-03-18 21:35:09 (GMT)
commitf2cb38dda94341ddbb64b3b72efdeeb62ec610be (patch)
treeed2981683cfa27b6d8b315a3549d55ac8498b744 /generic/tclZipfs.c
parent80af639d396a0e631190bba05dfaaa1653dba265 (diff)
downloadtcl-f2cb38dda94341ddbb64b3b72efdeeb62ec610be.zip
tcl-f2cb38dda94341ddbb64b3b72efdeeb62ec610be.tar.gz
tcl-f2cb38dda94341ddbb64b3b72efdeeb62ec610be.tar.bz2
Handle the encoding of filenames in ZIPs correctly. (This is awful. The awful isn't Tcl's fault. Terrible spec...)
Diffstat (limited to 'generic/tclZipfs.c')
-rw-r--r--generic/tclZipfs.c199
1 files changed, 177 insertions, 22 deletions
diff --git a/generic/tclZipfs.c b/generic/tclZipfs.c
index feb5324..0b1963b 100644
--- a/generic/tclZipfs.c
+++ b/generic/tclZipfs.c
@@ -47,6 +47,7 @@
#define ZIPFS_VOLUME_LEN 9
#define ZIPFS_APP_MOUNT "//zipfs:/app"
#define ZIPFS_ZIP_MOUNT "//zipfs:/lib/tcl"
+#define ZIPFS_FALLBACK_ENCODING "cp437"
/*
* Various constants and offsets found in ZIP archive files
@@ -262,11 +263,19 @@ static struct {
int wrmax; /* Maximum write size of a file; only written
* to from Tcl code in a trusted interpreter,
* so NOT protected by mutex. */
+ char *fallbackEntryEncoding;/* The fallback encoding for ZIP entries when
+ * they are believed to not be UTF-8; only
+ * written to from Tcl code in a trusted
+ * interpreter, so not protected by mutex. */
+ Tcl_Encoding utf8; /* The UTF-8 encoding that we prefer to use
+ * for the strings (especially filenames)
+ * embedded in a ZIP. Other encodings are used
+ * dynamically. */
int idCount; /* Counter for channel names */
Tcl_HashTable fileHash; /* File name to ZipEntry mapping */
Tcl_HashTable zipHash; /* Mount to ZipFile mapping */
} ZipFS = {
- 0, 0, 0, DEFAULT_WRITE_MAX_SIZE, 0,
+ 0, 0, 0, DEFAULT_WRITE_MAX_SIZE, NULL, NULL, 0,
{0,{0,0,0,0},0,0,0,0,0,0,0,0,0},
{0,{0,0,0,0},0,0,0,0,0,0,0,0,0}
};
@@ -686,6 +695,115 @@ CountSlashes(
/*
*-------------------------------------------------------------------------
*
+ * DecodeZipEntryText --
+ *
+ * Given a sequence of bytes from an entry in a ZIP central directory,
+ * convert that into a Tcl string. This is complicated because we don't
+ * actually know what encoding is in use! So we try to use UTF-8, and if
+ * that goes wrong, we fall back to a user-specified encoding, or to an
+ * encoding we specify (Windows code page 437), or to ISO 8859-1 if
+ * absolutely nothing else works.
+ *
+ * During Tcl startup, we skip the user-specified encoding and cp437, as
+ * we may well not have any loadable encodings yet. Tcl's own library
+ * files ought to be using ASCII filenames.
+ *
+ * Results:
+ * The decoded filename; the filename is owned by the argument DString.
+ *
+ * Side effects:
+ * Updates dstPtr.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+static char *
+DecodeZipEntryText(
+ const unsigned char *inputBytes,
+ unsigned int inputLength,
+ Tcl_DString *dstPtr)
+{
+ Tcl_Encoding encoding;
+ const char *src;
+ char *dst;
+ int dstLen, srcLen = inputLength, flags;
+ Tcl_EncodingState state;
+
+ Tcl_DStringInit(dstPtr);
+ if (inputLength < 1) {
+ return Tcl_DStringValue(dstPtr);
+ }
+
+ /*
+ * We can't use Tcl_ExternalToUtfDString at this point; it has no way to
+ * fail. So we use this modified version of it that can report encoding
+ * errors to us (so we can fall back to something else).
+ *
+ * The utf-8 encoding is implemented internally, and so is guaranteed to
+ * be present.
+ */
+
+ src = (const char *) inputBytes;
+ dst = Tcl_DStringValue(dstPtr);
+ dstLen = dstPtr->spaceAvl - 1;
+ flags = TCL_ENCODING_START | TCL_ENCODING_END |
+ TCL_ENCODING_STOPONERROR; /* Special flag! */
+
+ while (1) {
+ int srcRead, dstWrote;
+ int result = Tcl_ExternalToUtf(NULL, ZipFS.utf8, src, srcLen, flags,
+ &state, dst, dstLen, &srcRead, &dstWrote, NULL);
+ int soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
+
+ if (result == TCL_OK) {
+ Tcl_DStringSetLength(dstPtr, soFar);
+ return Tcl_DStringValue(dstPtr);
+ } else if (result != TCL_CONVERT_NOSPACE) {
+ break;
+ }
+
+ flags &= ~TCL_ENCODING_START;
+ src += srcRead;
+ srcLen -= srcRead;
+ if (Tcl_DStringLength(dstPtr) == 0) {
+ Tcl_DStringSetLength(dstPtr, dstLen);
+ }
+ Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
+ dst = Tcl_DStringValue(dstPtr) + soFar;
+ dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
+ }
+
+ /*
+ * Something went wrong. Fall back to another encoding. Those *can* use
+ * Tcl_ExternalToUtfDString().
+ */
+
+ encoding = NULL;
+ if (ZipFS.fallbackEntryEncoding) {
+ encoding = Tcl_GetEncoding(NULL, ZipFS.fallbackEntryEncoding);
+ }
+ if (!encoding) {
+ encoding = Tcl_GetEncoding(NULL, ZIPFS_FALLBACK_ENCODING);
+ }
+ if (!encoding) {
+ /*
+ * Fallback to internal encoding that always converts all bytes.
+ * Should only happen when a filename isn't UTF-8 and we've not got
+ * our encodings initialised for some reason.
+ */
+
+ encoding = Tcl_GetEncoding(NULL, "iso8859-1");
+ }
+
+ char *converted = Tcl_ExternalToUtfDString(encoding,
+ (const char *) inputBytes, inputLength, dstPtr);
+ Tcl_FreeEncoding(encoding);
+ return converted;
+}
+
+/*
+ *-------------------------------------------------------------------------
+ *
* CanonicalPath --
*
* This function computes the canonical path from a directory and file
@@ -1546,9 +1664,7 @@ ZipFSCatalogFilesystem(
pathlen = ZipReadShort(start, end, q + ZIP_CENTRAL_PATHLEN_OFFS);
comlen = ZipReadShort(start, end, q + ZIP_CENTRAL_FCOMMENTLEN_OFFS);
extra = ZipReadShort(start, end, q + ZIP_CENTRAL_EXTRALEN_OFFS);
- Tcl_DStringSetLength(&ds, 0);
- Tcl_DStringAppend(&ds, (char *) q + ZIP_CENTRAL_HEADER_LEN, pathlen);
- path = Tcl_DStringValue(&ds);
+ path = DecodeZipEntryText(q + ZIP_CENTRAL_HEADER_LEN, pathlen, &ds);
if ((pathlen > 0) && (path[pathlen - 1] == '/')) {
Tcl_DStringSetLength(&ds, pathlen - 1);
path = Tcl_DStringValue(&ds);
@@ -1738,6 +1854,10 @@ ZipfsSetup(void)
Tcl_InitHashTable(&ZipFS.zipHash, TCL_STRING_KEYS);
ZipFS.idCount = 1;
ZipFS.wrmax = DEFAULT_WRITE_MAX_SIZE;
+ ZipFS.fallbackEntryEncoding =
+ Tcl_Alloc(strlen(ZIPFS_FALLBACK_ENCODING) + 1);
+ strcpy(ZipFS.fallbackEntryEncoding, ZIPFS_FALLBACK_ENCODING);
+ ZipFS.utf8 = Tcl_GetEncoding(NULL, "utf-8");
ZipFS.initialized = 1;
}
@@ -2357,6 +2477,9 @@ RandomChar(
* input file is added to the given fileHash table for later creation of
* the central ZIP directory.
*
+ * Tcl *always* encodes filenames in the ZIP as UTF-8. Similarly, it
+ * would always encode comments as UTF-8, if it supported comments.
+ *
* Results:
* A standard Tcl result.
*
@@ -2371,7 +2494,8 @@ static int
ZipAddFile(
Tcl_Interp *interp, /* Current interpreter. */
Tcl_Obj *pathObj, /* Actual name of the file to add. */
- const char *name, /* Name to use in the ZIP archive. */
+ const char *name, /* Name to use in the ZIP archive, in Tcl's
+ * internal encoding. */
Tcl_Channel out, /* The open ZIP archive being built. */
const char *passwd, /* Password for encoding the file, or NULL if
* the file is to be unprotected. */
@@ -2386,7 +2510,10 @@ ZipAddFile(
Tcl_HashEntry *hPtr;
ZipEntry *z;
z_stream stream;
- const char *zpath;
+ Tcl_DString zpathDs; /* Buffer for the encoded filename. */
+ const char *zpathExt; /* Filename in external encoding (true
+ * UTF-8). */
+ const char *zpathTcl; /* Filename in Tcl's internal encoding. */
int crc, flush, zpathlen;
size_t nbyte, nbytecompr, len, olen, align = 0;
long long headerStartOffset, dataStartOffset, dataEndOffset;
@@ -2399,23 +2526,31 @@ ZipAddFile(
* nothing to do.
*/
- zpath = name;
- while (zpath && zpath[0] == '/') {
- zpath++;
+ zpathTcl = name;
+ while (zpathTcl && zpathTcl[0] == '/') {
+ zpathTcl++;
}
- if (!zpath || (zpath[0] == '\0')) {
+ if (!zpathTcl || (zpathTcl[0] == '\0')) {
return TCL_OK;
}
- zpathlen = strlen(zpath);
+ /*
+ * Convert to encoded form. Note that we use strlen() here; if someone's
+ * crazy enough to embed NULs in filenames, they deserve what they get!
+ */
+
+ zpathExt = Tcl_UtfToExternalDString(ZipFS.utf8, zpathTcl, -1, &zpathDs);
+ zpathlen = strlen(zpathExt);
if (zpathlen + ZIP_CENTRAL_HEADER_LEN > bufsize) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"path too long for \"%s\"", Tcl_GetString(pathObj)));
ZIPFS_ERROR_CODE(interp, "PATH_LEN");
+ Tcl_DStringFree(&zpathDs);
return TCL_ERROR;
}
in = Tcl_FSOpenFileChannel(interp, pathObj, "rb", 0);
if (!in) {
+ Tcl_DStringFree(&zpathDs);
#ifdef _WIN32
/* hopefully a directory */
if (strcmp("permission denied", Tcl_PosixError(interp)) == 0) {
@@ -2443,6 +2578,7 @@ ZipAddFile(
while (1) {
len = Tcl_Read(in, buf, bufsize);
if (len == ERROR_LENGTH) {
+ Tcl_DStringFree(&zpathDs);
if (nbyte == 0 && errno == EISDIR) {
Tcl_Close(interp, in);
return TCL_OK;
@@ -2463,6 +2599,7 @@ ZipAddFile(
Tcl_SetObjResult(interp, Tcl_ObjPrintf("seek error on \"%s\": %s",
Tcl_GetString(pathObj), Tcl_PosixError(interp)));
Tcl_Close(interp, in);
+ Tcl_DStringFree(&zpathDs);
return TCL_ERROR;
}
@@ -2479,7 +2616,7 @@ ZipAddFile(
*/
memset(buf, '\0', ZIP_LOCAL_HEADER_LEN);
- memcpy(buf + ZIP_LOCAL_HEADER_LEN, zpath, zpathlen);
+ memcpy(buf + ZIP_LOCAL_HEADER_LEN, zpathExt, zpathlen);
len = zpathlen + ZIP_LOCAL_HEADER_LEN;
if ((size_t) Tcl_Write(out, buf, len) != len) {
writeErrorWithChannelOpen:
@@ -2487,6 +2624,7 @@ ZipAddFile(
"write error on \"%s\": %s",
Tcl_GetString(pathObj), Tcl_PosixError(interp)));
Tcl_Close(interp, in);
+ Tcl_DStringFree(&zpathDs);
return TCL_ERROR;
}
@@ -2563,6 +2701,7 @@ ZipAddFile(
"compression init error on \"%s\"", Tcl_GetString(pathObj)));
ZIPFS_ERROR_CODE(interp, "DEFLATE_INIT");
Tcl_Close(interp, in);
+ Tcl_DStringFree(&zpathDs);
return TCL_ERROR;
}
@@ -2585,6 +2724,7 @@ ZipAddFile(
ZIPFS_ERROR_CODE(interp, "DEFLATE");
deflateEnd(&stream);
Tcl_Close(interp, in);
+ Tcl_DStringFree(&zpathDs);
return TCL_ERROR;
}
olen = sizeof(obuf) - stream.avail_out;
@@ -2625,6 +2765,7 @@ ZipAddFile(
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"seek error: %s", Tcl_PosixError(interp)));
Tcl_Close(interp, in);
+ Tcl_DStringFree(&zpathDs);
return TCL_ERROR;
}
nbytecompr = (passwd ? 12 : 0);
@@ -2660,8 +2801,10 @@ ZipAddFile(
Tcl_TruncateChannel(out, dataEndOffset);
}
Tcl_Close(interp, in);
+ Tcl_DStringFree(&zpathDs);
+ zpathExt = NULL;
- hPtr = Tcl_CreateHashEntry(fileHash, zpath, &isNew);
+ hPtr = Tcl_CreateHashEntry(fileHash, zpathTcl, &isNew);
if (!isNew) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"non-unique path name \"%s\"", Tcl_GetString(pathObj)));
@@ -2758,8 +2901,8 @@ ZipFSFind(
* should be skipped.
*
* Returns:
- * Pointer to the name, which will be in memory owned by one of the
- * argument objects.
+ * Pointer to the name (in Tcl's internal encoding), which will be in
+ * memory owned by one of the argument objects.
*
* Side effects:
* None (if Tcl_Objs have string representations)
@@ -2772,8 +2915,10 @@ ComputeNameInArchive(
Tcl_Obj *pathObj, /* The path to the origin file */
Tcl_Obj *directNameObj, /* User-specified name for use in the ZIP
* archive */
- const char *strip, /* A prefix to strip */
- int slen) /* The length of the prefix */
+ const char *strip, /* A prefix to strip; may be NULL if no
+ * stripping need be done. */
+ int slen) /* The length of the prefix; must be 0 if no
+ * stripping need be done. */
{
const char *name;
int len;
@@ -2811,6 +2956,9 @@ ComputeNameInArchive(
* file. It's the core of the implementation of [zipfs mkzip], [zipfs
* mkimg], [zipfs lmkzip] and [zipfs lmkimg].
*
+ * Tcl *always* encodes filenames in the ZIP as UTF-8. Similarly, it
+ * would always encode comments as UTF-8, if it supported comments.
+ *
* Results:
* A standard Tcl result.
*
@@ -3023,6 +3171,9 @@ ZipFSMkZipOrImg(
dataStartOffset = Tcl_Tell(out);
if (mappingList == NULL && stripPrefix != NULL) {
strip = Tcl_GetStringFromObj(stripPrefix, &slen);
+ if (!slen) {
+ strip = NULL;
+ }
}
for (i = 0; i < (size_t) lobjc; i += (mappingList ? 2 : 1)) {
Tcl_Obj *pathObj = lobjv[i];
@@ -3047,25 +3198,27 @@ ZipFSMkZipOrImg(
for (i = 0; i < (size_t) lobjc; i += (mappingList ? 2 : 1)) {
const char *name = ComputeNameInArchive(lobjv[i],
(mappingList ? lobjv[i + 1] : NULL), strip, slen);
+ Tcl_DString ds;
- if (name[0] == '\0') {
- continue;
- }
hPtr = Tcl_FindHashEntry(&fileHash, name);
if (!hPtr) {
continue;
}
z = (ZipEntry *) Tcl_GetHashValue(hPtr);
- len = strlen(z->name);
+
+ name = Tcl_UtfToExternalDString(ZipFS.utf8, z->name, -1, &ds);
+ len = Tcl_DStringLength(&ds);
SerializeCentralDirectoryEntry(start, end, (unsigned char *) buf,
z, len, dataStartOffset);
if ((Tcl_Write(out, buf, ZIP_CENTRAL_HEADER_LEN)
!= ZIP_CENTRAL_HEADER_LEN)
- || ((size_t) Tcl_Write(out, z->name, len) != len)) {
+ || ((size_t) Tcl_Write(out, name, len) != len)) {
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"write error: %s", Tcl_PosixError(interp)));
+ Tcl_DStringFree(&ds);
goto done;
}
+ Tcl_DStringFree(&ds);
count++;
}
@@ -5517,6 +5670,8 @@ TclZipfs_Init(
if (!Tcl_IsSafe(interp)) {
Tcl_LinkVar(interp, "::tcl::zipfs::wrmax", (char *) &ZipFS.wrmax,
TCL_LINK_INT);
+ Tcl_LinkVar(interp, "::tcl::zipfs::fallbackEntryEncoding",
+ (char *) &ZipFS.fallbackEntryEncoding, TCL_LINK_STRING);
}
ensemble = TclMakeEnsemble(interp, "zipfs",
Tcl_IsSafe(interp) ? (initMap + 4) : initMap);