summaryrefslogtreecommitdiffstats
path: root/generic/tclBinary.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclBinary.c')
-rw-r--r--generic/tclBinary.c178
1 files changed, 134 insertions, 44 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index d0d9d5e..5772bc6 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -155,35 +155,108 @@ static const EnsembleImplMap decodeMap[] = {
};
/*
- * The following object type represents an array of bytes. An array of bytes
- * is not equivalent to an internationalized string. Conceptually, a string is
- * an array of 16-bit quantities organized as a sequence of properly formed
- * UTF-8 characters, while a ByteArray is an array of 8-bit quantities.
- * Accessor functions are provided to convert a ByteArray to a String or a
- * String to a ByteArray. Two or more consecutive bytes in an array of bytes
- * may look like a single UTF-8 character if the array is casually treated as
- * a string. But obtaining the String from a ByteArray is guaranteed to
- * produced properly formed UTF-8 sequences so that there is a one-to-one map
- * between bytes and characters.
- *
- * Converting a ByteArray to a String proceeds by casting each byte in the
- * array to a 16-bit quantity, treating that number as a Unicode character,
- * and storing the UTF-8 version of that Unicode character in the String. For
- * ByteArrays consisting entirely of values 1..127, the corresponding String
- * representation is the same as the ByteArray representation.
- *
- * Converting a String to a ByteArray proceeds by getting the Unicode
- * representation of each character in the String, casting it to a byte by
- * truncating the upper 8 bits, and then storing the byte in the ByteArray.
- * Converting from ByteArray to String and back to ByteArray is not lossy, but
- * converting an arbitrary String to a ByteArray may be.
+ * The following object types represent an array of bytes. The intent is
+ * to allow arbitrary binary data to pass through Tcl as a Tcl value
+ * without loss or damage. Such values are useful for things like
+ * encoded strings or Tk images to name just two.
+ *
+ * It's strange to have two Tcl_ObjTypes in place for this task when
+ * one would do, so a bit of detail and history how we got to this point
+ * and where we might go from here.
+ *
+ * A bytearray is an ordered sequence of bytes. Each byte is an integer
+ * value in the range [0-255]. To be a Tcl value type, we need a way to
+ * encode each value in the value set as a Tcl string. The simplest
+ * encoding is to represent each byte value as the same codepoint value.
+ * A bytearray of N bytes is encoded into a Tcl string of N characters
+ * where the codepoint of each character is the value of corresponding byte.
+ * This approach creates a one-to-one map between all bytearray values
+ * and a subset of Tcl string values.
+ *
+ * When converting a Tcl string value to the bytearray internal rep, the
+ * question arises what to do with strings outside that subset? That is,
+ * those Tcl strings containing at least one codepoint greater than 255?
+ * The obviously correct answer is to raise an error! That string value
+ * does not represent any valid bytearray value. Full Stop. The
+ * setFromAnyProc signature has a completion code return value for just
+ * this reason, to reject invalid inputs.
+ *
+ * Unfortunately this was not the path taken by the authors of the
+ * original tclByteArrayType. They chose to accept all Tcl string values
+ * as acceptable string encodings of the bytearray values that result
+ * from masking away the high bits of any codepoint value at all. This
+ * meant that every bytearray value had multiple accepted string
+ * representations.
+ *
+ * The implications of this choice are truly ugly. When a Tcl value has
+ * a string representation, we are required to accept that as the true
+ * value. Bytearray values that possess a string representation cannot
+ * be processed as bytearrays because we cannot know which true value
+ * that bytearray represents. The consequence is that we drag around
+ * an internal rep that we cannot make any use of. This painful price
+ * is extracted at any point after a string rep happens to be generated
+ * for the value. This happens even when the troublesome codepoints
+ * outside the byte range never show up. This happens rather routinely
+ * in normal Tcl operations unless we burden the script writer with the
+ * cognitive burden of avoiding it. The price is also paid by callers
+ * of the C interface. The routine
+ *
+ * unsigned char *Tcl_GetByteArrayFromObj(objPtr, lenPtr)
+ *
+ * has a guarantee to always return a non-NULL value, but that value
+ * points to a byte sequence that cannot be used by the caller to
+ * process the Tcl value absent some sideband testing that objPtr
+ * is "pure". Tcl offers no public interface to perform this test,
+ * so callers either break encapsulation or are unavoidably buggy. Tcl
+ * has defined a public interface that cannot be used correctly. The
+ * Tcl source code itself suffers the same problem, and has been buggy,
+ * but progressively less so as more and more portions of the code have
+ * been retrofitted with the required "purity testing". The set of values
+ * able to pass the purity test can be increased via the introduction of
+ * a "canonical" flag marker, but the only way the broken interface itself
+ * can be discarded is to start over and define the Tcl_ObjType properly.
+ * Bytearrays should simply be usable as bytearrays without a kabuki
+ * dance of testing.
+ *
+ * The Tcl_ObjType "properByteArrayType" is (nearly) a correct
+ * implementation of bytearrays. Any Tcl value with the type
+ * properByteArrayType can have its bytearray value fetched and
+ * used with confidence that acting on that value is equivalent to
+ * acting on the true Tcl string value. This still implies a side
+ * testing burden -- past mistakes will not let us avoid that
+ * immediately, but it is at least a conventional test of type, and
+ * can be implemented entirely by examining the objPtr fields, with
+ * no need to query the intrep, as a canonical flag would require.
+ *
+ * Until Tcl_GetByteArrayFromObj() and Tcl_SetByteArrayLength() can
+ * be revised to admit the possibility of returning NULL when the true
+ * value is not a valid bytearray, we need a mechanism to retain
+ * compatibility with the deployed callers of the broken interface.
+ * That's what the retained "tclByteArrayType" provides. In those
+ * unusual circumstances where we convert an invalid bytearray value
+ * to a bytearray type, it is to this legacy type. Essentially any
+ * time this legacy type gets used, it's a signal of a bug being ignored.
+ * A TIP should be drafted to remove this connection to the broken past
+ * so that Tcl 9 will no longer have any trace of it. Prescribing a
+ * migration path will be the key element of that work. The internal
+ * changes now in place are the limit of what can be done short of
+ * interface repair. They provide a great expansion of the histories
+ * over which bytearray values can be useful in the meanwhile.
*/
-const Tcl_ObjType tclByteArrayType = {
+static const Tcl_ObjType properByteArrayType = {
"bytearray",
FreeByteArrayInternalRep,
DupByteArrayInternalRep,
UpdateStringOfByteArray,
+ NULL
+};
+
+const Tcl_ObjType tclByteArrayType = {
+ "bytearray",
+ FreeByteArrayInternalRep,
+ DupByteArrayInternalRep,
+ NULL,
SetByteArrayFromAny
};
@@ -211,6 +284,12 @@ typedef struct ByteArray {
#define SET_BYTEARRAY(objPtr, baPtr) \
(objPtr)->internalRep.twoPtrValue.ptr1 = (void *) (baPtr)
+int
+TclIsPureByteArray(
+ Tcl_Obj * objPtr)
+{
+ return (objPtr->typePtr == &properByteArrayType);
+}
/*
*----------------------------------------------------------------------
@@ -341,7 +420,7 @@ Tcl_SetByteArrayObj(
if ((bytes != NULL) && (length > 0)) {
memcpy(byteArrayPtr->bytes, bytes, (size_t) length);
}
- objPtr->typePtr = &tclByteArrayType;
+ objPtr->typePtr = &properByteArrayType;
SET_BYTEARRAY(objPtr, byteArrayPtr);
}
@@ -371,7 +450,8 @@ Tcl_GetByteArrayFromObj(
{
ByteArray *baPtr;
- if (objPtr->typePtr != &tclByteArrayType) {
+ if ((objPtr->typePtr != &properByteArrayType)
+ && (objPtr->typePtr != &tclByteArrayType)) {
SetByteArrayFromAny(NULL, objPtr);
}
baPtr = GET_BYTEARRAY(objPtr);
@@ -414,7 +494,8 @@ Tcl_SetByteArrayLength(
if (Tcl_IsShared(objPtr)) {
Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayLength");
}
- if (objPtr->typePtr != &tclByteArrayType) {
+ if ((objPtr->typePtr != &properByteArrayType)
+ && (objPtr->typePtr != &tclByteArrayType)) {
SetByteArrayFromAny(NULL, objPtr);
}
@@ -450,29 +531,37 @@ SetByteArrayFromAny(
Tcl_Interp *interp, /* Not used. */
Tcl_Obj *objPtr) /* The object to convert to type ByteArray. */
{
- int length;
+ size_t length;
+ int improper = 0;
const char *src, *srcEnd;
unsigned char *dst;
ByteArray *byteArrayPtr;
Tcl_UniChar ch;
- if (objPtr->typePtr != &tclByteArrayType) {
- src = TclGetStringFromObj(objPtr, &length);
- srcEnd = src + length;
-
- byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length));
- for (dst = byteArrayPtr->bytes; src < srcEnd; ) {
- src += TclUtfToUniChar(src, &ch);
- *dst++ = UCHAR(ch);
- }
+ if (objPtr->typePtr == &properByteArrayType) {
+ return TCL_OK;
+ }
+ if (objPtr->typePtr == &tclByteArrayType) {
+ return TCL_OK;
+ }
- byteArrayPtr->used = dst - byteArrayPtr->bytes;
- byteArrayPtr->allocated = length;
+ src = TclGetString(objPtr);
+ length = objPtr->length;
+ srcEnd = src + length;
- TclFreeIntRep(objPtr);
- objPtr->typePtr = &tclByteArrayType;
- SET_BYTEARRAY(objPtr, byteArrayPtr);
+ byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length));
+ for (dst = byteArrayPtr->bytes; src < srcEnd; ) {
+ src += TclUtfToUniChar(src, &ch);
+ improper = improper || (ch > 255);
+ *dst++ = UCHAR(ch);
}
+
+ byteArrayPtr->used = dst - byteArrayPtr->bytes;
+ byteArrayPtr->allocated = length;
+
+ TclFreeIntRep(objPtr);
+ objPtr->typePtr = improper ? &tclByteArrayType : &properByteArrayType;
+ SET_BYTEARRAY(objPtr, byteArrayPtr);
return TCL_OK;
}
@@ -535,7 +624,7 @@ DupByteArrayInternalRep(
memcpy(copyArrayPtr->bytes, srcArrayPtr->bytes, (size_t) length);
SET_BYTEARRAY(copyPtr, copyArrayPtr);
- copyPtr->typePtr = &tclByteArrayType;
+ copyPtr->typePtr = srcPtr->typePtr;
}
/*
@@ -642,7 +731,8 @@ TclAppendBytesToByteArray(
/* Append zero bytes is a no-op. */
return;
}
- if (objPtr->typePtr != &tclByteArrayType) {
+ if ((objPtr->typePtr != &properByteArrayType)
+ && (objPtr->typePtr != &tclByteArrayType)) {
SetByteArrayFromAny(NULL, objPtr);
}
byteArrayPtr = GET_BYTEARRAY(objPtr);
@@ -2508,7 +2598,7 @@ BinaryEncode64(
}
break;
case OPT_WRAPCHAR:
- wrapchar = Tcl_GetStringFromObj(objv[i+1], &wrapcharlen);
+ wrapchar = TclGetStringFromObj(objv[i+1], &wrapcharlen);
if (wrapcharlen == 0) {
maxlen = 0;
}