diff options
Diffstat (limited to 'generic/tclBinary.c')
-rw-r--r-- | generic/tclBinary.c | 178 |
1 files changed, 134 insertions, 44 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c index 981f174..a3e5071 100644 --- a/generic/tclBinary.c +++ b/generic/tclBinary.c @@ -155,35 +155,108 @@ static const EnsembleImplMap decodeMap[] = { }; /* - * The following object type represents an array of bytes. An array of bytes - * is not equivalent to an internationalized string. Conceptually, a string is - * an array of 16-bit quantities organized as a sequence of properly formed - * UTF-8 characters, while a ByteArray is an array of 8-bit quantities. - * Accessor functions are provided to convert a ByteArray to a String or a - * String to a ByteArray. Two or more consecutive bytes in an array of bytes - * may look like a single UTF-8 character if the array is casually treated as - * a string. But obtaining the String from a ByteArray is guaranteed to - * produced properly formed UTF-8 sequences so that there is a one-to-one map - * between bytes and characters. - * - * Converting a ByteArray to a String proceeds by casting each byte in the - * array to a 16-bit quantity, treating that number as a Unicode character, - * and storing the UTF-8 version of that Unicode character in the String. For - * ByteArrays consisting entirely of values 1..127, the corresponding String - * representation is the same as the ByteArray representation. - * - * Converting a String to a ByteArray proceeds by getting the Unicode - * representation of each character in the String, casting it to a byte by - * truncating the upper 8 bits, and then storing the byte in the ByteArray. - * Converting from ByteArray to String and back to ByteArray is not lossy, but - * converting an arbitrary String to a ByteArray may be. + * The following object types represent an array of bytes. The intent is + * to allow arbitrary binary data to pass through Tcl as a Tcl value + * without loss or damage. Such values are useful for things like + * encoded strings or Tk images to name just two. + * + * It's strange to have two Tcl_ObjTypes in place for this task when + * one would do, so a bit of detail and history how we got to this point + * and where we might go from here. + * + * A bytearray is an ordered sequence of bytes. Each byte is an integer + * value in the range [0-255]. To be a Tcl value type, we need a way to + * encode each value in the value set as a Tcl string. The simplest + * encoding is to represent each byte value as the same codepoint value. + * A bytearray of N bytes is encoded into a Tcl string of N characters + * where the codepoint of each character is the value of corresponding byte. + * This approach creates a one-to-one map between all bytearray values + * and a subset of Tcl string values. + * + * When converting a Tcl string value to the bytearray internal rep, the + * question arises what to do with strings outside that subset? That is, + * those Tcl strings containing at least one codepoint greater than 255? + * The obviously correct answer is to raise an error! That string value + * does not represent any valid bytearray value. Full Stop. The + * setFromAnyProc signature has a completion code return value for just + * this reason, to reject invalid inputs. + * + * Unfortunately this was not the path taken by the authors of the + * original tclByteArrayType. They chose to accept all Tcl string values + * as acceptable string encodings of the bytearray values that result + * from masking away the high bits of any codepoint value at all. This + * meant that every bytearray value had multiple accepted string + * representations. + * + * The implications of this choice are truly ugly. When a Tcl value has + * a string representation, we are required to accept that as the true + * value. Bytearray values that possess a string representation cannot + * be processed as bytearrays because we cannot know which true value + * that bytearray represents. The consequence is that we drag around + * an internal rep that we cannot make any use of. This painful price + * is extracted at any point after a string rep happens to be generated + * for the value. This happens even when the troublesome codepoints + * outside the byte range never show up. This happens rather routinely + * in normal Tcl operations unless we burden the script writer with the + * cognitive burden of avoiding it. The price is also paid by callers + * of the C interface. The routine + * + * unsigned char *Tcl_GetByteArrayFromObj(objPtr, lenPtr) + * + * has a guarantee to always return a non-NULL value, but that value + * points to a byte sequence that cannot be used by the caller to + * process the Tcl value absent some sideband testing that objPtr + * is "pure". Tcl offers no public interface to perform this test, + * so callers either break encapsulation or are unavoidably buggy. Tcl + * has defined a public interface that cannot be used correctly. The + * Tcl source code itself suffers the same problem, and has been buggy, + * but progressively less so as more and more portions of the code have + * been retrofitted with the required "purity testing". The set of values + * able to pass the purity test can be increased via the introduction of + * a "canonical" flag marker, but the only way the broken interface itself + * can be discarded is to start over and define the Tcl_ObjType properly. + * Bytearrays should simply be usable as bytearrays without a kabuki + * dance of testing. + * + * The Tcl_ObjType "properByteArrayType" is (nearly) a correct + * implementation of bytearrays. Any Tcl value with the type + * properByteArrayType can have its bytearray value fetched and + * used with confidence that acting on that value is equivalent to + * acting on the true Tcl string value. This still implies a side + * testing burden -- past mistakes will not let us avoid that + * immediately, but it is at least a conventional test of type, and + * can be implemented entirely by examining the objPtr fields, with + * no need to query the intrep, as a canonical flag would require. + * + * Until Tcl_GetByteArrayFromObj() and Tcl_SetByteArrayLength() can + * be revised to admit the possibility of returning NULL when the true + * value is not a valid bytearray, we need a mechanism to retain + * compatibility with the deployed callers of the broken interface. + * That's what the retained "tclByteArrayType" provides. In those + * unusual circumstances where we convert an invalid bytearray value + * to a bytearray type, it is to this legacy type. Essentially any + * time this legacy type gets used, it's a signal of a bug being ignored. + * A TIP should be drafted to remove this connection to the broken past + * so that Tcl 9 will no longer have any trace of it. Prescribing a + * migration path will be the key element of that work. The internal + * changes now in place are the limit of what can be done short of + * interface repair. They provide a great expansion of the histories + * over which bytearray values can be useful in the meanwhile. */ -const Tcl_ObjType tclByteArrayType = { +static const Tcl_ObjType properByteArrayType = { "bytearray", FreeByteArrayInternalRep, DupByteArrayInternalRep, UpdateStringOfByteArray, + NULL +}; + +const Tcl_ObjType tclByteArrayType = { + "bytearray", + FreeByteArrayInternalRep, + DupByteArrayInternalRep, + NULL, SetByteArrayFromAny }; @@ -211,6 +284,12 @@ typedef struct ByteArray { #define SET_BYTEARRAY(objPtr, baPtr) \ (objPtr)->internalRep.twoPtrValue.ptr1 = (void *) (baPtr) +int +TclIsPureByteArray( + Tcl_Obj * objPtr) +{ + return (objPtr->typePtr == &properByteArrayType); +} /* *---------------------------------------------------------------------- @@ -341,7 +420,7 @@ Tcl_SetByteArrayObj( if ((bytes != NULL) && (length > 0)) { memcpy(byteArrayPtr->bytes, bytes, (size_t) length); } - objPtr->typePtr = &tclByteArrayType; + objPtr->typePtr = &properByteArrayType; SET_BYTEARRAY(objPtr, byteArrayPtr); } @@ -371,7 +450,8 @@ Tcl_GetByteArrayFromObj( { ByteArray *baPtr; - if (objPtr->typePtr != &tclByteArrayType) { + if ((objPtr->typePtr != &properByteArrayType) + && (objPtr->typePtr != &tclByteArrayType)) { SetByteArrayFromAny(NULL, objPtr); } baPtr = GET_BYTEARRAY(objPtr); @@ -414,7 +494,8 @@ Tcl_SetByteArrayLength( if (Tcl_IsShared(objPtr)) { Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayLength"); } - if (objPtr->typePtr != &tclByteArrayType) { + if ((objPtr->typePtr != &properByteArrayType) + && (objPtr->typePtr != &tclByteArrayType)) { SetByteArrayFromAny(NULL, objPtr); } @@ -450,29 +531,37 @@ SetByteArrayFromAny( Tcl_Interp *interp, /* Not used. */ Tcl_Obj *objPtr) /* The object to convert to type ByteArray. */ { - int length; + size_t length; + int improper = 0; const char *src, *srcEnd; unsigned char *dst; ByteArray *byteArrayPtr; Tcl_UniChar ch; - if (objPtr->typePtr != &tclByteArrayType) { - src = TclGetStringFromObj(objPtr, &length); - srcEnd = src + length; - - byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length)); - for (dst = byteArrayPtr->bytes; src < srcEnd; ) { - src += Tcl_UtfToUniChar(src, &ch); - *dst++ = UCHAR(ch); - } + if (objPtr->typePtr == &properByteArrayType) { + return TCL_OK; + } + if (objPtr->typePtr == &tclByteArrayType) { + return TCL_OK; + } - byteArrayPtr->used = dst - byteArrayPtr->bytes; - byteArrayPtr->allocated = length; + src = TclGetString(objPtr); + length = objPtr->length; + srcEnd = src + length; - TclFreeIntRep(objPtr); - objPtr->typePtr = &tclByteArrayType; - SET_BYTEARRAY(objPtr, byteArrayPtr); + byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length)); + for (dst = byteArrayPtr->bytes; src < srcEnd; ) { + src += Tcl_UtfToUniChar(src, &ch); + improper = improper || (ch > 255); + *dst++ = UCHAR(ch); } + + byteArrayPtr->used = dst - byteArrayPtr->bytes; + byteArrayPtr->allocated = length; + + TclFreeIntRep(objPtr); + objPtr->typePtr = improper ? &tclByteArrayType : &properByteArrayType; + SET_BYTEARRAY(objPtr, byteArrayPtr); return TCL_OK; } @@ -535,7 +624,7 @@ DupByteArrayInternalRep( memcpy(copyArrayPtr->bytes, srcArrayPtr->bytes, (size_t) length); SET_BYTEARRAY(copyPtr, copyArrayPtr); - copyPtr->typePtr = &tclByteArrayType; + copyPtr->typePtr = srcPtr->typePtr; } /* @@ -642,7 +731,8 @@ TclAppendBytesToByteArray( /* Append zero bytes is a no-op. */ return; } - if (objPtr->typePtr != &tclByteArrayType) { + if ((objPtr->typePtr != &properByteArrayType) + && (objPtr->typePtr != &tclByteArrayType)) { SetByteArrayFromAny(NULL, objPtr); } byteArrayPtr = GET_BYTEARRAY(objPtr); @@ -2500,7 +2590,7 @@ BinaryEncode64( } break; case OPT_WRAPCHAR: - wrapchar = Tcl_GetStringFromObj(objv[i+1], &wrapcharlen); + wrapchar = TclGetStringFromObj(objv[i+1], &wrapcharlen); if (wrapcharlen == 0) { maxlen = 0; } |