summaryrefslogtreecommitdiffstats
path: root/generic/tclBinary.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclBinary.c')
-rw-r--r--generic/tclBinary.c182
1 files changed, 46 insertions, 136 deletions
diff --git a/generic/tclBinary.c b/generic/tclBinary.c
index 72f1498..d0d9d5e 100644
--- a/generic/tclBinary.c
+++ b/generic/tclBinary.c
@@ -155,108 +155,35 @@ static const EnsembleImplMap decodeMap[] = {
};
/*
- * The following object types represent an array of bytes. The intent is
- * to allow arbitrary binary data to pass through Tcl as a Tcl value
- * without loss or damage. Such values are useful for things like
- * encoded strings or Tk images to name just two.
- *
- * It's strange to have two Tcl_ObjTypes in place for this task when
- * one would do, so a bit of detail and history how we got to this point
- * and where we might go from here.
- *
- * A bytearray is an ordered sequence of bytes. Each byte is an integer
- * value in the range [0-255]. To be a Tcl value type, we need a way to
- * encode each value in the value set as a Tcl string. The simplest
- * encoding is to represent each byte value as the same codepoint value.
- * A bytearray of N bytes is encoded into a Tcl string of N characters
- * where the codepoint of each character is the value of corresponding byte.
- * This approach creates a one-to-one map between all bytearray values
- * and a subset of Tcl string values.
- *
- * When converting a Tcl string value to the bytearray internal rep, the
- * question arises what to do with strings outside that subset? That is,
- * those Tcl strings containing at least one codepoint greater than 255?
- * The obviously correct answer is to raise an error! That string value
- * does not represent any valid bytearray value. Full Stop. The
- * setFromAnyProc signature has a completion code return value for just
- * this reason, to reject invalid inputs.
- *
- * Unfortunately this was not the path taken by the authors of the
- * original tclByteArrayType. They chose to accept all Tcl string values
- * as acceptable string encodings of the bytearray values that result
- * from masking away the high bits of any codepoint value at all. This
- * meant that every bytearray value had multiple accepted string
- * representations.
- *
- * The implications of this choice are truly ugly. When a Tcl value has
- * a string representation, we are required to accept that as the true
- * value. Bytearray values that possess a string representation cannot
- * be processed as bytearrays because we cannot know which true value
- * that bytearray represents. The consequence is that we drag around
- * an internal rep that we cannot make any use of. This painful price
- * is extracted at any point after a string rep happens to be generated
- * for the value. This happens even when the troublesome codepoints
- * outside the byte range never show up. This happens rather routinely
- * in normal Tcl operations unless we burden the script writer with the
- * cognitive burden of avoiding it. The price is also paid by callers
- * of the C interface. The routine
- *
- * unsigned char *Tcl_GetByteArrayFromObj(objPtr, lenPtr)
- *
- * has a guarantee to always return a non-NULL value, but that value
- * points to a byte sequence that cannot be used by the caller to
- * process the Tcl value absent some sideband testing that objPtr
- * is "pure". Tcl offers no public interface to perform this test,
- * so callers either break encapsulation or are unavoidably buggy. Tcl
- * has defined a public interface that cannot be used correctly. The
- * Tcl source code itself suffers the same problem, and has been buggy,
- * but progressively less so as more and more portions of the code have
- * been retrofitted with the required "purity testing". The set of values
- * able to pass the purity test can be increased via the introduction of
- * a "canonical" flag marker, but the only way the broken interface itself
- * can be discarded is to start over and define the Tcl_ObjType properly.
- * Bytearrays should simply be usable as bytearrays without a kabuki
- * dance of testing.
- *
- * The Tcl_ObjType "properByteArrayType" is (nearly) a correct
- * implementation of bytearrays. Any Tcl value with the type
- * properByteArrayType can have its bytearray value fetched and
- * used with confidence that acting on that value is equivalent to
- * acting on the true Tcl string value. This still implies a side
- * testing burden -- past mistakes will not let us avoid that
- * immediately, but it is at least a conventional test of type, and
- * can be implemented entirely by examining the objPtr fields, with
- * no need to query the intrep, as a canonical flag would require.
- *
- * Until Tcl_GetByteArrayFromObj() and Tcl_SetByteArrayLength() can
- * be revised to admit the possibility of returning NULL when the true
- * value is not a valid bytearray, we need a mechanism to retain
- * compatibility with the deployed callers of the broken interface.
- * That's what the retained "tclByteArrayType" provides. In those
- * unusual circumstances where we convert an invalid bytearray value
- * to a bytearray type, it is to this legacy type. Essentially any
- * time this legacy type gets used, it's a signal of a bug being ignored.
- * A TIP should be drafted to remove this connection to the broken past
- * so that Tcl 9 will no longer have any trace of it. Prescribing a
- * migration path will be the key element of that work. The internal
- * changes now in place are the limit of what can be done short of
- * interface repair. They provide a great expansion of the histories
- * over which bytearray values can be useful in the meanwhile.
+ * The following object type represents an array of bytes. An array of bytes
+ * is not equivalent to an internationalized string. Conceptually, a string is
+ * an array of 16-bit quantities organized as a sequence of properly formed
+ * UTF-8 characters, while a ByteArray is an array of 8-bit quantities.
+ * Accessor functions are provided to convert a ByteArray to a String or a
+ * String to a ByteArray. Two or more consecutive bytes in an array of bytes
+ * may look like a single UTF-8 character if the array is casually treated as
+ * a string. But obtaining the String from a ByteArray is guaranteed to
+ * produced properly formed UTF-8 sequences so that there is a one-to-one map
+ * between bytes and characters.
+ *
+ * Converting a ByteArray to a String proceeds by casting each byte in the
+ * array to a 16-bit quantity, treating that number as a Unicode character,
+ * and storing the UTF-8 version of that Unicode character in the String. For
+ * ByteArrays consisting entirely of values 1..127, the corresponding String
+ * representation is the same as the ByteArray representation.
+ *
+ * Converting a String to a ByteArray proceeds by getting the Unicode
+ * representation of each character in the String, casting it to a byte by
+ * truncating the upper 8 bits, and then storing the byte in the ByteArray.
+ * Converting from ByteArray to String and back to ByteArray is not lossy, but
+ * converting an arbitrary String to a ByteArray may be.
*/
-static const Tcl_ObjType properByteArrayType = {
- "bytearray",
- FreeByteArrayInternalRep,
- DupByteArrayInternalRep,
- UpdateStringOfByteArray,
- NULL
-};
-
const Tcl_ObjType tclByteArrayType = {
"bytearray",
FreeByteArrayInternalRep,
DupByteArrayInternalRep,
- NULL,
+ UpdateStringOfByteArray,
SetByteArrayFromAny
};
@@ -284,12 +211,6 @@ typedef struct ByteArray {
#define SET_BYTEARRAY(objPtr, baPtr) \
(objPtr)->internalRep.twoPtrValue.ptr1 = (void *) (baPtr)
-int
-TclIsPureByteArray(
- Tcl_Obj * objPtr)
-{
- return (objPtr->typePtr == &properByteArrayType);
-}
/*
*----------------------------------------------------------------------
@@ -420,7 +341,7 @@ Tcl_SetByteArrayObj(
if ((bytes != NULL) && (length > 0)) {
memcpy(byteArrayPtr->bytes, bytes, (size_t) length);
}
- objPtr->typePtr = &properByteArrayType;
+ objPtr->typePtr = &tclByteArrayType;
SET_BYTEARRAY(objPtr, byteArrayPtr);
}
@@ -450,8 +371,7 @@ Tcl_GetByteArrayFromObj(
{
ByteArray *baPtr;
- if ((objPtr->typePtr != &properByteArrayType)
- && (objPtr->typePtr != &tclByteArrayType)) {
+ if (objPtr->typePtr != &tclByteArrayType) {
SetByteArrayFromAny(NULL, objPtr);
}
baPtr = GET_BYTEARRAY(objPtr);
@@ -494,8 +414,7 @@ Tcl_SetByteArrayLength(
if (Tcl_IsShared(objPtr)) {
Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayLength");
}
- if ((objPtr->typePtr != &properByteArrayType)
- && (objPtr->typePtr != &tclByteArrayType)) {
+ if (objPtr->typePtr != &tclByteArrayType) {
SetByteArrayFromAny(NULL, objPtr);
}
@@ -531,37 +450,29 @@ SetByteArrayFromAny(
Tcl_Interp *interp, /* Not used. */
Tcl_Obj *objPtr) /* The object to convert to type ByteArray. */
{
- size_t length;
- int improper = 0;
+ int length;
const char *src, *srcEnd;
unsigned char *dst;
ByteArray *byteArrayPtr;
Tcl_UniChar ch;
- if (objPtr->typePtr == &properByteArrayType) {
- return TCL_OK;
- }
- if (objPtr->typePtr == &tclByteArrayType) {
- return TCL_OK;
- }
-
- src = TclGetString(objPtr);
- length = objPtr->length;
- srcEnd = src + length;
+ if (objPtr->typePtr != &tclByteArrayType) {
+ src = TclGetStringFromObj(objPtr, &length);
+ srcEnd = src + length;
- byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length));
- for (dst = byteArrayPtr->bytes; src < srcEnd; ) {
- src += Tcl_UtfToUniChar(src, &ch);
- improper = improper || (ch > 255);
- *dst++ = UCHAR(ch);
- }
+ byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length));
+ for (dst = byteArrayPtr->bytes; src < srcEnd; ) {
+ src += TclUtfToUniChar(src, &ch);
+ *dst++ = UCHAR(ch);
+ }
- byteArrayPtr->used = dst - byteArrayPtr->bytes;
- byteArrayPtr->allocated = length;
+ byteArrayPtr->used = dst - byteArrayPtr->bytes;
+ byteArrayPtr->allocated = length;
- TclFreeIntRep(objPtr);
- objPtr->typePtr = improper ? &tclByteArrayType : &properByteArrayType;
- SET_BYTEARRAY(objPtr, byteArrayPtr);
+ TclFreeIntRep(objPtr);
+ objPtr->typePtr = &tclByteArrayType;
+ SET_BYTEARRAY(objPtr, byteArrayPtr);
+ }
return TCL_OK;
}
@@ -624,7 +535,7 @@ DupByteArrayInternalRep(
memcpy(copyArrayPtr->bytes, srcArrayPtr->bytes, (size_t) length);
SET_BYTEARRAY(copyPtr, copyArrayPtr);
- copyPtr->typePtr = srcPtr->typePtr;
+ copyPtr->typePtr = &tclByteArrayType;
}
/*
@@ -731,8 +642,7 @@ TclAppendBytesToByteArray(
/* Append zero bytes is a no-op. */
return;
}
- if ((objPtr->typePtr != &properByteArrayType)
- && (objPtr->typePtr != &tclByteArrayType)) {
+ if (objPtr->typePtr != &tclByteArrayType) {
SetByteArrayFromAny(NULL, objPtr);
}
byteArrayPtr = GET_BYTEARRAY(objPtr);
@@ -1303,7 +1213,7 @@ BinaryFormatCmd(
Tcl_UniChar ch;
char buf[TCL_UTF_MAX + 1];
- Tcl_UtfToUniChar(errorString, &ch);
+ TclUtfToUniChar(errorString, &ch);
buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"bad field specifier \"%s\"", buf));
@@ -1673,7 +1583,7 @@ BinaryScanCmd(
Tcl_UniChar ch;
char buf[TCL_UTF_MAX + 1];
- Tcl_UtfToUniChar(errorString, &ch);
+ TclUtfToUniChar(errorString, &ch);
buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
Tcl_SetObjResult(interp, Tcl_ObjPrintf(
"bad field specifier \"%s\"", buf));
@@ -2598,7 +2508,7 @@ BinaryEncode64(
}
break;
case OPT_WRAPCHAR:
- wrapchar = TclGetStringFromObj(objv[i+1], &wrapcharlen);
+ wrapchar = Tcl_GetStringFromObj(objv[i+1], &wrapcharlen);
if (wrapcharlen == 0) {
maxlen = 0;
}