summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2022-03-17 11:37:51 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2022-03-17 11:37:51 (GMT)
commitfe55a90645e1dcaef9e9f0df2e5293b2e529b1e7 (patch)
tree8b62438d5609e9eda941f3021d13668ccf7c3a37 /generic/tclEncoding.c
parent88c11e83785964b24f9910c3024daabbb9f8c636 (diff)
parentcf4140cc14a1800d2a74e73545fd86c49f3a1e73 (diff)
downloadtcl-fe55a90645e1dcaef9e9f0df2e5293b2e529b1e7.zip
tcl-fe55a90645e1dcaef9e9f0df2e5293b2e529b1e7.tar.gz
tcl-fe55a90645e1dcaef9e9f0df2e5293b2e529b1e7.tar.bz2
Merge 8.7
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c136
1 files changed, 117 insertions, 19 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 3a6385f..fd2337b 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -515,10 +515,8 @@ FillEncodingFileMap(void)
*---------------------------------------------------------------------------
*/
-/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
* TCL_ENCODING_LE is only used for utf-16/utf-32/ucs-2. re-use the same value */
-#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */
#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */
#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
@@ -1080,11 +1078,57 @@ Tcl_ExternalToUtfDString(
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
+ Tcl_ExternalToUtfDStringEx(encoding, src, srcLen, 0, dstPtr);
+ return Tcl_DStringValue(dstPtr);
+}
+
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * Tcl_ExternalToUtfDStringEx --
+ *
+ * Convert a source buffer from the specified encoding into UTF-8.
+* The parameter flags controls the behavior, if any of the bytes in
+ * the source buffer are invalid or cannot be represented in utf-8.
+ * Possible flags values:
+ * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but
+ * return the first error position (Default in Tcl 9.0).
+ * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default
+ * fallback character. Always return -1 (Default in Tcl 8.7).
+ * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00.
+ * Only valid for "utf-8" and "cesu-8". This flag may be used together
+ * with the other flags.
+ *
+ * Results:
+ * The converted bytes are stored in the DString, which is then NULL
+ * terminated in an encoding-specific manner. The return value is
+ * the error position in the source string or -1 if no conversion error
+ * is reported.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+size_t
+Tcl_ExternalToUtfDStringEx(
+ Tcl_Encoding encoding, /* The encoding for the source string, or NULL
+ * for the default system encoding. */
+ const char *src, /* Source string in specified encoding. */
+ size_t srcLen, /* Source string length in bytes, or TCL_INDEX_NONE for
+ * encoding-specific string length. */
+ int flags, /* Conversion control flags. */
+ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
+ * converted string is stored. */
+{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
- int flags, result, soFar, srcRead, dstWrote, dstChars;
+ int result, soFar, srcRead, dstWrote, dstChars;
size_t dstLen;
+ const char *srcStart = src;
Tcl_DStringInit(dstPtr);
dst = Tcl_DStringValue(dstPtr);
@@ -1101,7 +1145,7 @@ Tcl_ExternalToUtfDString(
srcLen = encodingPtr->lengthProc(src);
}
- flags = TCL_ENCODING_START | TCL_ENCODING_END;
+ flags |= TCL_ENCODING_START | TCL_ENCODING_END;
if (encodingPtr->toUtfProc == UtfToUtfProc) {
flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
}
@@ -1114,7 +1158,7 @@ Tcl_ExternalToUtfDString(
src += srcRead;
if (result != TCL_CONVERT_NOSPACE) {
Tcl_DStringSetLength(dstPtr, soFar);
- return Tcl_DStringValue(dstPtr);
+ return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart);
}
flags &= ~TCL_ENCODING_START;
srcLen -= srcRead;
@@ -1273,10 +1317,57 @@ Tcl_UtfToExternalDString(
Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
* converted string is stored. */
{
+ Tcl_UtfToExternalDStringEx(encoding, src, srcLen, 0, dstPtr);
+ return Tcl_DStringValue(dstPtr);
+}
+
+
+/*
+ *-------------------------------------------------------------------------
+ *
+ * Tcl_UtfToExternalDStringEx --
+ *
+ * Convert a source buffer from UTF-8 to the specified encoding.
+ * The parameter flags controls the behavior, if any of the bytes in
+ * the source buffer are invalid or cannot be represented in the
+ * target encoding.
+ * Possible flags values:
+ * TCL_ENCODING_STOPONERROR: don't replace invalid characters/bytes but
+ * return the first error position (Default in Tcl 9.0).
+ * TCL_ENCODING_NOCOMPLAIN: replace invalid characters/bytes by a default
+ * fallback character. Always return -1 (Default in Tcl 8.7).
+ * TCL_ENCODING_MODIFIED: convert NULL bytes to \xC0\x80 in stead of 0x00.
+ * Only valid for "utf-8" and "cesu-8". This flag may be used together
+ * with the other flags.
+ *
+ * Results:
+ * The converted bytes are stored in the DString, which is then NULL
+ * terminated in an encoding-specific manner. The return value is
+ * the error position in the source string or -1 if no conversion error
+ * is reported.
+ *
+ * Side effects:
+ * None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+size_t
+Tcl_UtfToExternalDStringEx(
+ Tcl_Encoding encoding, /* The encoding for the converted string, or
+ * NULL for the default system encoding. */
+ const char *src, /* Source string in UTF-8. */
+ size_t srcLen, /* Source string length in bytes, or < 0 for
+ * strlen(). */
+ int flags, /* Conversion control flags. */
+ Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
+ * converted string is stored. */
+{
char *dst;
Tcl_EncodingState state;
const Encoding *encodingPtr;
- int flags, result, soFar, srcRead, dstWrote, dstChars;
+ int result, soFar, srcRead, dstWrote, dstChars;
+ const char *srcStart = src;
size_t dstLen;
Tcl_DStringInit(dstPtr);
@@ -1293,7 +1384,7 @@ Tcl_UtfToExternalDString(
} else if (srcLen == TCL_INDEX_NONE) {
srcLen = strlen(src);
}
- flags = TCL_ENCODING_START | TCL_ENCODING_END;
+ flags |= TCL_ENCODING_START | TCL_ENCODING_END;
while (1) {
result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
srcLen, flags, &state, dst, dstLen,
@@ -1306,7 +1397,7 @@ Tcl_UtfToExternalDString(
while (i >= soFar) {
Tcl_DStringSetLength(dstPtr, i--);
}
- return Tcl_DStringValue(dstPtr);
+ return (result == TCL_OK) ? TCL_INDEX_NONE : (size_t)(src - srcStart);
}
flags &= ~TCL_ENCODING_START;
@@ -2134,6 +2225,12 @@ BinaryProc(
*-------------------------------------------------------------------------
*/
+#if TCL_MAJOR_VERSION > 8 || defined(TCL_NO_DEPRECATED)
+# define STOPONERROR !(flags & TCL_ENCODING_NOCOMPLAIN)
+#else
+# define STOPONERROR (flags & TCL_ENCODING_STOPONERROR)
+#endif
+
static int
UtfToUtfProc(
ClientData clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */
@@ -2216,7 +2313,7 @@ UtfToUtfProc(
*/
if (flags & TCL_ENCODING_MODIFIED) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_MULTIBYTE;
break;
}
@@ -2231,7 +2328,7 @@ UtfToUtfProc(
int low;
const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
- if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)
+ if ((len < 2) && (ch != 0) && STOPONERROR
&& (flags & TCL_ENCODING_MODIFIED)) {
result = TCL_CONVERT_SYNTAX;
break;
@@ -2256,7 +2353,8 @@ UtfToUtfProc(
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
@@ -2271,7 +2369,7 @@ UtfToUtfProc(
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
} else if (!Tcl_UniCharIsUnicode(ch)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
src = saveSrc;
break;
@@ -2457,7 +2555,7 @@ UtfToUtf32Proc(
}
len = TclUtfToUCS4(src, &ch);
if (!Tcl_UniCharIsUnicode(ch)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -2660,7 +2758,7 @@ UtfToUtf16Proc(
}
len = TclUtfToUCS4(src, &ch);
if (!Tcl_UniCharIsUnicode(ch)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -2880,7 +2978,7 @@ TableToUtfProc(
ch = pageZero[byte];
}
if ((ch == 0) && (byte != 0)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_SYNTAX;
break;
}
@@ -2996,7 +3094,7 @@ TableFromUtfProc(
word = fromUnicode[(ch >> 8)][ch & 0xFF];
if ((word == 0) && (ch != 0)) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -3184,7 +3282,7 @@ Iso88591FromUtfProc(
|| ((ch >= 0xD800) && (len < 3))
#endif
) {
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
break;
}
@@ -3411,7 +3509,7 @@ EscapeToUtfProc(
if ((checked == dataPtr->numSubTables + 2)
|| (flags & TCL_ENCODING_END)) {
- if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
+ if (!STOPONERROR) {
/*
* Skip the unknown escape sequence.
*/
@@ -3586,7 +3684,7 @@ EscapeFromUtfProc(
if (word == 0) {
state = oldState;
- if (flags & TCL_ENCODING_STOPONERROR) {
+ if (STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
break;
}