summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--doc/Utf.328
-rw-r--r--generic/tclTest.c52
-rw-r--r--generic/tclUtf.c50
-rw-r--r--tests/utf.test163
4 files changed, 274 insertions, 19 deletions
diff --git a/doc/Utf.3 b/doc/Utf.3
index 5361f32..87d1318 100644
--- a/doc/Utf.3
+++ b/doc/Utf.3
@@ -223,13 +223,27 @@ string. The caller must not ask for the next character after the last
character in the string if the string is not terminated by a null
character.
.PP
-Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a
-null byte immediately following such a string), \fBTcl_UtfPrev\fR
-returns a pointer to the closest preceding byte that starts a UTF-8
-character.
-This function will not back up to a position before \fIstart\fR,
-the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the
-return value will be \fIstart\fR.
+\fBTcl_UtfPrev\fR is used to step backward through but not beyond the
+UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made
+up entirely of complete and well-formed characters, and \fIsrc\fR points
+to the lead byte of one of those characters (or to the location one byte
+past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will
+return pointers to the lead bytes of each character in the string, one
+character at a time, terminating when it returns \fIstart\fR.
+.PP
+When the conditions of completeness and well-formedness may not be satisfied,
+a more precise description of the function of \fBTcl_UtfPrev\fR is necessary.
+It always returns a pointer greater than or equal to \fIstart\fR; that is,
+always a pointer to a location in the string. It always returns a pointer to
+a byte that begins a character when scanning for characters beginning
+from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it
+always returns a pointer less than \fIsrc\fR and greater than or
+equal to (\fIsrc\fR - \fBTCL_UTF_MAX\fR). The character that begins
+at the returned pointer is the first one that either includes the
+byte \fIsrc[-1]\fR, or might include it if the right trail bytes are
+present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the
+byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte
+\fIsrc[-\fBTCL_UTF_MAX\fI-1]\fR.
.PP
\fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
Pascal Ord() function. It returns the Tcl_UniChar represented at the
diff --git a/generic/tclTest.c b/generic/tclTest.c
index 66b2233..506cef9 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -433,6 +433,7 @@ static int SimpleMatchInDirectory(
Tcl_Interp *interp, Tcl_Obj *resultPtr,
Tcl_Obj *dirPtr, const char *pattern,
Tcl_GlobTypeData *types);
+static Tcl_ObjCmdProc TestUtfPrevCmd;
static int TestNumUtfCharsCmd(ClientData clientData,
Tcl_Interp *interp, int objc,
Tcl_Obj *const objv[]);
@@ -690,6 +691,8 @@ Tcltest_Init(
(ClientData) 0, NULL);
Tcl_CreateObjCommand(interp, "testsetobjerrorcode",
TestsetobjerrorcodeCmd, (ClientData) 0, NULL);
+ Tcl_CreateObjCommand(interp, "testutfprev",
+ TestUtfPrevCmd, (ClientData) 0, NULL);
Tcl_CreateObjCommand(interp, "testnumutfchars",
TestNumUtfCharsCmd, (ClientData) 0, NULL);
Tcl_CreateCommand(interp, "testsetplatform", TestsetplatformCmd,
@@ -7094,6 +7097,55 @@ SimpleListVolumes(void)
}
/*
+ * Used to check operations of Tcl_UtfPrev.
+ *
+ * Usage: testutfprev $bytes $offset
+ */
+
+static int
+TestUtfPrevCmd(
+    ClientData clientData,	/* Not used. */
+    Tcl_Interp *interp,		/* Receives the result or an error message. */
+    int objc,			/* Number of arguments. */
+    Tcl_Obj *const objv[])	/* Argument objects: bytes ?offset? */
+{
+    int numBytes, offset;
+    char *bytes;
+    const char *result;
+    Tcl_Obj *copy;
+
+    if (objc < 2 || objc > 3) {
+	Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?");
+	return TCL_ERROR;
+    }
+
+    /*
+     * Only the length is needed from the argument itself; the pointer that
+     * is actually scanned is taken from the duplicate made below.
+     */
+
+    (void) Tcl_GetByteArrayFromObj(objv[1], &numBytes);
+
+    if (objc == 3) {
+	if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) {
+	    return TCL_ERROR;
+	}
+
+	/*
+	 * Clamp the requested offset into [0, numBytes].
+	 */
+
+	if (offset < 0) {
+	    offset = 0;
+	}
+	if (offset > numBytes) {
+	    offset = numBytes;
+	}
+    } else {
+	/*
+	 * No offset given: start from just past the last byte.
+	 */
+
+	offset = numBytes;
+    }
+
+    /*
+     * Work on a duplicate grown by one byte so that a terminating NUL can
+     * be stored right after the data.
+     */
+
+    copy = Tcl_DuplicateObj(objv[1]);
+    bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1);
+    bytes[numBytes] = '\0';
+
+    result = Tcl_UtfPrev(bytes + offset, bytes);
+
+    /*
+     * Both bytes and result point into the storage owned by copy, so the
+     * offset must be computed before copy is released.
+     */
+
+    Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes));
+    Tcl_DecrRefCount(copy);
+    return TCL_OK;
+}
+
+/*
* Used to check correct string-length determining in Tcl_NumUtfChars
*/
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 9aaf506..fbdba4c 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -654,15 +654,43 @@ Tcl_UtfNext(
*
* Tcl_UtfPrev --
*
- * Given a pointer to some current location in a UTF-8 string, move
- * backwards one character. This works correctly when the pointer is in
- * the middle of a UTF-8 character.
+ * The aim of this routine is to provide a way to move backward
+ * through a UTF-8 string. The caller is expected to pass non-NULL
+ * pointer arguments start and src. start points to the beginning
+ * of a string, and src >= start points to a location within (or just
+ * past the end) of the string. This routine always returns a
+ * pointer within the string (>= start). When (src == start), it
+ * returns start. When (src > start), it returns a pointer (< src)
+ * and (>= src - TCL_UTF_MAX). Subject to these constraints, the
+ * routine returns a pointer to the earliest byte in the string that
+ * starts a character when characters are read starting at start and
+ * that character might include the byte src[-1]. The routine will
+ * examine only those bytes in the range that might be returned.
+ * It will not examine the byte *src, and because of that cannot
+ * determine for certain in all circumstances whether the character
+ * that begins with the returned pointer will or will not include
+ * the byte src[-1]. In the scenario where src points to the end of
+ * a buffer being filled, the returned pointer points to either the
+ * final complete character in the string or to the earliest byte
+ * that might start an incomplete character waiting for more bytes to
+ * complete.
+ *
+ * Because this routine always returns a value < src until the point
+ * it is forced to return start, it is useful as a backward iterator
+ * through a string that will always make progress and always be
+ * prevented from running past the beginning of the string.
+ *
+ * In a string where all characters are complete and properly formed,
+ * and the value of src points to the first byte of a character,
+ * repeated Tcl_UtfPrev calls will step to the starting bytes of
+ * characters, one character at a time. Within those limitations,
+ * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot
+ * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and
+ * the caller will have to take greater care.
*
* Results:
- * The return value is a pointer to the previous character in the UTF-8
- * string. If the current location was already at the beginning of the
- * string, the return value will also be a pointer to the beginning of
- * the string.
+ * A pointer to the start of a character in the string as described
+ * above.
*
* Side effects:
* None.
@@ -672,9 +700,8 @@ Tcl_UtfNext(
CONST char *
Tcl_UtfPrev(
- CONST char *src, /* The current location in the string. */
- CONST char *start) /* Pointer to the beginning of the string, to
- * avoid going backwards too far. */
+ CONST char *src, /* A location in a UTF-8 string. */
+ CONST char *start) /* Pointer to the beginning of the string */
{
CONST char *look;
int i, byte;
@@ -693,6 +720,9 @@ Tcl_UtfPrev(
break;
}
if (byte >= 0xC0) {
+ if (totalBytes[byte] <= i) {
+ break;
+ }
return look;
}
look--;
diff --git a/tests/utf.test b/tests/utf.test
index e8ee374..c2191c2 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -94,8 +94,167 @@ test utf-5.1 {Tcl_UtfFindFirsts} {
test utf-6.1 {Tcl_UtfNext} {
} {}
-test utf-7.1 {Tcl_UtfPrev} {
-} {}
+testConstraint testutfprev [llength [info commands testutfprev]]
+
+test utf-7.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev {}
+} 0
+test utf-7.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A
+} 0
+test utf-7.3 {Tcl_UtfPrev} testutfprev {
+ testutfprev AA
+} 1
+test utf-7.4 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8
+} 1
+test utf-7.4.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xA0\xA0 2
+} 1
+test utf-7.4.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xF8\xA0\xA0 2
+} 1
+test utf-7.5 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4
+} 1
+test utf-7.5.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xA0\xA0 2
+} 1
+test utf-7.5.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xF8\xA0\xA0 2
+} 1
+test utf-7.6 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8
+} 1
+test utf-7.6.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xA0\xA0 2
+} 1
+test utf-7.6.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xF8\xA0\xA0 2
+} 1
+test utf-7.7 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0
+} 1
+test utf-7.7.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xA0\xA0 2
+} 1
+test utf-7.7.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xF8\xA0\xA0 2
+} 1
+test utf-7.8 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0
+} 1
+test utf-7.8.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xA0\xA0 2
+} 1
+test utf-7.8.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xF8\xA0\xA0 2
+} 1
+test utf-7.9 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0
+} 2
+test utf-7.9.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xA0\xA0 3
+} 2
+test utf-7.9.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xF8\xA0 3
+} 2
+test utf-7.10 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0
+} 2
+test utf-7.10.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xA0\xA0 3
+} 2
+test utf-7.10.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xF8\xA0 3
+} 2
+test utf-7.11 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0
+} 1
+test utf-7.11.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xA0\xA0 3
+} 1
+test utf-7.11.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xF8\xA0 3
+} 1
+test utf-7.12 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0
+} 1
+test utf-7.12.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xA0\xA0 3
+} 1
+test utf-7.12.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xF8\xA0 3
+} 1
+test utf-7.13 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0
+} 2
+test utf-7.13.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xA0\xA0 3
+} 2
+test utf-7.13.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xF8\xA0 3
+} 2
+test utf-7.14 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xA0
+} 3
+test utf-7.14.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xA0\xA0 4
+} 3
+test utf-7.14.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xA0\xF8 4
+} 3
+test utf-7.15 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xA0
+} 3
+test utf-7.15.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xA0\xA0 4
+} 3
+test utf-7.15.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xA0\xF8 4
+} 3
+test utf-7.16 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xA0
+} 1
+test utf-7.16.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xA0\xA0 4
+} 1
+test utf-7.16.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xA0\xF8 4
+} 1
+test utf-7.17 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xA0
+} 3
+test utf-7.17.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xA0\xA0 4
+} 3
+test utf-7.17.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xA0\xF8 4
+} 3
+test utf-7.18 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xA0
+} 3
+test utf-7.18.1 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xA0\xA0 4
+} 3
+test utf-7.18.2 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xA0\xF8 4
+} 3
+test utf-7.19 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF8\xA0\xA0\xA0
+} 4
+test utf-7.20 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xF4\xA0\xA0\xA0
+} 4
+test utf-7.21 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xE8\xA0\xA0\xA0
+} 4
+test utf-7.22 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xD0\xA0\xA0\xA0
+} 4
+test utf-7.23 {Tcl_UtfPrev} testutfprev {
+ testutfprev A\xA0\xA0\xA0\xA0
+} 4
test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
string index abcd 0