summary | refs | log | tree | commit | diff | stats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- generic/tclUtf.c 52
1 files changed, 41 insertions, 11 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 12d764c..22e26d2 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -675,15 +675,43 @@ Tcl_UtfNext(
*
* Tcl_UtfPrev --
*
- * Given a pointer to some current location in a UTF-8 string, move
- * backwards one character. This works correctly when the pointer is in
- * the middle of a UTF-8 character.
+ * The aim of this routine is to provide a way to move backward
+ * through a UTF-8 string. The caller is expected to pass non-NULL
+ * pointer arguments start and src. start points to the beginning
+ * of a string, and src >= start points to a location within (or just
+ * past the end of) the string. This routine always returns a
+ * pointer within the string (>= start). When (src == start), it
+ * returns start. When (src > start), it returns a pointer (< src)
+ * and (>= src - TCL_UTF_MAX). Subject to these constraints, the
+ * routine returns a pointer to the earliest byte in the string that
+ * starts a character when characters are read starting at start and
+ * that character might include the byte src[-1]. The routine will
+ * examine only those bytes in the range that might be returned.
+ * It will not examine the byte *src, and because of that cannot
+ * determine for certain in all circumstances whether the character
+ * that begins with the returned pointer will or will not include
+ * the byte src[-1]. In the scenario where src points to the end of
+ * a buffer being filled, the returned pointer points to either the
+ * final complete character in the string or to the earliest byte
+ * that might start an incomplete character waiting for more bytes to
+ * complete.
+ *
+ * Because this routine always returns a value < src until the point
+ * it is forced to return start, it is useful as a backward iterator
+ * through a string that will always make progress and always be
+ * prevented from running past the beginning of the string.
+ *
+ * In a string where all characters are complete and properly formed,
+ * and the value of src points to the first byte of a character,
+ * repeated Tcl_UtfPrev calls will step to the starting bytes of
+ * characters, one character at a time. Within those limitations,
+ * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot
+ * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and
+ * the caller will have to take greater care.
*
* Results:
- * The return value is a pointer to the previous character in the UTF-8
- * string. If the current location was already at the beginning of the
- * string, the return value will also be a pointer to the beginning of
- * the string.
+ * A pointer to the start of a character in the string as described
+ * above.
*
* Side effects:
* None.
@@ -693,9 +721,8 @@ Tcl_UtfNext(
const char *
Tcl_UtfPrev(
- const char *src, /* The current location in the string. */
- const char *start) /* Pointer to the beginning of the string, to
- * avoid going backwards too far. */
+ const char *src, /* A location in a UTF-8 string. */
+ const char *start) /* Pointer to the beginning of the string */
{
const char *look;
int i, byte;
@@ -713,6 +740,9 @@ Tcl_UtfPrev(
break;
}
if (byte >= 0xC0) {
+ if (totalBytes[byte] <= i) {
+ break;
+ }
return look;
}
look--;
@@ -1692,7 +1722,7 @@ Tcl_UniCharIsSpace(
*/
if (ch < 0x80) {
- return TclIsSpaceProc((char) ch);
+ return TclIsSpaceProcM((char) ch);
#if TCL_UTF_MAX > 3
} else if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;