diff options
| -rw-r--r-- | doc/Utf.3 | 28 | ||||
| -rw-r--r-- | generic/tclTest.c | 52 | ||||
| -rw-r--r-- | generic/tclUtf.c | 50 | ||||
| -rw-r--r-- | tests/utf.test | 163 |
4 files changed, 274 insertions, 19 deletions
@@ -223,13 +223,27 @@ string. The caller must not ask for the next character after the last character in the string if the string is not terminated by a null character. .PP -Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a -null byte immediately following such a string), \fBTcl_UtfPrev\fR -returns a pointer to the closest preceding byte that starts a UTF-8 -character. -This function will not back up to a position before \fIstart\fR, -the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the -return value will be \fIstart\fR. +\fBTcl_UtfPrev\fR is used to step backward through but not beyond the +UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made +up entirely of complete and well-formed characters, and \fIsrc\fR points +to the lead byte of one of those characters (or to the location one byte +past the end of the string), then repeated calls of \fBTcl_UtfPrev\fR will +return pointers to the lead bytes of each character in the string, one +character at a time, terminating when it returns \fIstart\fR. +.PP +When the conditions of completeness and well-formedness may not be satisfied, +a more precise description of the function of \fBTcl_UtfPrev\fR is necessary. +It always returns a pointer greater than or equal to \fIstart\fR; that is, +always a pointer to a location in the string. It always returns a pointer to +a byte that begins a character when scanning for characters beginning +from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it +always returns a pointer less than \fIsrc\fR and greater than or +equal to (\fIsrc\fR - \fBTCL_UTF_MAX\fR). The character that begins +at the returned pointer is the first one that either includes the +byte \fIsrc[-1]\fR, or might include it if the right trail bytes are +present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the +byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte +\fIsrc[-\fBTCL_UTF_MAX\fI-1]\fR. 
.PP \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the Pascal Ord() function. It returns the Tcl_UniChar represented at the diff --git a/generic/tclTest.c b/generic/tclTest.c index 66b2233..506cef9 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -433,6 +433,7 @@ static int SimpleMatchInDirectory( Tcl_Interp *interp, Tcl_Obj *resultPtr, Tcl_Obj *dirPtr, const char *pattern, Tcl_GlobTypeData *types); +static Tcl_ObjCmdProc TestUtfPrevCmd; static int TestNumUtfCharsCmd(ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); @@ -690,6 +691,8 @@ Tcltest_Init( (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testsetobjerrorcode", TestsetobjerrorcodeCmd, (ClientData) 0, NULL); + Tcl_CreateObjCommand(interp, "testutfprev", + TestUtfPrevCmd, (ClientData) 0, NULL); Tcl_CreateObjCommand(interp, "testnumutfchars", TestNumUtfCharsCmd, (ClientData) 0, NULL); Tcl_CreateCommand(interp, "testsetplatform", TestsetplatformCmd, @@ -7094,6 +7097,55 @@ SimpleListVolumes(void) } /* + * Used to check operations of Tcl_UtfPrev. 
+ * + * Usage: testutfprev $bytes ?$offset? + */ + +static int +TestUtfPrevCmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + int numBytes, offset; + char *bytes; + const char *result; + Tcl_Obj *copy; + + if (objc < 2 || objc > 3) { + Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); + return TCL_ERROR; + } + + bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + + if (objc == 3) { + if (TCL_OK != Tcl_GetIntFromObj(interp, objv[2], &offset)) { + return TCL_ERROR; + } + if (offset < 0) { + offset = 0; + } + if (offset > numBytes) { + offset = numBytes; + } + } else { + offset = numBytes; + } + copy = Tcl_DuplicateObj(objv[1]); + bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); + bytes[numBytes] = '\0'; + + result = Tcl_UtfPrev(bytes + offset, bytes); + + Tcl_DecrRefCount(copy); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + return TCL_OK; +} + +/* * Used to check correct string-length determining in Tcl_NumUtfChars */ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 9aaf506..fbdba4c 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -654,15 +654,43 @@ Tcl_UtfNext( * * Tcl_UtfPrev -- * - * Given a pointer to some current location in a UTF-8 string, move - * backwards one character. This works correctly when the pointer is in - * the middle of a UTF-8 character. + * The aim of this routine is to provide a way to move backward + * through a UTF-8 string. The caller is expected to pass non-NULL + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). 
Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and + * that character might include the byte src[-1]. The routine will + * examine only those bytes in the range that might be returned. + * It will not examine the byte *src, and because of that cannot + * determine for certain in all circumstances whether the character + * that begins with the returned pointer will or will not include + * the byte src[-1]. In the scenario where src points to the end of + * a buffer being filled, the returned pointer points either to the + * final complete character in the string or to the earliest byte + * that might start an incomplete character waiting for more bytes to + * complete. + * + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator + * through a string that will always make progress and always be + * prevented from running past the beginning of the string. + * + * In a string where all characters are complete and properly formed, + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: - * The return value is a pointer to the previous character in the UTF-8 - * string. If the current location was already at the beginning of the - * string, the return value will also be a pointer to the beginning of - * the string. + * A pointer to the start of a character in the string as described + * above. * * Side effects: * None. 
@@ -672,9 +700,8 @@ Tcl_UtfNext( CONST char * Tcl_UtfPrev( - CONST char *src, /* The current location in the string. */ - CONST char *start) /* Pointer to the beginning of the string, to - * avoid going backwards too far. */ + CONST char *src, /* A location in a UTF-8 string. */ + CONST char *start) /* Pointer to the beginning of the string */ { CONST char *look; int i, byte; @@ -693,6 +720,9 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { + if (totalBytes[byte] <= i) { + break; + } return look; } look--; diff --git a/tests/utf.test b/tests/utf.test index e8ee374..c2191c2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -94,8 +94,167 @@ test utf-5.1 {Tcl_UtfFindFirsts} { test utf-6.1 {Tcl_UtfNext} { } {} -test utf-7.1 {Tcl_UtfPrev} { -} {} +testConstraint testutfprev [llength [info commands testutfprev]] + +test utf-7.1 {Tcl_UtfPrev} testutfprev { + testutfprev {} +} 0 +test utf-7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A +} 0 +test utf-7.3 {Tcl_UtfPrev} testutfprev { + testutfprev AA +} 1 +test utf-7.4 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8 +} 1 +test utf-7.4.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 2 +} 1 +test utf-7.4.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xF8\xA0\xA0 2 +} 1 +test utf-7.5 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4 +} 1 +test utf-7.5.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 2 +} 1 +test utf-7.5.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xF8\xA0\xA0 2 +} 1 +test utf-7.6 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8 +} 1 +test utf-7.6.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 2 +} 1 +test utf-7.6.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xF8\xA0\xA0 2 +} 1 +test utf-7.7 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0 +} 1 +test utf-7.7.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 2 +} 1 +test utf-7.7.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xF8\xA0\xA0 2 +} 1 +test utf-7.8 {Tcl_UtfPrev} testutfprev { + testutfprev 
A\xA0 +} 1 +test utf-7.8.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 2 +} 1 +test utf-7.8.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xF8\xA0\xA0 2 +} 1 +test utf-7.9 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0 +} 2 +test utf-7.9.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 3 +} 2 +test utf-7.9.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xF8\xA0 3 +} 2 +test utf-7.10 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0 +} 2 +test utf-7.10.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 3 +} 2 +test utf-7.10.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xF8\xA0 3 +} 2 +test utf-7.11 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0 +} 1 +test utf-7.11.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 3 +} 1 +test utf-7.11.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xF8\xA0 3 +} 1 +test utf-7.12 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0 +} 1 +test utf-7.12.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 3 +} 1 +test utf-7.12.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xF8\xA0 3 +} 1 +test utf-7.13 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0 +} 2 +test utf-7.13.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 3 +} 2 +test utf-7.13.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xF8\xA0 3 +} 2 +test utf-7.14 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0 +} 3 +test utf-7.14.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 4 +} 3 +test utf-7.14.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xF8 4 +} 3 +test utf-7.15 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0 +} 3 +test utf-7.15.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 4 +} 3 +test utf-7.15.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xF8 4 +} 3 +test utf-7.16 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0 +} 1 +test utf-7.16.1 {Tcl_UtfPrev} testutfprev { + testutfprev 
A\xE8\xA0\xA0\xA0 4 +} 1 +test utf-7.16.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xF8 4 +} 1 +test utf-7.17 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0 +} 3 +test utf-7.17.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 4 +} 3 +test utf-7.17.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xF8 4 +} 3 +test utf-7.18 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0 +} 3 +test utf-7.18.1 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 4 +} 3 +test utf-7.18.2 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xF8 4 +} 3 +test utf-7.19 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF8\xA0\xA0\xA0 +} 4 +test utf-7.20 {Tcl_UtfPrev} testutfprev { + testutfprev A\xF4\xA0\xA0\xA0 +} 4 +test utf-7.21 {Tcl_UtfPrev} testutfprev { + testutfprev A\xE8\xA0\xA0\xA0 +} 4 +test utf-7.22 {Tcl_UtfPrev} testutfprev { + testutfprev A\xD0\xA0\xA0\xA0 +} 4 +test utf-7.23 {Tcl_UtfPrev} testutfprev { + testutfprev A\xA0\xA0\xA0\xA0 +} 4 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 |
