From ec1723eeadcf9efe52b0f81a65d683dff9b160c5 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 12:46:54 +0000 Subject: Update documentation of Tcl_UtfPrev/Tcl_UtfNext back to how it was. Will be updated later, when implementation is ready and agreed upon. --- doc/Utf.3 | 37 +++++++++++------------------ generic/tclUtf.c | 72 ++++++++++++++++++++------------------------------------ 2 files changed, 39 insertions(+), 70 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index cb82699..334fa6f 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -3,7 +3,7 @@ '\" '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. -'\" +'\" .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" .so man.macros .BS @@ -13,7 +13,7 @@ Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_Ut .nf \fB#include \fR .sp -typedef ... Tcl_UniChar; +typedef ... \fBTcl_UniChar\fR; .sp int \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) @@ -48,7 +48,7 @@ int int \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) .sp -int +int \fBTcl_NumUtfChars\fR(\fIsrc, length\fR) .sp const char * @@ -109,7 +109,7 @@ Pointer to the beginning of a UTF-8 string. .AP int index in The index of a character (not byte) in the UTF-8 string. .AP int *readPtr out -If non-NULL, filled with the number of bytes in the backslash sequence, +If non-NULL, filled with the number of bytes in the backslash sequence, including the backslash character. .AP char *dst out Buffer in which the bytes represented by the backslash sequence are stored. @@ -141,8 +141,8 @@ source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first -byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and -0x00FF and return 1. +byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0080 and +0x00FF and return 1. .PP \fBTcl_UniCharToUtfDString\fR converts the given Unicode string to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. @@ -210,27 +210,18 @@ length is negative, all bytes up to the first null byte are used. \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP -\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string. -If the UTF-8 string is made up entirely of complete, well-formed, and -valid character byte sequences, and \fIsrc\fR points to the lead byte -of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will -return pointers to the lead bytes of each character in the string, one -character at a time. In any other circumstance, \fBTcl_UtfNext\fR -returns \fIsrc\fR+1. \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR -and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR) -as needed to find the end of the byte sequence. If the string is -\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating -\fBNUL\fR byte. If not, the caller must use the companion routine -\fBTcl_UtfCharComplete\fR to determine whether there is any risk -\fBTcl_UtfNext\fR might read beyond the readable memory occupied -by the string. +Given \fIsrc\fR, a pointer to some location in a UTF-8 string, +\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the +string. The caller must not ask for the next character after the last +character in the string if the string is not terminated by a null +character. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made @@ -262,7 +253,7 @@ characters. Behavior is undefined if a negative \fIindex\fR is given. .PP \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must -contain at least \fIindex\fR characters. This is equivalent to calling +contain at least \fIindex\fR characters. This is equivalent to calling \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, the return pointer points to the first character in the source string. .PP diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1883804..64ee0a8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -678,35 +678,13 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * The aim of this routine is to provide a way to iterate forward - * through a UTF-8 string. The caller is expected to pass a non-NULL - * pointer argument /src/ which points to a location within a string. - * (*src) will be read, so /src/ must not point to an unreadable - * location past the end of the string. If /src/ points to the - * beginning of a complete, well-formed and valid UTF_8 byte sequence - * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer - * just past the end of that sequence. In any other circumstance, - * Tcl_UtfNext returns /src/+1. - * - * Because this routine always returns a value > /src/, it is useful - * as a forward iterator that will always make progress. If the string - * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating - * NUL character. If it is not NUL-terminated, the caller must make - * use of the companion routine Tcl_UtfCharComplete to test whether - * there is risk that Tcl_UtfNext will read beyond the end of the string. - * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. - * - * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfNext calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. * * Results: - * A pointer to the start of the next character in the string (or to - * the end of the string) as described above. + * The return value is the pointer to the next character in the UTF-8 + * string. * * Side effects: * None. @@ -747,37 +725,37 @@ Tcl_UtfNext( * * The aim of this routine is to provide a way to move backward * through a UTF-8 string. The caller is expected to pass non-NULL - * pointer arguments /start/ and /src/. /start/ points to the beginning - * of a string, and /src/ (>= /start/) points to a location within (or - * just past the end) of the string. This routine always returns a - * pointer within the string (>= /start/). When (/src/ == /start/), - * it returns /start/. When (/src/ > /start/), it returns a pointer - * (< /src/) and (>= /src/ - TCL_UTF_MAX). Subject to these constraints, - * the routine returns a pointer to the earliest byte in the string that - * starts a character when characters are read starting at /start/ and + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and * that character might include the byte src[-1]. The routine will * examine only those bytes in the range that might be returned. - * It will not examine the byte (*src), and because of that cannot + * It will not examine the byte *src, and because of that cannot * determine for certain in all circumstances whether the character * that begins with the returned pointer will or will not include - * the byte src[-1]. In the scenario where /src/ points to the end of - * a buffer being filled, the returned pointer points to either the + * the byte src[-1]. In the scenario, where src points to the end of + * a buffer being filled, the returned pointer point to either the * final complete character in the string or to the earliest byte * that might start an incomplete character waiting for more bytes to * complete. * - * Because this routine always returns a value < /src/ until the point - * it is forced to return /start/, it is useful as a backward iterator + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator * through a string that will always make progress and always be * prevented from running past the beginning of the string. * * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfPrev calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: * A pointer to the start of a character in the string as described @@ -887,7 +865,7 @@ Tcl_UtfPrev( * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified character + * Returns the Tcl_UniChar represented at the specified character * (not byte) position in the UTF-8 string. * * Results: -- cgit v0.12