From 9146dba483fef4e11e316b7f6503565b7c656ce4 Mon Sep 17 00:00:00 2001 From: stanton Date: Mon, 16 Nov 1998 20:45:22 +0000 Subject: moved scan implementation into tclScan.c changed scan to support unicode --- generic/tclCmdMZ.c | 382 +------------------- generic/tclScan.c | 1008 ++++++++++++++++++++++++++++++++++++++++++++++++++++ tests/scan.test | 64 +++- win/makefile.vc | 3 +- 4 files changed, 1076 insertions(+), 381 deletions(-) create mode 100644 generic/tclScan.c diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 1671fef..7fa09d6 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -12,7 +12,7 @@ * See the file "license.terms" for information on usage and redistribution * of this file, and for a DISCLAIMER OF ALL WARRANTIES. * - * RCS: @(#) $Id: tclCmdMZ.c,v 1.1.2.6 1998/11/11 04:54:08 stanton Exp $ + * RCS: @(#) $Id: tclCmdMZ.c,v 1.1.2.7 1998/11/16 20:45:22 stanton Exp $ */ #include "tclInt.h" @@ -21,6 +21,22 @@ #include "tclRegexp.h" /* + * Flag values used by Tcl_ScanObjCmd. + */ + +#define SCAN_NOSKIP 0x1 /* Don't skip blanks. */ +#define SCAN_SUPPRESS 0x2 /* Suppress assignment. */ +#define SCAN_UNSIGNED 0x4 /* Read an unsigned value. */ +#define SCAN_WIDTH 0x8 /* A width value was supplied. */ + +#define SCAN_SIGNOK 0x10 /* A +/- character is allowed. */ +#define SCAN_NODIGITS 0x20 /* No digits have been scanned. */ +#define SCAN_NOZERO 0x40 /* No zero digits have been scanned. */ +#define SCAN_XOK 0x80 /* An 'x' is allowed. */ +#define SCAN_PTOK 0x100 /* Decimal point is allowed. */ +#define SCAN_EXPOK 0x200 /* An exponent is allowed. */ + +/* * Structure used to hold information about variable traces: */ @@ -633,370 +649,6 @@ Tcl_ReturnObjCmd(dummy, interp, objc, objv) /* *---------------------------------------------------------------------- * - * Tcl_ScanObjCmd -- - * - * This procedure is invoked to process the "scan" Tcl command. - * See the user documentation for details on what it does. - * - * Results: - * A standard Tcl result. - * - * Side effects: - * See the user documentation. - * - *---------------------------------------------------------------------- - */ - - /* ARGSUSED */ -int -Tcl_ScanObjCmd(dummy, interp, objc, objv) - ClientData dummy; /* Not used. */ - Tcl_Interp *interp; /* Current interpreter. */ - int objc; /* Number of arguments. */ - Tcl_Obj *CONST objv[]; /* Argument objects. */ -{ -# define MAX_FIELDS 20 - typedef struct { - char fmt; /* Format for field. */ - int size; /* How many bytes to allow for - * field. */ - char *location; /* Where field will be stored. */ - } Field; - Field fields[MAX_FIELDS]; /* Info about all the fields in the - * format string. */ - register Field *curField; - int numFields = 0; /* Number of fields actually - * specified. */ - int suppress; /* Current field is assignment- - * suppressed. */ - int totalSize = 0; /* Number of bytes needed to store - * all results combined. */ - char *results = NULL; /* Where scanned output goes. - * Malloced; NULL means not allocated - * yet. */ - Tcl_Obj *varPtr = NULL; /* The vars set by sscanf converted - * to Tcl_Objects. Initialized to - * avoid compiler warning. */ - int numScanned; /* sscanf's result. */ - register char *fmt; /* The format specifiers */ - char *src; /* The input to be parsed */ - int i, widthSpecified, fmtLen, srcLen, code, value; - char unsignedStr[40]; - Tcl_DString srcBuf, fmtBuf; - Tcl_Encoding encoding; - - /* - * The variables below are used to hold a copy of the format - * string, so that we can replace format specifiers like "%f" - * and "%F" with specifiers like "%lf" - */ - -# define STATIC_SIZE 5 - char copyBuf[STATIC_SIZE], *fmtCopy; - register char *dst; - - if (objc < 3) { - Tcl_WrongNumArgs(interp, 1, objv, - "string format ?varName varName ...?"); - return TCL_ERROR; - } - - encoding = Tcl_GetEncoding(interp, "iso8859-1"); - - /* - * This procedure operates in four stages: - * 1. Scan the format string, collecting information about each field. - * 2. Allocate an array to hold all of the scanned fields. - * 3. Call sscanf to do all the dirty work, and have it store the - * parsed fields in the array. - * 4. Pick off the fields from the array and assign them to variables. - */ - - /* - * INTL: ISO only. - * - * Convert the source and format strings from utf to iso8859-1 so - * sscanf will work correctly. - */ - - code = TCL_OK; - Tcl_UtfToExternalDString(encoding, Tcl_GetString(objv[1]), -1, &srcBuf); - Tcl_UtfToExternalDString(encoding, Tcl_GetString(objv[2]), -1, &fmtBuf); - src = Tcl_DStringValue(&srcBuf); - srcLen = Tcl_DStringLength(&srcBuf) + 1; - fmt = Tcl_DStringValue(&fmtBuf); - fmtLen = (Tcl_DStringLength(&fmtBuf) * 2) + 1; - - if (fmtLen < STATIC_SIZE) { - fmtCopy = copyBuf; - } else { - fmtCopy = (char *) ckalloc((unsigned) fmtLen); - } - dst = fmtCopy; - for ( ; *fmt != 0; fmt++) { - *dst = *fmt; - dst++; - if (*fmt != '%') { - continue; - } - fmt++; - if (*fmt == '%') { - *dst = *fmt; - dst++; - continue; - } - if (*fmt == '*') { - suppress = 1; - *dst = *fmt; - dst++; - fmt++; - } else { - suppress = 0; - } - widthSpecified = 0; - while (isdigit(UCHAR(*fmt))) { /* INTL: digit */ - widthSpecified = 1; - *dst = *fmt; - dst++; - fmt++; - } - if ((*fmt == 'l') || (*fmt == 'h') || (*fmt == 'L')) { - fmt++; - } - *dst = *fmt; - dst++; - if (suppress) { - continue; - } - if (numFields == MAX_FIELDS) { - Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), - "too many fields to scan", (char *) NULL); - code = TCL_ERROR; - goto done; - } - curField = &fields[numFields]; - numFields++; - switch (*fmt) { - case 'd': - case 'i': - case 'o': - case 'x': - curField->fmt = 'd'; - curField->size = sizeof(int); - break; - - case 'u': - curField->fmt = 'u'; - curField->size = sizeof(int); - break; - - case 's': - curField->fmt = 's'; - curField->size = srcLen; - break; - - case 'c': - if (widthSpecified) { - Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), - "field width may not be specified in %c conversion" - , (char *) NULL); - code = TCL_ERROR; - goto done; - } - curField->fmt = 'c'; - curField->size = sizeof(int); - break; - - case 'e': - case 'f': - case 'g': - dst[-1] = 'l'; - dst[0] = 'f'; - dst++; - curField->fmt = 'f'; - curField->size = sizeof(double); - break; - - case '[': - curField->fmt = 's'; - curField->size = srcLen; - do { - fmt++; - if (*fmt == 0) { - Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), - "unmatched [ in format string", (char *) NULL); - code = TCL_ERROR; - goto done; - } - *dst = *fmt; - dst++; - } while (*fmt != ']'); - break; - - default: - { - char buf[50]; - - sprintf(buf, "bad scan conversion character \"%c\"",*fmt); - Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), buf, - (char *) NULL); - code = TCL_ERROR; - goto done; - } - } - curField->size = TCL_ALIGN(curField->size); - totalSize += curField->size; - } - *dst = 0; - - if (numFields != (objc - 3)) { - Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), - "different numbers of variable names and field specifiers", - (char *) NULL); - code = TCL_ERROR; - goto done; - } - - /* - * Step 2: - */ - - results = (char *) ckalloc((unsigned) totalSize); - for (i = 0, totalSize = 0, curField = fields; - i < numFields; i++, curField++) { - curField->location = results + totalSize; - totalSize += curField->size; - } - - /* - * Fill in the remaining fields with NULL; the only purpose of - * this is to keep some memory analyzers, like Purify, from - * complaining. - */ - - for ( ; i < MAX_FIELDS; i++, curField++) { - curField->location = NULL; - } - - /* - * Step 3: - */ - - numScanned = sscanf(src, fmtCopy, - fields[0].location, fields[1].location, fields[2].location, - fields[3].location, fields[4].location, fields[5].location, - fields[6].location, fields[7].location, fields[8].location, - fields[9].location, fields[10].location, fields[11].location, - fields[12].location, fields[13].location, fields[14].location, - fields[15].location, fields[16].location, fields[17].location, - fields[18].location, fields[19].location); - - /* - * Step 4: - */ - - if (numScanned < numFields) { - numFields = numScanned; - } - for (i = 0, curField = fields; i < numFields; i++, curField++) { - switch (curField->fmt) { - case 'd': - varPtr = Tcl_NewIntObj(*((int *)curField->location)); - break; - - case 'u': - /* - * If value is < 0 then it cannot be stored in a Tcl - * Integer. Store the unsigned value as a string. - */ - - value = (*((int *)curField->location)); - if (value < 0) { - /* - * Note: The curField->size was an upper limit to the - * size of the string. The correct size needs to be - * re-calculated. - */ - - /* - * INTL: ISO only - * - * Convert the scanned string from iso8859-1 to utf. - */ - - sprintf(unsignedStr, "%u", value); - Tcl_DStringSetLength(&srcBuf, 0); - Tcl_ExternalToUtfDString(encoding, unsignedStr, -1, - &srcBuf); - varPtr = Tcl_NewStringObj(Tcl_DStringValue(&srcBuf), -1); - } else { - varPtr = Tcl_NewIntObj(value); - } - break; - - case 'c': - varPtr = Tcl_NewIntObj(*((unsigned char *)curField->location)); - break; - - case 's': - /* - * Note: The curField->size was an upper limit to the size of - * the string. The correct size needs to be re-calculated. - */ - - /* - * INTL: ISO only - * - * Convert the scanned string from iso8859-1 to utf. - */ - - Tcl_DStringSetLength(&srcBuf, 0); - Tcl_ExternalToUtfDString(encoding, curField->location, -1, - &srcBuf); - varPtr = Tcl_NewStringObj(Tcl_DStringValue(&srcBuf), -1); - break; - - case 'f': - varPtr = Tcl_NewDoubleObj(*((double *)curField->location)); - break; - - default: - panic("Tcl_ScanObjCmd: unexpected curField->fmt '%c'", - curField->fmt); - /* - * Never reached but tell smart compilers that varPtr - * can't be used unitialized. - */ - code = TCL_ERROR; - goto done; - } - if (Tcl_SetObjVar2(interp, - Tcl_GetString(objv[i+3]), NULL, varPtr, 0) == NULL) { - Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), - "couldn't set variable \"", - Tcl_GetString(objv[(i + 3)]), "\"", (char *) NULL); - code = TCL_ERROR; - Tcl_DecrRefCount(varPtr); - goto done; - } - } - Tcl_SetIntObj(Tcl_GetObjResult(interp), numScanned); - - done: - if (results != NULL) { - ckfree(results); - } - if (fmtCopy != copyBuf) { - ckfree(fmtCopy); - } - Tcl_DStringFree(&srcBuf); - Tcl_DStringFree(&fmtBuf); - return code; -} - -/* - *---------------------------------------------------------------------- - * * Tcl_SourceObjCmd -- * * This procedure is invoked to process the "source" Tcl command. diff --git a/generic/tclScan.c b/generic/tclScan.c new file mode 100644 index 0000000..1b2fb20 --- /dev/null +++ b/generic/tclScan.c @@ -0,0 +1,1008 @@ +/* + * tclScan.c -- + * + * This file contains the implementation of the "scan" command. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * See the file "license.terms" for information on usage and redistribution + * of this file, and for a DISCLAIMER OF ALL WARRANTIES. + * + * RCS: @(#) $Id: tclScan.c,v 1.1.2.1 1998/11/16 20:45:22 stanton Exp $ + */ + +#include "tclInt.h" + +/* + * Flag values used by Tcl_ScanObjCmd. + */ + +#define SCAN_NOSKIP 0x1 /* Don't skip blanks. */ +#define SCAN_SUPPRESS 0x2 /* Suppress assignment. */ +#define SCAN_UNSIGNED 0x4 /* Read an unsigned value. */ +#define SCAN_WIDTH 0x8 /* A width value was supplied. */ + +#define SCAN_SIGNOK 0x10 /* A +/- character is allowed. */ +#define SCAN_NODIGITS 0x20 /* No digits have been scanned. */ +#define SCAN_NOZERO 0x40 /* No zero digits have been scanned. */ +#define SCAN_XOK 0x80 /* An 'x' is allowed. */ +#define SCAN_PTOK 0x100 /* Decimal point is allowed. */ +#define SCAN_EXPOK 0x200 /* An exponent is allowed. */ + + +/* + * The following structure contains the information associated with + * a character set. + */ + +typedef struct CharSet { + int exclude; /* 1 if this is an exclusion set. */ + int nchars; + Tcl_UniChar *chars; + int nranges; + struct Range { + Tcl_UniChar start; + Tcl_UniChar end; + } *ranges; +} CharSet; + +/* + * Declarations for functions used only in this file. + */ + +static int CharInSet _ANSI_ARGS_((CharSet *cset, Tcl_UniChar ch)); +static void ReleaseCharSet _ANSI_ARGS_((CharSet *cset)); +static int ValidateFormat _ANSI_ARGS_((Tcl_Interp *interp, char *format, + int numVars)); + +/* + *---------------------------------------------------------------------- + * + * BuildCharSet -- + * + * This function examines a character set format specification + * and builds a CharSet containing the individual characters and + * character ranges specified. + * + * Results: + * Returns the next format position. + * + * Side effects: + * Initializes the charset. + * + *---------------------------------------------------------------------- + */ + +static char * +BuildCharSet(cset, format) + CharSet *cset; + char *format; /* Points to first char of set. */ +{ + Tcl_UniChar ch, start; + int offset, nranges; + char *end; + + memset(cset, 0, sizeof(CharSet)); + + offset = Tcl_UtfToUniChar(format, &ch); + if (ch == '^') { + cset->exclude = 1; + format += offset; + offset = Tcl_UtfToUniChar(format, &ch); + } + end = format + offset; + + /* + * Find the close bracket so we can overallocate the set. + */ + + if (ch == ']') { + end += Tcl_UtfToUniChar(end, &ch); + } + nranges = 0; + while (ch != ']') { + if (ch == '-') { + nranges++; + } + end += Tcl_UtfToUniChar(end, &ch); + } + + cset->chars = ckalloc(sizeof(Tcl_UniChar) * (end - format - 1)); + if (nranges > 0) { + cset->ranges = ckalloc(sizeof(struct Range)*nranges); + } else { + cset->ranges = NULL; + } + + /* + * Now build the character set. + */ + + cset->nchars = cset->nranges = 0; + format += Tcl_UtfToUniChar(format, &ch); + start = ch; + if (ch == ']' || ch == '-') { + cset->chars[cset->nchars++] = ch; + format += Tcl_UtfToUniChar(format, &ch); + } + while (ch != ']') { + if (*format == '-') { + /* + * This may be the first character of a range, so don't add + * it yet. + */ + + start = ch; + } else if (ch == '-') { + /* + * Check to see if this is the last character in the set, in which + * case it is not a range and we should add the previous character + * as well as the dash. + */ + + if (*format == ']') { + cset->chars[cset->nchars++] = start; + cset->chars[cset->nchars++] = ch; + } else { + format += Tcl_UtfToUniChar(format, &ch); + cset->ranges[cset->nranges].start = start; + cset->ranges[cset->nranges].end = ch; + cset->nranges++; + } + } else { + cset->chars[cset->nchars++] = ch; + } + format += Tcl_UtfToUniChar(format, &ch); + } + return format; +} + +/* + *---------------------------------------------------------------------- + * + * CharInSet -- + * + * Check to see if a character matches the given set. + * + * Results: + * Returns non-zero if the character matches the given set. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static int +CharInSet(cset, ch) + CharSet *cset; + Tcl_UniChar ch; +{ + int i, match = 0; + for (i = 0; i < cset->nchars; i++) { + if (cset->chars[i] == ch) { + match = 1; + break; + } + } + if (!match) { + for (i = 0; i < cset->nranges; i++) { + if ((cset->ranges[i].start <= ch) + || (ch <= cset->ranges[i].end)) { + match = 1; + break; + } + } + } + return (cset->exclude ? !match : match); +} + +/* + *---------------------------------------------------------------------- + * + * ReleaseCharSet -- + * + * Free the storage associated with a character set. + * + * Results: + * None. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static void +ReleaseCharSet(cset) + CharSet *cset; +{ + ckfree((char *)cset->chars); + if (cset->ranges) { + ckfree((char *)cset->ranges); + } +} + +/* + *---------------------------------------------------------------------- + * + * ValidateFormat -- + * + * Parse the format string and verify that it is properly formed + * and that there are exactly enough variables on the command line. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * May place an error in the interpreter result. + * + *---------------------------------------------------------------------- + */ + +static int +ValidateFormat(interp, format, numVars) + Tcl_Interp *interp; /* Current interpreter. */ + char *format; /* The format string. */ + int numVars; /* The number of variables passed to the + * scan command. */ +{ + int gotXpg, gotSequential, value, i, flags; + char *end; + Tcl_UniChar ch; + int *nassign = (int*)ckalloc(sizeof(int) * numVars); + int code = TCL_OK; + int objIndex; + + /* + * Initialize an array that records the number of times a variable + * is assigned to by the format string. We use this to detect if + * a variable is multiply assigned or left unassigned. + */ + + for (i = 0; i < numVars; i++) { + nassign[i] = 0; + } + + objIndex = gotXpg = gotSequential = 0; + + while (*format != '\0') { + format += Tcl_UtfToUniChar(format, &ch); + + flags = 0; + + if (ch != '%') { + continue; + } + format += Tcl_UtfToUniChar(format, &ch); + if (ch == '%') { + continue; + } + if (ch == '*') { + flags |= SCAN_SUPPRESS; + format += Tcl_UtfToUniChar(format, &ch); + goto xpgCheckDone; + } + + if ((ch < 0x80) && isdigit(UCHAR(ch))) { /* INTL: "C" locale. */ + /* + * Check for an XPG3-style %n$ specification. Note: there + * must not be a mixture of XPG3 specs and non-XPG3 specs + * in the same format string. + */ + + value = strtoul(format-1, &end, 10); /* INTL: "C" locale. */ + if (*end != '$') { + goto notXpg; + } + format = end+1; + format += Tcl_UtfToUniChar(format, &ch); + gotXpg = 1; + if (gotSequential) { + goto mixedXPG; + } + objIndex = value - 1; + if ((objIndex < 0) || (objIndex >= numVars)) { + goto badIndex; + } + goto xpgCheckDone; + } + + notXpg: + gotSequential = 1; + if (gotXpg) { + mixedXPG: + Tcl_SetResult(interp, + "cannot mix \"%\" and \"%n$\" conversion specifiers", + TCL_STATIC); + goto error; + } + + xpgCheckDone: + /* + * Parse any width specifier. + */ + + if ((ch < 0x80) && isdigit(UCHAR(ch))) { /* INTL: "C" locale. */ + value = strtoul(format-1, &format, 10); /* INTL: "C" locale. */ + flags |= SCAN_WIDTH; + format += Tcl_UtfToUniChar(format, &ch); + } + + /* + * Ignore size specifier. + */ + + if ((ch == 'l') || (ch == 'L') || (ch == 'h')) { + format += Tcl_UtfToUniChar(format, &ch); + } + + if (!(flags & SCAN_SUPPRESS) && objIndex >= numVars) { + goto badIndex; + } + + /* + * Handle the various field types. + */ + + switch (ch) { + case 'n': + case 'd': + case 'i': + case 'o': + case 'x': + case 'u': + case 'f': + case 'e': + case 'g': + case 's': + break; + case 'c': + if (flags & SCAN_WIDTH) { + Tcl_SetResult(interp, "field width may not be specified in %c conversion", TCL_STATIC); + goto error; + } + break; + case '[': + if (*format == '\0') { + goto badSet; + } + format += Tcl_UtfToUniChar(format, &ch); + if (ch == '^') { + if (*format == '\0') { + goto badSet; + } + format += Tcl_UtfToUniChar(format, &ch); + } + if (ch == ']') { + if (*format == '\0') { + goto badSet; + } + format += Tcl_UtfToUniChar(format, &ch); + } + while (ch != ']') { + if (*format == '\0') { + goto badSet; + } + format += Tcl_UtfToUniChar(format, &ch); + } + break; + badSet: + Tcl_SetResult(interp, "unmatched [ in format string", + TCL_STATIC); + goto error; + default: + { + char buf[TCL_UTF_MAX+1]; + + buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; + Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), + "bad scan conversion character \"", buf, "\"", NULL); + goto error; + } + } + if (!(flags & SCAN_SUPPRESS)) { + nassign[objIndex]++; + objIndex++; + } + } + + /* + * Verify that all of the variable were assigned exactly once. + */ + + for (i = 0; i < numVars; i++) { + if (nassign[i] > 1) { + Tcl_SetResult(interp, "variable is assigned by multiple \"%n$\" conversion specifiers", TCL_STATIC); + goto error; + } else if (nassign[i] == 0) { + Tcl_SetResult(interp, "variable is not assigend by any conversion specifiers", TCL_STATIC); + goto error; + } + } + + ckfree((char *)nassign); + return TCL_OK; + + badIndex: + if (gotXpg) { + Tcl_SetResult(interp, "\"%n$\" argument index out of range", + TCL_STATIC); + } else { + Tcl_SetResult(interp, + "different numbers of variable names and field specifiers", + TCL_STATIC); + } + + error: + ckfree((char *)nassign); + return TCL_ERROR; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_ScanObjCmd -- + * + * This procedure is invoked to process the "scan" Tcl command. + * See the user documentation for details on what it does. + * + * Results: + * A standard Tcl result. + * + * Side effects: + * See the user documentation. + * + *---------------------------------------------------------------------- + */ + + /* ARGSUSED */ +int +Tcl_ScanObjCmd(dummy, interp, objc, objv) + ClientData dummy; /* Not used. */ + Tcl_Interp *interp; /* Current interpreter. */ + int objc; /* Number of arguments. */ + Tcl_Obj *CONST objv[]; /* Argument objects. */ +{ + char *format; + int numVars; + int objIndex, offset, i, value, result, code; + char *string, *end, *baseString; + char op; + int base; + int underflow = 0; + size_t width; + long (*fn)(); + Tcl_UniChar ch, sch; + Tcl_Obj **objs, *objPtr; + int flags; + char buf[513]; /* Temporary buffer to hold scanned + * number strings before they are + * passed to strtoul. */ + + if (objc < 3) { + Tcl_WrongNumArgs(interp, 1, objv, + "string format ?varName varName ...?"); + return TCL_ERROR; + } + + format = Tcl_GetStringFromObj(objv[2], NULL); + numVars = objc-3; + + /* + * Check for errors in the format string. + */ + + if (ValidateFormat(interp, format, numVars) == TCL_ERROR) { + return TCL_ERROR; + } + + /* + * Allocate space for the result objects. + */ + + objs = (Tcl_Obj **) ckalloc(sizeof(Tcl_Obj*) * numVars); + for (i = 0; i < numVars; i++) { + objs[i] = NULL; + } + + string = Tcl_GetStringFromObj(objv[1], NULL); + baseString = string; + + /* + * Iterate over the format string filling in the result objects until + * we reach the end of input, the end of the format string, or there + * is a mismatch. + */ + + objIndex = 0; + while (*format != '\0') { + format += Tcl_UtfToUniChar(format, &ch); + + flags = 0; + + /* + * If we see whitespace in the format, skip whitespace in the string. + */ + + if (TclUniCharIsSpace(ch)) { + offset = Tcl_UtfToUniChar(string, &sch); + while (TclUniCharIsSpace(sch)) { + if (*string == '\0') { + goto done; + } + string += offset; + offset = Tcl_UtfToUniChar(string, &sch); + } + continue; + } + + if (ch != '%') { + literal: + if (*string == '\0') { + underflow = 1; + goto done; + } + string += Tcl_UtfToUniChar(string, &sch); + if (ch != sch) { + goto done; + } + continue; + } + + format += Tcl_UtfToUniChar(format, &ch); + if (ch == '%') { + goto literal; + } + + /* + * Check for assignment suppression ('*') or an XPG3-style + * assignment ('%n$'). + */ + + if (ch == '*') { + flags |= SCAN_SUPPRESS; + format += Tcl_UtfToUniChar(format, &ch); + } else if ((ch < 0x80) && isdigit(UCHAR(ch))) { /* INTL: "C" locale. */ + value = strtoul(format-1, &end, 10); /* INTL: "C" locale. */ + if (*end == '$') { + format = end+1; + format += Tcl_UtfToUniChar(format, &ch); + objIndex = value - 1; + } + } + + /* + * Parse any width specifier. + */ + + if ((ch < 0x80) && isdigit(UCHAR(ch))) { /* INTL: "C" locale. */ + width = strtoul(format-1, &format, 10); /* INTL: "C" locale. */ + format += Tcl_UtfToUniChar(format, &ch); + } else { + width = 0; + } + + /* + * Ignore size specifier. + */ + + if ((ch == 'l') || (ch == 'L') || (ch == 'h')) { + format += Tcl_UtfToUniChar(format, &ch); + } + + /* + * Handle the various field types. + */ + + switch (ch) { + case 'n': + if (!(flags & SCAN_SUPPRESS)) { + objPtr = Tcl_NewIntObj(string - baseString); + Tcl_IncrRefCount(objPtr); + objs[objIndex++] = objPtr; + } + continue; + + case 'd': + op = 'i'; + base = 10; + fn = (long (*)())strtol; + break; + case 'i': + op = 'i'; + base = 0; + fn = (long (*)())strtol; + break; + case 'o': + op = 'i'; + base = 8; + fn = (long (*)())strtol; + break; + case 'x': + op = 'i'; + base = 16; + fn = (long (*)())strtol; + break; + case 'u': + op = 'i'; + base = 10; + flags |= SCAN_UNSIGNED; + fn = (long (*)())strtoul; + break; + + case 'f': + case 'e': + case 'g': + op = 'f'; + break; + + case 's': + op = 's'; + break; + + case 'c': + op = 'c'; + flags |= SCAN_NOSKIP; + break; + case '[': + op = '['; + flags |= SCAN_NOSKIP; + break; + } + + /* + * At this point, we will need additional characters from the + * string to proceed. + */ + + if (*string == '\0') { + underflow = 1; + goto done; + } + + /* + * Skip any leading whitespace at the beginning of a field unless + * the format suppresses this behavior. + */ + + if (!(flags & SCAN_NOSKIP)) { + while (*string != '\0') { + offset = Tcl_UtfToUniChar(string, &sch); + if (!TclUniCharIsSpace(sch)) { + break; + } + string += offset; + } + if (*string == '\0') { + underflow = 1; + goto done; + } + } + + /* + * Perform the requested scanning operation. + */ + + switch (op) { + case 's': + /* + * Scan a string up to width characters or whitespace. + */ + + if (width == 0) { + width = (size_t) ~0; + } + end = string; + while (*end != '\0') { + offset = Tcl_UtfToUniChar(end, &sch); + if (TclUniCharIsSpace(sch)) { + break; + } + end += offset; + if (--width == 0) { + break; + } + } + if (!(flags & SCAN_SUPPRESS)) { + objPtr = Tcl_NewStringObj(string, end-string); + Tcl_IncrRefCount(objPtr); + objs[objIndex++] = objPtr; + } + string = end; + break; + + case '[': { + CharSet cset; + + if (width == 0) { + width = (size_t) ~0; + } + end = string; + + format = BuildCharSet(&cset, format); + while (*end != '\0') { + offset = Tcl_UtfToUniChar(end, &sch); + if (!CharInSet(&cset, sch)) { + break; + } + end += offset; + if (--width == 0) { + break; + } + } + ReleaseCharSet(&cset); + + if (!(flags & SCAN_SUPPRESS)) { + objPtr = Tcl_NewStringObj(string, end-string); + Tcl_IncrRefCount(objPtr); + objs[objIndex++] = objPtr; + } + string = end; + + break; + } + case 'c': + /* + * Scan a single Unicode character. + */ + + string += Tcl_UtfToUniChar(string, &sch); + if (!(flags & SCAN_SUPPRESS)) { + objPtr = Tcl_NewIntObj((int)sch); + Tcl_IncrRefCount(objPtr); + objs[objIndex++] = objPtr; + } + break; + + case 'i': + /* + * Scan an unsigned or signed integer. + */ + + if ((width == 0) || (width > sizeof(buf) - 1)) { + width = sizeof(buf) - 1; + } + flags |= SCAN_SIGNOK | SCAN_NODIGITS | SCAN_NOZERO; + for (end = buf; width > 0; width--) { + switch (*string) { + /* + * The 0 digit has special meaning at the beginning of + * a number. If we are unsure of the base, it + * indicates that we are in base 8 or base 16 (if it is + * followed by an 'x'). + */ + case '0': + if (base == 0) { + base = 8; + flags |= SCAN_XOK; + } + if (flags & SCAN_NOZERO) { + flags &= ~(SCAN_SIGNOK | SCAN_NODIGITS + | SCAN_NOZERO); + } else { + flags &= ~(SCAN_SIGNOK | SCAN_XOK + | SCAN_NODIGITS); + } + goto addToInt; + + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + if (base == 0) { + base = 10; + } + flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS); + goto addToInt; + + case '8': case '9': + if (base == 0) { + base = 10; + } + if (base <= 8) { + break; + } + flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS); + goto addToInt; + + case 'A': case 'B': case 'C': + case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': + case 'd': case 'e': case 'f': + if (base <= 10) { + break; + } + flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS); + goto addToInt; + + case '+': case '-': + if (flags & SCAN_SIGNOK) { + flags &= ~SCAN_SIGNOK; + goto addToInt; + } + break; + + case 'x': case 'X': + if ((flags & SCAN_XOK) && (end == buf+1)) { + base = 16; + flags &= ~SCAN_XOK; + goto addToInt; + } + break; + } + + /* + * We got an illegal character so we are done accumulating. + */ + + break; + + addToInt: + /* + * Add the character to the temporary buffer. + */ + + *end++ = *string++; + if (*string == '\0') { + break; + } + } + + /* + * Check to see if we need to back up because we only got a + * sign or a trailing x after a 0. + */ + + if (flags & SCAN_NODIGITS) { + goto done; + } else if (end[-1] == 'x' || end[-1] == 'X') { + end--; + string--; + } + + + /* + * Scan the value from the temporary buffer. If we are + * returning a large unsigned value, we have to convert it back + * to a string since Tcl only supports signed values. + */ + + if (!(flags & SCAN_SUPPRESS)) { + *end = '\0'; + value = (int) (*fn)(buf, NULL, base); + if ((flags & SCAN_UNSIGNED) && (value < 0)) { + sprintf(buf, "%u", value); /* INTL: ISO digit */ + objPtr = Tcl_NewStringObj(buf, -1); + } else { + objPtr = Tcl_NewIntObj(value); + } + Tcl_IncrRefCount(objPtr); + objs[objIndex++] = objPtr; + } + + break; + + case 'f': + /* + * Scan a floating point number + */ + + if ((width == 0) || (width > sizeof(buf) - 1)) { + width = sizeof(buf) - 1; + } + flags |= SCAN_SIGNOK | SCAN_NODIGITS | SCAN_PTOK | SCAN_EXPOK; + for (end = buf; width > 0; width--) { + switch (*string) { + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': + flags &= ~(SCAN_SIGNOK | SCAN_NODIGITS); + goto addToFloat; + case '+': case '-': + if (flags & SCAN_SIGNOK) { + flags &= ~SCAN_SIGNOK; + goto addToFloat; + } + break; + case '.': + if (flags & SCAN_PTOK) { + flags &= ~(SCAN_SIGNOK | SCAN_PTOK); + goto addToFloat; + } + break; + case 'e': case 'E': + /* + * An exponent is not allowed until there has + * been at least one digit. + */ + + if ((flags & (SCAN_NODIGITS | SCAN_EXPOK)) + == SCAN_EXPOK) { + flags = (flags & ~(SCAN_EXPOK|SCAN_PTOK)) + | SCAN_SIGNOK | SCAN_NODIGITS; + goto addToFloat; + } + break; + } + + /* + * We got an illegal character so we are done accumulating. + */ + + break; + + addToFloat: + /* + * Add the character to the temporary buffer. + */ + + *end++ = *string++; + if (*string == '\0') { + break; + } + } + + /* + * Check to see if we need to back up because we saw a + * trailing 'e' or sign. + */ + + if (flags & SCAN_NODIGITS) { + if (flags & SCAN_EXPOK) { + /* + * There were no digits at all so scanning has + * failed and we are done. + */ + goto done; + } + + /* + * We got a bad exponent ('e' and maybe a sign). + */ + + end--; + string--; + if (*end != 'e' && *end != 'E') { + end--; + string--; + } + } + + /* + * Scan the value from the temporary buffer. + */ + + if (!(flags & SCAN_SUPPRESS)) { + double dvalue; + *end = '\0'; + dvalue = strtod(buf, NULL); + objPtr = Tcl_NewDoubleObj(dvalue); + Tcl_IncrRefCount(objPtr); + objs[objIndex++] = objPtr; + } + break; + } + } + + done: + result = 0; + code = TCL_OK; + + for (i = 0; i < numVars; i++) { + if (objs[i] != NULL) { + result++; + if (Tcl_SetObjVar2(interp, Tcl_GetString(objv[i+3]), + NULL, objs[i], 0) == NULL) { + Tcl_AppendStringsToObj(Tcl_GetObjResult(interp), + "couldn't set variable \"", + Tcl_GetString(objv[i+3]), "\"", (char *) NULL); + code = TCL_ERROR; + } + Tcl_DecrRefCount(objs[i]); + } + } + ckfree((char*) objs); + if (code == TCL_OK) { + Tcl_SetObjResult(interp, Tcl_NewIntObj(result)); + } + return code; +} diff --git a/tests/scan.test b/tests/scan.test index e343742..bbae718 100644 --- a/tests/scan.test +++ b/tests/scan.test @@ -10,10 +10,38 @@ # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # -# RCS: @(#) $Id: scan.test,v 1.1.2.2 1998/09/24 23:59:35 stanton Exp $ +# RCS: @(#) $Id: scan.test,v 1.1.2.3 1998/11/16 20:45:23 stanton Exp $ if {[string compare test [info procs test]] == 1} then {source defs} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} {%d%1$d} x} msg] $msg +} {1 {cannot mix "%" and "%n$" conversion specifiers}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} {%d%1$d} x} msg] $msg +} {1 {cannot mix "%" and "%n$" conversion specifiers}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} {%2$d%d} x} msg] $msg +} {1 {"%n$" argument index out of range}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} %d} msg] $msg +} {1 {different numbers of variable names and field specifiers}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} {%10c} a} msg] $msg +} {1 {field width may not be specified in %c conversion}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} {%*1$d} a} msg] $msg +} {1 {bad scan conversion character "$"}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} {%1$d%1$d} a} msg] $msg +} {1 {variable is assigned by multiple "%n$" conversion specifiers}} +test scan-1.1 {ValidateFormat} { + list [catch {scan {} a x} msg] $msg +} {1 {variable is not assigend by any conversion specifiers}} + + + + test scan-1.1 {integer scanning} { set a {}; set b {}; set c {}; set d {} list [scan "-20 1476 \n33 0" "%d %d %d %d" a b c d] $a $b $c $d @@ -130,6 +158,10 @@ test scan-3.7 {string and character scanning, unicode} { set a {}; set b {} list [scan "ab\u00c7cdef" "ab%c%c" a b] $a $b } "2 199 99" +test scan-3.7 {string and character scanning, unicode} { + set a {}; set b {} + list [scan "ab\ufeffdef" "%\[ab\ufeff\]" a] $a +} "1 ab\ufeff" test scan-4.1 {error conditions} { catch {scan a} @@ -138,34 +170,27 @@ test scan-4.2 {error conditions} { catch {scan a} msg set msg } {wrong # args: should be "scan string format ?varName varName ...?"} -test scan-4.3 {error conditions} { - catch {scan "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21" "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d" a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 a17 a18 a19 a20 a21} -} 1 -test scan-4.4 {error conditions} { - catch {scan "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21" "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d" a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 a17 a18 a19 a20 a21} msg - set msg -} {too many fields to scan} test scan-4.5 {error conditions} { - list [catch {scan a %D} msg] $msg + list [catch {scan a %D x} msg] $msg } {1 {bad scan conversion character "D"}} test scan-4.6 {error conditions} { - list [catch {scan a %O} msg] $msg + list [catch {scan a %O x} msg] $msg } {1 {bad scan conversion character "O"}} test scan-4.7 {error conditions} { - list [catch {scan a %X} msg] $msg + list [catch {scan a %X x} msg] $msg } {1 {bad scan conversion character "X"}} test scan-4.8 {error conditions} { - list [catch {scan a %F} msg] $msg + list [catch {scan a %F x} msg] $msg } {1 {bad scan conversion character "F"}} test scan-4.9 {error conditions} { - list [catch {scan a %E} msg] $msg + list [catch {scan a %E x} msg] $msg } {1 {bad scan conversion character "E"}} test scan-4.10 {error conditions} { list [catch {scan a "%d %d" a} msg] $msg } {1 {different numbers of variable names and field specifiers}} test scan-4.11 {error conditions} { list [catch {scan a "%d %d" a b c} msg] $msg -} {1 {different numbers of variable names and field specifiers}} +} {1 {variable is not assigend by any conversion specifiers}} test scan-4.12 {error conditions} { set a {}; set b {}; set c {}; set d {} list [expr {[scan " a" " a %d %d %d %d" a b c d] <= 0}] $a $b $c $d @@ -204,7 +229,16 @@ test scan-4.19 {error conditions} { list [catch {scan 44 %2c a} msg] $msg } {1 {field width may not be specified in %c conversion}} test scan-4.20 {error conditions} { - list [catch {scan abc {%[}} msg] $msg + list [catch {scan abc {%[} x} msg] $msg +} {1 {unmatched [ in format string}} +test scan-4.20 {error conditions} { + list [catch {scan abc {%[^a} x} msg] $msg +} {1 {unmatched [ in format string}} +test scan-4.20 {error conditions} { + list [catch {scan abc {%[^]a} x} msg] $msg +} {1 {unmatched [ in format string}} +test scan-4.20 {error conditions} { + list [catch {scan abc {%[]a} x} msg] $msg } {1 {unmatched [ in format string}} test scan-5.1 {lots of arguments} { diff --git a/win/makefile.vc b/win/makefile.vc index 79c4352..c18b180 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -4,7 +4,7 @@ # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # # Copyright (c) 1995-1996 Sun Microsystems, Inc. -# RCS: @(#) $Id: makefile.vc,v 1.1.2.9 1998/11/11 04:08:39 stanton Exp $ +# RCS: @(#) $Id: makefile.vc,v 1.1.2.10 1998/11/16 20:45:23 stanton Exp $ # Does not depend on the presence of any environment variables in # order to compile tcl; all needed information is derived from @@ -168,6 +168,7 @@ TCLOBJS = \ $(TMPDIR)\tclProc.obj \ $(TMPDIR)\tclRegexp.obj \ $(TMPDIR)\tclResult.obj \ + $(TMPDIR)\tclScan.obj \ $(TMPDIR)\tclStringObj.obj \ $(TMPDIR)\tclThread.obj \ $(TMPDIR)\tclTimer.obj \ -- cgit v0.12