From 60e360f12b1c25a8e89f5893a564ca17d3b99217 Mon Sep 17 00:00:00 2001 From: dkf Date: Wed, 29 Jan 2014 13:59:32 +0000 Subject: Compile [string is] with character classes in a non-awful way. Needs more work to make resulting bytecode disassemble nicely. --- generic/tclCompCmdsSZ.c | 99 ++++++++++++++++++++++++++++++++++++++++++++----- generic/tclCompile.c | 5 +++ generic/tclCompile.h | 37 +++++++++++++++++- generic/tclExecute.c | 19 ++++++++++ 4 files changed, 150 insertions(+), 10 deletions(-) diff --git a/generic/tclCompCmdsSZ.c b/generic/tclCompCmdsSZ.c index 1a69a89..639b4a5 100644 --- a/generic/tclCompCmdsSZ.c +++ b/generic/tclCompCmdsSZ.c @@ -452,6 +452,7 @@ TclCompileStringIsCmd( STR_IS_XDIGIT }; int t, range, allowEmpty = 0, end; + InstStringClassType strClassType; Tcl_Obj *isClass; if (parsePtr->numWords < 3 || parsePtr->numWords > 6) { @@ -486,7 +487,7 @@ TclCompileStringIsCmd( tokenPtr = TokenAfter(tokenPtr); if (parsePtr->numWords == 3) { - allowEmpty = (t != STR_IS_LIST); + allowEmpty = 1; } else { if (!GotLiteral(tokenPtr, "-strict")) { return TCL_ERROR; @@ -496,30 +497,77 @@ TclCompileStringIsCmd( #undef GotLiteral /* - * Some types are not currently handled. Character classes are a prime - * example of this. + * Compile the code. There are several main classes of check here. + * 1. Character classes + * 2. Booleans + * 3. Integers + * 4. Floats + * 5. Lists */ + CompileWord(envPtr, tokenPtr, interp, parsePtr->numWords-1); + switch ((enum isClasses) t) { case STR_IS_ALNUM: + strClassType = STR_CLASS_ALNUM; + goto compileStrClass; case STR_IS_ALPHA: + strClassType = STR_CLASS_ALPHA; + goto compileStrClass; case STR_IS_ASCII: + strClassType = STR_CLASS_ASCII; + goto compileStrClass; case STR_IS_CONTROL: + strClassType = STR_CLASS_CONTROL; + goto compileStrClass; case STR_IS_DIGIT: + strClassType = STR_CLASS_DIGIT; + goto compileStrClass; case STR_IS_GRAPH: + strClassType = STR_CLASS_GRAPH; + goto compileStrClass; case STR_IS_LOWER: + strClassType = STR_CLASS_LOWER; + goto compileStrClass; case STR_IS_PRINT: + strClassType = STR_CLASS_PRINT; + goto compileStrClass; case STR_IS_PUNCT: + strClassType = STR_CLASS_PUNCT; + goto compileStrClass; case STR_IS_SPACE: + strClassType = STR_CLASS_SPACE; + goto compileStrClass; case STR_IS_UPPER: + strClassType = STR_CLASS_UPPER; + goto compileStrClass; case STR_IS_WORD: + strClassType = STR_CLASS_WORD; + goto compileStrClass; case STR_IS_XDIGIT: - return TclCompileBasicMin0ArgCmd(interp, parsePtr, cmdPtr, envPtr); + strClassType = STR_CLASS_XDIGIT; + compileStrClass: + if (allowEmpty) { + OP1( STR_CLASS, strClassType); + } else { + int over, over2; + + OP( DUP); + OP1( STR_CLASS, strClassType); + JUMP1( JUMP_TRUE, over); + OP( POP); + PUSH( "0"); + JUMP1( JUMP, over2); + FIXJUMP1(over); + PUSH( ""); + OP( STR_NEQ); + FIXJUMP1(over2); + } + return TCL_OK; case STR_IS_BOOL: case STR_IS_FALSE: case STR_IS_TRUE: - CompileWord(envPtr, tokenPtr, interp, parsePtr->numWords-1); OP( TRY_CVT_TO_BOOLEAN); switch (t) { int over, over2; @@ -569,7 +617,6 @@ TclCompileStringIsCmd( case STR_IS_DOUBLE: { int satisfied, isEmpty; - CompileWord(envPtr, tokenPtr, interp, parsePtr->numWords-1); if (allowEmpty) { OP( DUP); PUSH( ""); @@ -598,7 +645,6 @@ TclCompileStringIsCmd( case STR_IS_INT: case STR_IS_WIDE: case STR_IS_ENTIER: - CompileWord(envPtr, tokenPtr, interp, parsePtr->numWords-1); if (allowEmpty) { int testNumType; @@ -638,7 +684,6 @@ TclCompileStringIsCmd( return TCL_OK; case STR_IS_LIST: - CompileWord(envPtr, tokenPtr, interp, parsePtr->numWords-1); range = TclCreateExceptRange(CATCH_EXCEPTION_RANGE, envPtr); OP4( BEGIN_CATCH4, range); ExceptionRangeStarts(envPtr, range); @@ -653,7 +698,8 @@ TclCompileStringIsCmd( OP( LNOT); return TCL_OK; } - return TCL_ERROR; + + return TclCompileBasicMin0ArgCmd(interp, parsePtr, cmdPtr, envPtr); } int @@ -1171,6 +1217,41 @@ TclCompileStringToTitleCmd( } /* + * Support definitions for the [string is] compilation. + */ + +static int +UniCharIsAscii( + int character) +{ + return (character >= 0) && (character < 0x80); +} + +static int +UniCharIsHexDigit( + int character) +{ + return (character >= 0) && (character < 0x80) && isxdigit(character); +} + +StringClassDesc const tclStringClassTable[] = { + {"alnum", Tcl_UniCharIsAlnum}, + {"alpha", Tcl_UniCharIsAlpha}, + {"ascii", UniCharIsAscii}, + {"control", Tcl_UniCharIsControl}, + {"digit", Tcl_UniCharIsDigit}, + {"graph", Tcl_UniCharIsGraph}, + {"lower", Tcl_UniCharIsLower}, + {"print", Tcl_UniCharIsPrint}, + {"punct", Tcl_UniCharIsPunct}, + {"space", Tcl_UniCharIsSpace}, + {"upper", Tcl_UniCharIsUpper}, + {"word", Tcl_UniCharIsWordChar}, + {"xdigit", UniCharIsHexDigit}, + {NULL, NULL} +}; + +/* *---------------------------------------------------------------------- * * TclCompileSubstCmd -- diff --git a/generic/tclCompile.c b/generic/tclCompile.c index fdc3e26..08a7a4c 100644 --- a/generic/tclCompile.c +++ b/generic/tclCompile.c @@ -644,6 +644,11 @@ InstructionDesc const tclInstructionTable[] = { {"tryCvtToBoolean", 1, +1, 0, {OPERAND_NONE}}, /* Try converting stktop to boolean if possible. No errors. * Stack: ... value => ... value isStrictBool */ + {"strclass", 2, 0, 1, {OPERAND_UINT1}}, + /* See if all the characters of the given string are a member of the + * specified (by opnd) character class. Note that an empty string will + * satisfy the class check (standard definition of "all"). + * Stack: ... stringValue => ... boolean */ {NULL, 0, 0, 0, {OPERAND_NONE}} }; diff --git a/generic/tclCompile.h b/generic/tclCompile.h index d6d515d..502a2e6 100644 --- a/generic/tclCompile.h +++ b/generic/tclCompile.h @@ -797,9 +797,10 @@ typedef struct ByteCode { #define INST_NUM_TYPE 182 #define INST_TRY_CVT_TO_BOOLEAN 183 +#define INST_STR_CLASS 184 /* The last opcode */ -#define LAST_INST_OPCODE 183 +#define LAST_INST_OPCODE 184 /* * Table describing the Tcl bytecode instructions: their name (for displaying @@ -844,6 +845,40 @@ typedef struct InstructionDesc { MODULE_SCOPE InstructionDesc const tclInstructionTable[]; /* + * Constants used by INST_STRING_CLASS to indicate character classes. These + * correspond closely by name with what [string is] can support, but there is + * no requirement to keep the values the same. + */ + +typedef enum InstStringClassType { + STR_CLASS_ALNUM, /* Unicode alphabet or digit characters. */ + STR_CLASS_ALPHA, /* Unicode alphabet characters. */ + STR_CLASS_ASCII, /* Characters in range U+000000..U+00007F. */ + STR_CLASS_CONTROL, /* Unicode control characters. */ + STR_CLASS_DIGIT, /* Unicode digit characters. */ + STR_CLASS_GRAPH, /* Unicode printing characters, excluding + * space. */ + STR_CLASS_LOWER, /* Unicode lower-case alphabet characters. */ + STR_CLASS_PRINT, /* Unicode printing characters, including + * spaces. */ + STR_CLASS_PUNCT, /* Unicode punctuation characters. */ + STR_CLASS_SPACE, /* Unicode space characters. */ + STR_CLASS_UPPER, /* Unicode upper-case alphabet characters. */ + STR_CLASS_WORD, /* Unicode word (alphabetic, digit, connector + * punctuation) characters. */ + STR_CLASS_XDIGIT /* Characters that can be used as digits in + * hexadecimal numbers ([0-9A-Fa-f]). */ +} InstStringClassType; + +typedef struct StringClassDesc { + const char *name; /* Name of the class. */ + int (*comparator)(int); /* Function to test if a single unicode + * character is a member of the class. */ +} StringClassDesc; + +MODULE_SCOPE StringClassDesc const tclStringClassTable[]; + +/* * Compilation of some Tcl constructs such as if commands and the logical or * (||) and logical and (&&) operators in expressions requires the generation * of forward jumps. Since the PC target of these jumps isn't known when the diff --git a/generic/tclExecute.c b/generic/tclExecute.c index 916de17..58d85e1 100644 --- a/generic/tclExecute.c +++ b/generic/tclExecute.c @@ -5810,6 +5810,25 @@ TEBCresume( TclNewIntObj(objResultPtr, match); NEXT_INST_F(1, 2, 1); + + case INST_STR_CLASS: + opnd = TclGetInt1AtPtr(pc+1); + valuePtr = OBJ_AT_TOS; + TRACE(("%s \"%.30s\" => ", tclStringClassTable[opnd].name, + O2S(valuePtr))); + ustring1 = Tcl_GetUnicodeFromObj(valuePtr, &length); + match = 1; + if (length > 0) { + end = ustring1 + length; + for (p=ustring1 ; p