From 2561cb41c0da4522531af13b664373518e0b8008 Mon Sep 17 00:00:00 2001 From: sebres Date: Tue, 10 Jan 2017 22:37:40 +0000 Subject: string index tree for fast greedy search of the string (index) by unique string prefix as key; clock scan rewritten to use string index tries search; --- generic/tclClock.c | 74 ++++--- generic/tclClockFmt.c | 444 ++++++++++++++++------------------------- generic/tclDate.h | 22 +- generic/tclStrIdxTree.c | 519 ++++++++++++++++++++++++++++++++++++++++++++++++ generic/tclStrIdxTree.h | 134 +++++++++++++ unix/Makefile.in | 4 + win/Makefile.in | 1 + win/makefile.vc | 1 + 8 files changed, 865 insertions(+), 334 deletions(-) create mode 100644 generic/tclStrIdxTree.c create mode 100644 generic/tclStrIdxTree.h diff --git a/generic/tclClock.c b/generic/tclClock.c index 0c08391..e52b2e7 100644 --- a/generic/tclClock.c +++ b/generic/tclClock.c @@ -15,6 +15,7 @@ */ #include "tclInt.h" +#include "tclStrIdxTree.h" #include "tclDate.h" /* @@ -140,6 +141,10 @@ static unsigned long TzsetGetEpoch(void); static void TzsetIfNecessary(void); static void ClockDeleteCmdProc(ClientData); +static int ClockTestObjCmd( + ClientData clientData, Tcl_Interp *interp, + int objc, Tcl_Obj *const objv[]); + /* * Structure containing description of "native" clock commands to create. */ @@ -169,6 +174,7 @@ static const struct ClockCommand clockCommands[] = { { "GetJulianDayFromEraYearWeekDay", ClockGetjuliandayfromerayearweekdayObjCmd }, { "ParseFormatArgs", ClockParseformatargsObjCmd }, + { "_test", TclStrIdxTreeTestObjCmd }, { NULL, NULL } }; @@ -584,7 +590,7 @@ ClockMCGet( } MODULE_SCOPE Tcl_Obj * -ClockMCGetListIdxDict( +ClockMCGetIdx( ClockFmtScnCmdArgs *opts, int mcKey) { @@ -598,53 +604,45 @@ ClockMCGetListIdxDict( return NULL; } - /* try to get indices dictionray, - * if not available - create from list */ + /* try to get indices object */ + if (dataPtr->mcLitIdxs == NULL) { + return NULL; + } if (Tcl_DictObjGet(NULL, opts->mcDictObj, dataPtr->mcLitIdxs[mcKey], &valObj) != TCL_OK ) { - Tcl_Obj **lstv, *intObj; - int i, lstc; + return NULL; + } - if (dataPtr->mcLitIdxs == NULL) { - dataPtr->mcLitIdxs = ckalloc(MCLIT__END * sizeof(Tcl_Obj*)); - for (i = 0; i < MCLIT__END; ++i) { - Tcl_InitObjRef(dataPtr->mcLitIdxs[i], - Tcl_NewStringObj(MsgCtLitIdxs[i], -1)); - } - } + return valObj; +} - if (Tcl_DictObjGet(opts->interp, opts->mcDictObj, - dataPtr->mcLiterals[mcKey], &valObj) != TCL_OK) { - return NULL; - }; - if (TclListObjGetElements(opts->interp, valObj, - &lstc, &lstv) != TCL_OK) { - return NULL; - }; +MODULE_SCOPE int +ClockMCSetIdx( + ClockFmtScnCmdArgs *opts, + int mcKey, Tcl_Obj *valObj) +{ + ClockClientData *dataPtr = opts->clientData; - valObj = Tcl_NewDictObj(); - for (i = 0; i < lstc; i++) { - intObj = Tcl_NewIntObj(i); - if (Tcl_DictObjPut(opts->interp, valObj, - lstv[i], intObj) != TCL_OK - ) { - Tcl_DecrRefCount(valObj); - Tcl_DecrRefCount(intObj); - return NULL; - } - }; + if (opts->mcDictObj == NULL) { + ClockMCDict(opts); + if (opts->mcDictObj == NULL) + return TCL_ERROR; + } - if (Tcl_DictObjPut(opts->interp, opts->mcDictObj, - dataPtr->mcLitIdxs[mcKey], valObj) != TCL_OK - ) { - Tcl_DecrRefCount(valObj); - return NULL; + /* if literal storage for indices not yet created */ + if (dataPtr->mcLitIdxs == NULL) { + int i; + dataPtr->mcLitIdxs = ckalloc(MCLIT__END * sizeof(Tcl_Obj*)); + for (i = 0; i < MCLIT__END; ++i) { + Tcl_InitObjRef(dataPtr->mcLitIdxs[i], + Tcl_NewStringObj(MsgCtLitIdxs[i], -1)); } - }; + } - return valObj; + return Tcl_DictObjPut(opts->interp, opts->mcDictObj, + dataPtr->mcLitIdxs[mcKey], valObj); } /* diff --git a/generic/tclClockFmt.c b/generic/tclClockFmt.c index 5469ee1..e66c525 100644 --- a/generic/tclClockFmt.c +++ b/generic/tclClockFmt.c @@ -11,6 +11,7 @@ */ #include "tclInt.h" +#include "tclStrIdxTree.h" #include "tclDate.h" /* @@ -33,6 +34,9 @@ static void ClockFmtScnStorageDelete(ClockFmtScnStorage *fss); static void ClockFrmScnFinalize(ClientData clientData); +/* Msgcat index literals prefixed with _IDX_, used for quick dictionary search */ +CLOCK_LOCALE_LITERAL_ARRAY(MsgCtLitIdxs, "_IDX_"); + /* * Clock scan and format facilities. */ @@ -583,6 +587,139 @@ LocaleListSearch(ClockFmtScnCmdArgs *opts, minLen, maxLen); } +static TclStrIdxTree * +ClockMCGetListIdxTree( + ClockFmtScnCmdArgs *opts, + int mcKey) +{ + TclStrIdxTree * idxTree; + Tcl_Obj *objPtr = ClockMCGetIdx(opts, mcKey); + if ( objPtr != NULL + && (idxTree = TclStrIdxTreeGetFromObj(objPtr)) != NULL + ) { + return idxTree; + + } else { + /* build new index */ + + Tcl_Obj **lstv; + int lstc; + Tcl_Obj *valObj; + + objPtr = TclStrIdxTreeNewObj(); + if ((idxTree = TclStrIdxTreeGetFromObj(objPtr)) == NULL) { + goto done; /* unexpected, but ...*/ + } + + valObj = ClockMCGet(opts, mcKey); + if (valObj == NULL) { + goto done; + } + + if (TclListObjGetElements(opts->interp, valObj, + &lstc, &lstv) != TCL_OK) { + goto done; + }; + + if (TclStrIdxTreeBuildFromList(idxTree, lstc, lstv) != TCL_OK) { + goto done; + } + + ClockMCSetIdx(opts, mcKey, objPtr); + objPtr = NULL; + }; + +done: + if (objPtr) { + Tcl_DecrRefCount(objPtr); + idxTree = NULL; + } + + return idxTree; +} + +static TclStrIdxTree * +ClockMCGetMultiListIdxTree( + ClockFmtScnCmdArgs *opts, + int mcKey, + int *mcKeys) +{ + TclStrIdxTree * idxTree; + Tcl_Obj *objPtr = ClockMCGetIdx(opts, mcKey); + if ( objPtr != NULL + && (idxTree = TclStrIdxTreeGetFromObj(objPtr)) != NULL + ) { + return idxTree; + + } else { + /* build new index */ + + Tcl_Obj **lstv; + int lstc; + Tcl_Obj *valObj; + + objPtr = TclStrIdxTreeNewObj(); + if ((idxTree = TclStrIdxTreeGetFromObj(objPtr)) == NULL) { + goto done; /* unexpected, but ...*/ + } + + while (*mcKeys) { + + valObj = ClockMCGet(opts, *mcKeys); + if (valObj == NULL) { + goto done; + } + + if (TclListObjGetElements(opts->interp, valObj, + &lstc, &lstv) != TCL_OK) { + goto done; + }; + + if (TclStrIdxTreeBuildFromList(idxTree, lstc, lstv) != TCL_OK) { + goto done; + } + mcKeys++; + } + + ClockMCSetIdx(opts, mcKey, objPtr); + }; + +done: + if (objPtr) { + Tcl_DecrRefCount(objPtr); + idxTree = NULL; + } + + return idxTree; +} + +inline int +ClockStrIdxTreeSearch(ClockFmtScnCmdArgs *opts, + DateInfo *info, TclStrIdxTree *idxTree, int *val, + int minLen, int maxLen) +{ + const char *f; + TclStrIdx *foundItem; + f = TclStrIdxTreeSearch(NULL, &foundItem, idxTree, + yyInput, yyInput + maxLen); + + if (f <= yyInput || (f - yyInput) < minLen) { + /* not found */ + return TCL_RETURN; + } + if (foundItem->value == -1) { + /* ambigous */ + return TCL_RETURN; + } + + *val = foundItem->value; + + /* shift input pointer */ + yyInput = f; + + return TCL_OK; +} + static int StaticListSearch(ClockFmtScnCmdArgs *opts, DateInfo *info, const char **lst, int *val) @@ -612,21 +749,19 @@ FindWordEnd( register const char * p, const char * end) { register const char *x = tok->tokWord.start; - if (x == tok->tokWord.end) { /* single char word */ - if (*p != *x) { - /* no match -> error */ - return NULL; + const char *pfnd; + if (x == tok->tokWord.end - 1) { /* fast phase-out for single char word */ + if (*p == *x) { + return ++p; } - return ++p; } /* multi-char word */ - do - if (*p++ != *x++) { - /* no match -> error */ - return NULL; - } - while (x <= tok->tokWord.end && p < end); - return p; + x = TclUtfFindEqualNC(x, tok->tokWord.end, p, end, &pfnd); + if (x < tok->tokWord.end) { + /* no match -> error */ + return NULL; + } + return pfnd; } static int @@ -822,11 +957,18 @@ ClockScnToken_LocaleListMatcher_Proc(ClockFmtScnCmdArgs *opts, { int ret, val; int minLen, maxLen; + TclStrIdxTree *idxTree; DetermineGreedySearchLen(opts, info, tok, &minLen, &maxLen); - ret = LocaleListSearch(opts, info, (int)tok->map->data, &val, - minLen, maxLen); + /* get or create tree in msgcat dict */ + + idxTree = ClockMCGetListIdxTree(opts, (int)tok->map->data /* mcKey */); + if (idxTree == NULL) { + return TCL_ERROR; + } + + ret = ClockStrIdxTreeSearch(opts, info, idxTree, &val, minLen, maxLen); if (ret != TCL_OK) { return ret; } @@ -1120,7 +1262,8 @@ ClockGetOrParseScanFormat( /* begin new word token - don't join with previous word token, * because current mapping should be "...%%..." -> "...%..." */ tok->map = &ScnWordTokenMap; - tok->tokWord.start = tok->tokWord.end = p; + tok->tokWord.start = p; + tok->tokWord.end = p+1; AllocTokenInChain(tok, fss->scnTok, fss->scnTokC); continue; break; @@ -1190,7 +1333,7 @@ word_tok: if (tok > fss->scnTok && (tok-1)->map == &ScnWordTokenMap) { wordTok = tok-1; } - wordTok->tokWord.end = p; + wordTok->tokWord.end = p+1; if (wordTok == tok) { wordTok->tokWord.start = p; wordTok->map = &ScnWordTokenMap; @@ -1214,7 +1357,7 @@ word_tok: if (prevTok->map->type != CTOKT_WORD) { endDist += prevTok->map->minSize; } else { - endDist += prevTok->tokWord.end - prevTok->tokWord.start + 1; + endDist += prevTok->tokWord.end - prevTok->tokWord.start; } prevTok--; } @@ -1325,6 +1468,11 @@ ClockScan( yyMeridian = MER24; + /* lower case given string into new object */ + strObj = Tcl_NewStringObj(TclGetString(strObj), strObj->length); + Tcl_IncrRefCount(strObj); + strObj->length = Tcl_UtfToLower(TclGetString(strObj)); + p = TclGetString(strObj); end = p + strObj->length; /* in strict mode - bypass spaces at begin / end only (not between tokens) */ @@ -1578,6 +1726,8 @@ not_match: done: + Tcl_DecrRefCount(strObj); + return ret; } @@ -1604,266 +1754,6 @@ ClockFormat( return TCL_ERROR; } */ -#if 0 - /* prepare parsing */ - - yyMeridian = MER24; - - p = TclGetString(strObj); - end = p + strObj->length; - /* in strict mode - bypass spaces at begin / end only (not between tokens) */ - if (opts->flags & CLF_STRICT) { - while (p < end && isspace(UCHAR(*p))) { - p++; - } - } - info->dateStart = yyInput = p; - info->dateEnd = end; - - /* parse string */ - for (; tok->map != NULL; tok++) { - map = tok->map; - /* bypass spaces at begin of input before parsing each token */ - if ( !(opts->flags & CLF_STRICT) - && (map->type != CTOKT_SPACE && map->type != CTOKT_WORD) - ) { - while (p < end && isspace(UCHAR(*p))) { - p++; - } - } - yyInput = p; - switch (map->type) - { - case CTOKT_DIGIT: - if (1) { - int size = map->maxSize; - int sign = 1; - if (map->flags & CLF_SIGNED) { - if (*p == '+') { yyInput = ++p; } - else - if (*p == '-') { yyInput = ++p; sign = -1; }; - } - /* greedy find digits (look for forward digits consider spaces), - * corresponding pre-calculated lookAhead */ - if (size != map->minSize && tok->lookAhead) { - int spcnt = 0; - const char *pe; - size += tok->lookAhead; - x = p + size; if (x > end) { x = end; }; - pe = x; - while (p < x) { - if (isspace(UCHAR(*p))) { - if (pe > p) { pe = p; }; - if (x < end) x++; - p++; - spcnt++; - continue; - } - if (isdigit(UCHAR(*p))) { - p++; - continue; - } - break; - } - /* consider reserved (lookAhead) for next tokens */ - p -= tok->lookAhead + spcnt; - if (p > pe) { - p = pe; - } - } else { - x = p + size; if (x > end) { x = end; }; - while (isdigit(UCHAR(*p)) && p < x) { p++; }; - } - size = p - yyInput; - if (size < map->minSize) { - /* missing input -> error */ - goto not_match; - } - /* string 2 number, put number into info structure by offset */ - p = yyInput; x = p + size; - if (!(map->flags & CLF_LOCALSEC)) { - if (_str2int((time_t *)(((char *)info) + map->offs), - p, x, sign) != TCL_OK) { - goto overflow; - } - p = x; - } else { - if (_str2wideInt((Tcl_WideInt *)(((char *)info) + map->offs), - p, x, sign) != TCL_OK) { - goto overflow; - } - p = x; - } - flags = (flags & ~map->clearFlags) | map->flags; - } - break; - case CTOKT_PARSER: - switch (map->parser(opts, info, tok)) { - case TCL_OK: - break; - case TCL_RETURN: - goto not_match; - break; - default: - goto done; - break; - }; - p = yyInput; - flags = (flags & ~map->clearFlags) | map->flags; - break; - case CTOKT_SPACE: - /* at least one space in strict mode */ - if (opts->flags & CLF_STRICT) { - if (!isspace(UCHAR(*p))) { - /* unmatched -> error */ - goto not_match; - } - p++; - } - while (p < end && isspace(UCHAR(*p))) { - p++; - } - break; - case CTOKT_WORD: - x = FindWordEnd(tok, p, end); - if (!x) { - /* no match -> error */ - goto not_match; - } - p = x; - continue; - break; - } - } - - /* ignore spaces at end */ - while (p < end && isspace(UCHAR(*p))) { - p++; - } - /* check end was reached */ - if (p < end) { - /* something after last token - wrong format */ - goto not_match; - } - - /* - * Invalidate result - */ - - /* seconds token (%s) take precedence over all other tokens */ - if ((opts->flags & CLF_EXTENDED) || !(flags & CLF_LOCALSEC)) { - if (flags & CLF_DATE) { - - if (!(flags & CLF_JULIANDAY)) { - info->flags |= CLF_ASSEMBLE_SECONDS|CLF_ASSEMBLE_JULIANDAY; - - /* dd precedence below ddd */ - switch (flags & (CLF_MONTH|CLF_DAYOFYEAR|CLF_DAYOFMONTH)) { - case (CLF_DAYOFYEAR|CLF_DAYOFMONTH): - /* miss month: ddd over dd (without month) */ - flags &= ~CLF_DAYOFMONTH; - case (CLF_DAYOFYEAR): - /* ddd over naked weekday */ - if (!(flags & CLF_ISO8601YEAR)) { - flags &= ~CLF_ISO8601; - } - break; - case (CLF_MONTH|CLF_DAYOFYEAR|CLF_DAYOFMONTH): - /* both available: mmdd over ddd */ - flags &= ~CLF_DAYOFYEAR; - case (CLF_MONTH|CLF_DAYOFMONTH): - case (CLF_DAYOFMONTH): - /* mmdd / dd over naked weekday */ - if (!(flags & CLF_ISO8601YEAR)) { - flags &= ~CLF_ISO8601; - } - break; - } - - /* YearWeekDay below YearMonthDay */ - if ( (flags & CLF_ISO8601) - && ( (flags & (CLF_YEAR|CLF_DAYOFYEAR)) == (CLF_YEAR|CLF_DAYOFYEAR) - || (flags & (CLF_YEAR|CLF_DAYOFMONTH|CLF_MONTH)) == (CLF_YEAR|CLF_DAYOFMONTH|CLF_MONTH) - ) - ) { - /* yy precedence below yyyy */ - if (!(flags & CLF_ISO8601CENTURY) && (flags & CLF_CENTURY)) { - /* normally precedence of ISO is higher, but no century - so put it down */ - flags &= ~CLF_ISO8601; - } - else - /* yymmdd or yyddd over naked weekday */ - if (!(flags & CLF_ISO8601YEAR)) { - flags &= ~CLF_ISO8601; - } - } - - if (!(flags & CLF_ISO8601)) { - if (yyYear < 100) { - if (!(flags & CLF_CENTURY)) { - if (yyYear >= dataPtr->yearOfCenturySwitch) { - yyYear -= 100; - } - yyYear += dataPtr->currentYearCentury; - } else { - yyYear += info->dateCentury * 100; - } - } - } else { - if (info->date.iso8601Year < 100) { - if (!(flags & CLF_ISO8601CENTURY)) { - if (info->date.iso8601Year >= dataPtr->yearOfCenturySwitch) { - info->date.iso8601Year -= 100; - } - info->date.iso8601Year += dataPtr->currentYearCentury; - } else { - info->date.iso8601Year += info->dateCentury * 100; - } - } - } - } - } - - /* if no time - reset time */ - if (!(flags & (CLF_TIME|CLF_LOCALSEC))) { - info->flags |= CLF_ASSEMBLE_SECONDS; - yydate.localSeconds = 0; - } - - if (flags & CLF_TIME) { - info->flags |= CLF_ASSEMBLE_SECONDS; - yySeconds = ToSeconds(yyHour, yyMinutes, - yySeconds, yyMeridian); - } else - if (!(flags & CLF_LOCALSEC)) { - info->flags |= CLF_ASSEMBLE_SECONDS; - yySeconds = yydate.localSeconds % SECONDS_PER_DAY; - } - } - - /* tell caller which flags were set */ - info->flags |= flags; - - ret = TCL_OK; - goto done; - -overflow: - - Tcl_SetResult(interp, "requested date too large to represent", - TCL_STATIC); - Tcl_SetErrorCode(interp, "CLOCK", "dateTooLarge", NULL); - goto done; - -not_match: - - Tcl_SetResult(interp, "input string does not match supplied format", - TCL_STATIC); - Tcl_SetErrorCode(interp, "CLOCK", "badInputString", NULL); - -done: - - return ret; -#endif } diff --git a/generic/tclDate.h b/generic/tclDate.h index 112ed31..2728dd3 100644 --- a/generic/tclDate.h +++ b/generic/tclDate.h @@ -126,24 +126,6 @@ typedef enum ClockMsgCtLiteral { } /* - * Primitives to safe set, reset and free references. - */ - -#define Tcl_UnsetObjRef(obj) \ - if (obj != NULL) { Tcl_DecrRefCount(obj); obj = NULL; } -#define Tcl_InitObjRef(obj, val) \ - obj = val; if (obj) { Tcl_IncrRefCount(obj); } -#define Tcl_SetObjRef(obj, val) \ -if (1) { \ - Tcl_Obj *nval = val; \ - if (obj != nval) { \ - Tcl_Obj *prev = obj; \ - Tcl_InitObjRef(obj, nval); \ - if (prev != NULL) { Tcl_DecrRefCount(prev); }; \ - } \ -} - -/* * Structure containing the fields used in [clock format] and [clock scan] */ @@ -450,7 +432,9 @@ MODULE_SCOPE Tcl_Obj * MODULE_SCOPE Tcl_Obj * ClockMCGet(ClockFmtScnCmdArgs *opts, int mcKey); MODULE_SCOPE Tcl_Obj * - ClockMCGetListIdxDict(ClockFmtScnCmdArgs *opts, int mcKey); + ClockMCGetIdx(ClockFmtScnCmdArgs *opts, int mcKey); +MODULE_SCOPE int ClockMCSetIdx(ClockFmtScnCmdArgs *opts, int mcKey, + Tcl_Obj *valObj); /* tclClockFmt.c module declarations */ diff --git a/generic/tclStrIdxTree.c b/generic/tclStrIdxTree.c new file mode 100644 index 0000000..f078c7a --- /dev/null +++ b/generic/tclStrIdxTree.c @@ -0,0 +1,519 @@ +/* + * tclStrIdxTree.c -- + * + * Contains the routines for managing string index tries in Tcl. + * + * This code is back-ported from the tclSE engine, by Serg G. Brester. + * + * Copyright (c) 2016 by Sergey G. Brester aka sebres. All rights reserved. + * + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES. + * + * ----------------------------------------------------------------------- + * + * String index tries are prepaired structures used for fast greedy search of the string + * (index) by unique string prefix as key. + * + * Index tree build for two lists together can be explained in the following datagram + * + * Lists: + * + * {Januar Februar Maerz April Mai Juni Juli August September Oktober November Dezember} + * {Jnr Fbr Mrz Apr Mai Jni Jli Agt Spt Okt Nvb Dzb} + * + * Index-Tree: + * + * j -1 * ... + * anuar 0 * + * u -1 * a -1 + * ni 5 * pril 3 + * li 6 * ugust 7 + * n -1 * gt 7 + * r 0 * s 8 + * i 5 * eptember 8 + * li 6 * pt 8 + * f 1 * oktober 9 + * ebruar 1 * n 10 + * br 1 * ovember 10 + * m -1 * vb 10 + * a -1 * d 11 + * erz 2 * ezember 11 + * i 4 * zb 11 + * rz 2 * + * ... + * + * Thereby value -1 shows pure group items (corresponding ambigous matches). + * + * StrIdxTree's are very fast, so: + * build of above-mentioned tree takes about 10 microseconds. + * search of string index in this tree takes fewer as 0.1 microseconds. + * + */ + +#include "tclInt.h" +#include "tclStrIdxTree.h" + + +/* + *---------------------------------------------------------------------- + * + * TclStrIdxTreeSearch -- + * + * Find largest part of string "start" in indexed tree (case sensitive). + * + * Also used for building of string index tree. + * + * Results: + * Return position of UTF character in start after last equal character + * and found item (with parent). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +MODULE_SCOPE const char* +TclStrIdxTreeSearch( + TclStrIdxTree **foundParent, /* Return value of found sub tree (used for tree build) */ + TclStrIdx **foundItem, /* Return value of found item */ + TclStrIdxTree *tree, /* Index tree will be browsed */ + const char *start, /* UTF string to find in tree */ + const char *end) /* End of string */ +{ + TclStrIdxTree *parent = tree, *prevParent = tree; + TclStrIdx *item = tree->firstPtr, *prevItem = NULL; + const char *s = start, *e, *cin, *preve; + int offs = 0; + + if (item == NULL) { + goto done; + } + + /* search in tree */ + do { + cin = TclGetString(item->key) + offs; + e = TclUtfFindEqual(s, end, cin, cin + item->length); + /* if something was found */ + if (e > s) { + /* if whole string was found */ + if (e >= end) { + start = e; + goto done; + }; + /* set new offset and shift start string */ + offs += (e - s); + s = e; + /* if match item, go deeper as long as possible */ + if (offs >= item->length && item->childTree.firstPtr) { + /* save previuosly found item (if not ambigous) for + * possible fallback (few greedy match) */ + if (item->value != -1) { + preve = e; + prevItem = item; + prevParent = parent; + } + parent = &item->childTree; + item = item->childTree.firstPtr; + continue; + } + /* no children - return this item and current chars found */ + start = e; + goto done; + } + + item = item->nextPtr; + + } while (item != NULL); + + /* fallback (few greedy match) not ambigous (has a value) */ + if (prevItem != NULL) { + item = prevItem; + parent = prevParent; + start = preve; + } + +done: + + if (foundParent) + *foundParent = parent; + if (foundItem) + *foundItem = item; + return start; +} + +MODULE_SCOPE void +TclStrIdxTreeFree( + TclStrIdx *tree) +{ + while (tree != NULL) { + TclStrIdx *t; + Tcl_DecrRefCount(tree->key); + if (tree->childTree.firstPtr != NULL) { + TclStrIdxTreeFree(tree->childTree.firstPtr); + } + t = tree, tree = tree->nextPtr; + ckfree(t); + } +} + +/* + * Several bidirectional list primitives + */ +inline void +TclStrIdxTreeInsertBranch( + TclStrIdxTree *parent, + register TclStrIdx *item, + register TclStrIdx *child) +{ + if (parent->firstPtr == child) + parent->firstPtr = item; + if (parent->lastPtr == child) + parent->lastPtr = item; + if (item->nextPtr = child->nextPtr) { + item->nextPtr->prevPtr = item; + child->nextPtr = NULL; + } + if (item->prevPtr = child->prevPtr) { + item->prevPtr->nextPtr = item; + child->prevPtr = NULL; + } + item->childTree.firstPtr = child; + item->childTree.lastPtr = child; +} + +inline void +TclStrIdxTreeAppend( + register TclStrIdxTree *parent, + register TclStrIdx *item) +{ + if (parent->lastPtr != NULL) { + parent->lastPtr->nextPtr = item; + } + item->prevPtr = parent->lastPtr; + item->nextPtr = NULL; + parent->lastPtr = item; + if (parent->firstPtr == NULL) { + parent->firstPtr = item; + } +} + + +/* + *---------------------------------------------------------------------- + * + * TclStrIdxTreeBuildFromList -- + * + * Build or extend string indexed tree from tcl list. + * + * Important: by multiple lists, optimal tree can be created only if list with + * larger strings used firstly. + * + * Results: + * Returns a standard Tcl result. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +MODULE_SCOPE int +TclStrIdxTreeBuildFromList( + TclStrIdxTree *idxTree, + int lstc, + Tcl_Obj **lstv) +{ + Tcl_Obj **lwrv; + int i, ret = TCL_ERROR; + const char *s, *e, *f; + TclStrIdx *item; + + /* create lowercase reflection of the list keys */ + + lwrv = ckalloc(sizeof(Tcl_Obj*) * lstc); + if (lwrv == NULL) { + return TCL_ERROR; + } + for (i = 0; i < lstc; i++) { + lwrv[i] = Tcl_DuplicateObj(lstv[i]); + if (lwrv[i] == NULL) { + return TCL_ERROR; + } + Tcl_IncrRefCount(lwrv[i]); + lwrv[i]->length = Tcl_UtfToLower(TclGetString(lwrv[i])); + } + + /* build index tree of the list keys */ + for (i = 0; i < lstc; i++) { + TclStrIdxTree *foundParent = idxTree; + e = s = TclGetString(lwrv[i]); + e += lwrv[i]->length; + + /* ignore empty values (impossible to index it) */ + if (lwrv[i]->length == 0) continue; + + item = NULL; + if (idxTree->firstPtr != NULL) { + TclStrIdx *foundItem; + f = TclStrIdxTreeSearch(&foundParent, &foundItem, + idxTree, s, e); + /* if common prefix was found */ + if (f > s) { + /* ignore element if fulfilled or ambigous */ + if (f == e) { + continue; + } + /* if shortest key was found with the same value, + * just replace its current key with longest key */ + if ( foundItem->value == i + && foundItem->length < lwrv[i]->length + && foundItem->childTree.firstPtr == NULL + ) { + Tcl_SetObjRef(foundItem->key, lwrv[i]); + foundItem->length = lwrv[i]->length; + continue; + } + /* split tree (e. g. j->(jan,jun) + jul == j->(jan,ju->(jun,jul)) ) + * but don't split by fulfilled child of found item ( ii->iii->iiii ) */ + if (foundItem->length != (f - s)) { + /* first split found item (insert one between parent and found + new one) */ + item = ckalloc(sizeof(*item)); + if (item == NULL) { + goto done; + } + Tcl_InitObjRef(item->key, foundItem->key); + item->length = f - s; + /* set value or mark as ambigous if not the same value of both */ + item->value = (foundItem->value == i) ? i : -1; + /* insert group item between foundParent and foundItem */ + TclStrIdxTreeInsertBranch(foundParent, item, foundItem); + foundParent = &item->childTree; + } else { + /* the new item should be added as child of found item */ + foundParent = &foundItem->childTree; + } + } + } + /* append item at end of found parent */ + item = ckalloc(sizeof(*item)); + if (item == NULL) { + goto done; + } + item->childTree.lastPtr = item->childTree.firstPtr = NULL; + Tcl_InitObjRef(item->key, lwrv[i]); + item->length = lwrv[i]->length; + item->value = i; + TclStrIdxTreeAppend(foundParent, item); + }; + + ret = TCL_OK; + +done: + + if (lwrv != NULL) { + for (i = 0; i < lstc; i++) { + Tcl_DecrRefCount(lwrv[i]); + } + ckfree(lwrv); + } + + if (ret != TCL_OK) { + if (idxTree->firstPtr != NULL) { + TclStrIdxTreeFree(idxTree->firstPtr); + } + } + + return ret; +} + + +static void +StrIdxTreeObj_DupIntRepProc(Tcl_Obj *srcPtr, Tcl_Obj *copyPtr); +static void +StrIdxTreeObj_FreeIntRepProc(Tcl_Obj *objPtr); +static void +StrIdxTreeObj_UpdateStringProc(Tcl_Obj *objPtr); + +Tcl_ObjType StrIdxTreeObjType = { + "str-idx-tree", /* name */ + StrIdxTreeObj_FreeIntRepProc, /* freeIntRepProc */ + StrIdxTreeObj_DupIntRepProc, /* dupIntRepProc */ + StrIdxTreeObj_UpdateStringProc, /* updateStringProc */ + NULL /* setFromAnyProc */ +}; + +MODULE_SCOPE Tcl_Obj* +TclStrIdxTreeNewObj() +{ + Tcl_Obj *objPtr = Tcl_NewObj(); + objPtr->internalRep.twoPtrValue.ptr1 = NULL; + objPtr->internalRep.twoPtrValue.ptr2 = NULL; + objPtr->typePtr = &StrIdxTreeObjType; + /* return tree root in internal representation */ + return objPtr; +} + +static void +StrIdxTreeObj_DupIntRepProc(Tcl_Obj *srcPtr, Tcl_Obj *copyPtr) +{ + /* follow links (smart pointers) */ + if ( srcPtr->internalRep.twoPtrValue.ptr1 != NULL + && srcPtr->internalRep.twoPtrValue.ptr2 == NULL + ) { + srcPtr = (Tcl_Obj*)srcPtr->internalRep.twoPtrValue.ptr1; + } + /* create smart pointer to it (ptr1 != NULL, ptr2 = NULL) */ + Tcl_InitObjRef(((Tcl_Obj *)copyPtr->internalRep.twoPtrValue.ptr1), + srcPtr); + copyPtr->internalRep.twoPtrValue.ptr2 = NULL; + copyPtr->typePtr = &StrIdxTreeObjType; +} + +static void +StrIdxTreeObj_FreeIntRepProc(Tcl_Obj *objPtr) +{ + /* follow links (smart pointers) */ + if ( objPtr->internalRep.twoPtrValue.ptr1 != NULL + && objPtr->internalRep.twoPtrValue.ptr2 == NULL + ) { + /* is a link */ + Tcl_UnsetObjRef(((Tcl_Obj *)objPtr->internalRep.twoPtrValue.ptr1)); + } else { + /* is a tree */ + TclStrIdxTree *tree = (TclStrIdxTree*)&objPtr->internalRep.twoPtrValue.ptr1; + if (tree->firstPtr != NULL) { + TclStrIdxTreeFree(tree->firstPtr); + } + objPtr->internalRep.twoPtrValue.ptr1 = NULL; + objPtr->internalRep.twoPtrValue.ptr2 = NULL; + } + objPtr->typePtr = NULL; +}; + +static void +StrIdxTreeObj_UpdateStringProc(Tcl_Obj *objPtr) +{ + /* currently only dummy empty string possible */ + objPtr->length = 0; + objPtr->bytes = tclEmptyStringRep; +}; + +MODULE_SCOPE TclStrIdxTree * +TclStrIdxTreeGetFromObj(Tcl_Obj *objPtr) { + /* follow links (smart pointers) */ + if (objPtr->typePtr != &StrIdxTreeObjType) { + return NULL; + } + if ( objPtr->internalRep.twoPtrValue.ptr1 != NULL + && objPtr->internalRep.twoPtrValue.ptr2 == NULL + ) { + objPtr = (Tcl_Obj*)objPtr->internalRep.twoPtrValue.ptr1; + } + /* return tree root in internal representation */ + return (TclStrIdxTree*)&objPtr->internalRep.twoPtrValue.ptr1; +} + +/* + * Several debug primitives + */ +#if 1 + +void +TclStrIdxTreePrint( + Tcl_Interp *interp, + TclStrIdx *tree, + int offs) +{ + Tcl_Obj *obj[2]; + const char *s; + Tcl_InitObjRef(obj[0], Tcl_NewStringObj("::puts", -1)); + while (tree != NULL) { + s = TclGetString(tree->key) + offs; + Tcl_InitObjRef(obj[1], Tcl_ObjPrintf("%*s%.*s\t:%d", + offs, "", tree->length - offs, s, tree->value)); + Tcl_PutsObjCmd(NULL, interp, 2, obj); + Tcl_UnsetObjRef(obj[1]); + if (tree->childTree.firstPtr != NULL) { + TclStrIdxTreePrint(interp, tree->childTree.firstPtr, tree->length); + } + tree = tree->nextPtr; + } + Tcl_UnsetObjRef(obj[0]); +} + + +MODULE_SCOPE int +TclStrIdxTreeTestObjCmd( + ClientData clientData, Tcl_Interp *interp, + int objc, Tcl_Obj *const objv[]) +{ + const char *cs, *cin, *ret; + + static const char *const options[] = { + "index", "puts-index", "findequal", + NULL + }; + enum optionInd { + O_INDEX, O_PUTS_INDEX, O_FINDEQUAL + }; + int optionIndex; + + if (objc < 2) { + Tcl_SetResult(interp, "wrong # args", TCL_STATIC); + return TCL_ERROR; + } + if (Tcl_GetIndexFromObj(interp, objv[1], options, + "option", 0, &optionIndex) != TCL_OK) { + Tcl_SetErrorCode(interp, "CLOCK", "badOption", + Tcl_GetString(objv[1]), NULL); + return TCL_ERROR; + } + switch (optionIndex) { + case O_FINDEQUAL: + if (objc < 4) { + Tcl_SetResult(interp, "wrong # args", TCL_STATIC); + return TCL_ERROR; + } + cs = TclGetString(objv[2]); + cin = TclGetString(objv[3]); + ret = TclUtfFindEqual( + cs, cs + objv[1]->length, cin, cin + objv[2]->length); + Tcl_SetObjResult(interp, Tcl_NewIntObj(ret - cs)); + break; + case O_INDEX: + case O_PUTS_INDEX: + + if (1) { + Tcl_Obj **lstv; + int i, lstc; + TclStrIdxTree idxTree = {NULL, NULL}; + i = 1; + while (++i < objc) { + if (TclListObjGetElements(interp, objv[i], + &lstc, &lstv) != TCL_OK) { + return TCL_ERROR; + }; + TclStrIdxTreeBuildFromList(&idxTree, lstc, lstv); + } + if (optionIndex == O_PUTS_INDEX) { + TclStrIdxTreePrint(interp, idxTree.firstPtr, 0); + } + TclStrIdxTreeFree(idxTree.firstPtr); + } + break; + } + + return TCL_OK; +} + +#endif + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * End: + */ diff --git a/generic/tclStrIdxTree.h b/generic/tclStrIdxTree.h new file mode 100644 index 0000000..e80d3db --- /dev/null +++ b/generic/tclStrIdxTree.h @@ -0,0 +1,134 @@ +/* + * tclStrIdxTree.h -- + * + * Declarations of string index tries and other primitives currently + * back-ported from tclSE. + * + * Copyright (c) 2016 Serg G. Brester (aka sebres) + * + * See the file "license.terms" for information on usage and redistribution + * of this file, and for a DISCLAIMER OF ALL WARRANTIES. + */ + +#ifndef _TCLSTRIDXTREE_H +#define _TCLSTRIDXTREE_H + + +/* + * Main structures declarations of index tree and entry + */ + +typedef struct TclStrIdxTree { + struct TclStrIdx *firstPtr; + struct TclStrIdx *lastPtr; +} TclStrIdxTree; + +typedef struct TclStrIdx { + struct TclStrIdxTree childTree; + struct TclStrIdx *nextPtr; + struct TclStrIdx *prevPtr; + Tcl_Obj *key; + int length; + int value; +} TclStrIdx; + + +/* + *---------------------------------------------------------------------- + * + * TclUtfFindEqual, TclUtfFindEqualNC -- + * + * Find largest part of string cs in string cin (case sensitive and not). + * + * Results: + * Return position of UTF character in cs after last equal character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +inline const char * +TclUtfFindEqual( + register const char *cs, /* UTF string to find in cin. */ + register const char *cse, /* End of cs */ + register const char *cin, /* UTF string will be browsed. */ + register const char *cine) /* End of cin */ +{ + register const char *ret = cs; + Tcl_UniChar ch1, ch2; + do { + cs += TclUtfToUniChar(cs, &ch1); + cin += TclUtfToUniChar(cin, &ch2); + if (ch1 != ch2) break; + } while ((ret = cs) < cse && cin < cine); + return ret; +} + +inline const char * +TclUtfFindEqualNC( + register const char *cs, /* UTF string to find in cin. */ + register const char *cse, /* End of cs */ + register const char *cin, /* UTF string will be browsed. */ + register const char *cine, /* End of cin */ + const char **cinfnd) /* Return position in cin */ +{ + register const char *ret = cs; + Tcl_UniChar ch1, ch2; + do { + cs += TclUtfToUniChar(cs, &ch1); + cin += TclUtfToUniChar(cin, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) break; + } + *cinfnd = cin; + } while ((ret = cs) < cse && cin < cine); + return ret; +} + +/* + * Primitives to safe set, reset and free references. + */ + +#define Tcl_UnsetObjRef(obj) \ + if (obj != NULL) { Tcl_DecrRefCount(obj); obj = NULL; } +#define Tcl_InitObjRef(obj, val) \ + obj = val; if (obj) { Tcl_IncrRefCount(obj); } +#define Tcl_SetObjRef(obj, val) \ +if (1) { \ + Tcl_Obj *nval = val; \ + if (obj != nval) { \ + Tcl_Obj *prev = obj; \ + Tcl_InitObjRef(obj, nval); \ + if (prev != NULL) { Tcl_DecrRefCount(prev); }; \ + } \ +} + +/* + * Prototypes of module functions. + */ + +MODULE_SCOPE const char* + TclStrIdxTreeSearch(TclStrIdxTree **foundParent, + TclStrIdx **foundItem, TclStrIdxTree *tree, + const char *start, const char *end); + +MODULE_SCOPE int TclStrIdxTreeBuildFromList(TclStrIdxTree *idxTree, + int lstc, Tcl_Obj **lstv); + +MODULE_SCOPE Tcl_Obj* + TclStrIdxTreeNewObj(); + +MODULE_SCOPE TclStrIdxTree* + TclStrIdxTreeGetFromObj(Tcl_Obj *objPtr); + +#if 1 + +MODULE_SCOPE int TclStrIdxTreeTestObjCmd(ClientData, Tcl_Interp *, + int, Tcl_Obj *const objv[]); +#endif + +#endif /* _TCLSTRIDXTREE_H */ diff --git a/unix/Makefile.in b/unix/Makefile.in index b220139..19ab6ec 100644 --- a/unix/Makefile.in +++ b/unix/Makefile.in @@ -451,6 +451,7 @@ GENERIC_SRCS = \ $(GENERIC_DIR)/tclScan.c \ $(GENERIC_DIR)/tclStubInit.c \ $(GENERIC_DIR)/tclStringObj.c \ + $(GENERIC_DIR)/tclStrIdxTree.c \ $(GENERIC_DIR)/tclStrToD.c \ $(GENERIC_DIR)/tclTest.c \ $(GENERIC_DIR)/tclTestObj.c \ @@ -1305,6 +1306,9 @@ tclScan.o: $(GENERIC_DIR)/tclScan.c tclStringObj.o: $(GENERIC_DIR)/tclStringObj.c $(MATHHDRS) $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/tclStringObj.c +tclStrIdxTree.o: $(GENERIC_DIR)/tclStrIdxTree.c $(MATHHDRS) + $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/tclStrIdxTree.c + tclStrToD.o: $(GENERIC_DIR)/tclStrToD.c $(MATHHDRS) $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/tclStrToD.c diff --git a/win/Makefile.in b/win/Makefile.in index 82e5516..478bbb9 100644 --- a/win/Makefile.in +++ b/win/Makefile.in @@ -291,6 +291,7 @@ GENERIC_OBJS = \ tclResult.$(OBJEXT) \ tclScan.$(OBJEXT) \ tclStringObj.$(OBJEXT) \ + tclStrIdxTree.$(OBJEXT) \ tclStrToD.$(OBJEXT) \ tclStubInit.$(OBJEXT) \ tclThread.$(OBJEXT) \ diff --git a/win/makefile.vc b/win/makefile.vc index d6dbf85..48bacbc 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -333,6 +333,7 @@ COREOBJS = \ $(TMP_DIR)\tclResult.obj \ $(TMP_DIR)\tclScan.obj \ $(TMP_DIR)\tclStringObj.obj \ + $(TMP_DIR)\tclStrIdxTree.obj \ $(TMP_DIR)\tclStrToD.obj \ $(TMP_DIR)\tclStubInit.obj \ $(TMP_DIR)\tclThread.obj \ -- cgit v0.12