From b99f44e2b60aa470e6f3b2c7f119b3f1a77f8606 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Thu, 27 Jun 2024 13:45:10 +0000 Subject: (backport) Add tcl::unsupported::icu command (why not!) --- generic/tclBasic.c | 22 +- generic/tclIcu.c | 701 +++++++++++++++++++++++++++++++++++++++++++++++++++++ generic/tclInt.h | 1 + library/icu.tcl | 145 +++++++++++ library/tclIndex | 1 + tests/icu.test | 51 ++++ unix/Makefile.in | 7 +- win/Makefile.in | 1 + win/makefile.vc | 1 + 9 files changed, 921 insertions(+), 9 deletions(-) create mode 100644 generic/tclIcu.c create mode 100644 library/icu.tcl create mode 100644 tests/icu.test diff --git a/generic/tclBasic.c b/generic/tclBasic.c index 507d987..9c35889 100644 --- a/generic/tclBasic.c +++ b/generic/tclBasic.c @@ -352,7 +352,7 @@ static const CmdInfo builtInCmds[] = { {"lmap", Tcl_LmapObjCmd, TclCompileLmapCmd, TclNRLmapCmd, CMD_IS_SAFE}, {"lpop", Tcl_LpopObjCmd, NULL, NULL, CMD_IS_SAFE}, {"lrange", Tcl_LrangeObjCmd, TclCompileLrangeCmd, NULL, CMD_IS_SAFE}, - {"lremove", Tcl_LremoveObjCmd, NULL, NULL, CMD_IS_SAFE}, + {"lremove", Tcl_LremoveObjCmd, NULL, NULL, CMD_IS_SAFE}, {"lrepeat", Tcl_LrepeatObjCmd, NULL, NULL, CMD_IS_SAFE}, {"lreplace", Tcl_LreplaceObjCmd, TclCompileLreplaceCmd, NULL, CMD_IS_SAFE}, {"lreverse", Tcl_LreverseObjCmd, NULL, NULL, CMD_IS_SAFE}, @@ -758,7 +758,7 @@ buildInfoObjCmd( p += len; q = strchr(++p, '.'); if (!q) { - q = p + strlen(p); + q = p + strlen(p); } memcpy(buf, p, q - p); buf[q - p] = '\0'; @@ -1213,6 +1213,10 @@ Tcl_CreateInterp(void) Tcl_CreateObjCommand(interp, "::tcl::unsupported::corotype", CoroTypeObjCmd, NULL, NULL); + /* Load and intialize ICU */ + Tcl_CreateObjCommand(interp, "::tcl::unsupported::loadIcu", + TclLoadIcuObjCmd, NULL, NULL); + /* Export unsupported commands */ nsPtr = Tcl_FindNamespace(interp, "::tcl::unsupported", NULL, 0); if (nsPtr) { @@ -1488,9 +1492,11 @@ TclHideUnsafeCommands( Tcl_Obj *hideName = Tcl_ObjPrintf("tcl:%s:%s", unsafePtr->ensembleNsName, unsafePtr->commandName); +#define INTERIM_HACK_NAME "___tmp" + if (TclRenameCommand(interp, TclGetString(cmdName), - "___tmp") != TCL_OK - || Tcl_HideCommand(interp, "___tmp", + INTERIM_HACK_NAME) != TCL_OK + || Tcl_HideCommand(interp, INTERIM_HACK_NAME, TclGetString(hideName)) != TCL_OK) { Tcl_Panic("problem making '%s %s' safe: %s", unsafePtr->ensembleNsName, unsafePtr->commandName, @@ -2763,7 +2769,7 @@ Tcl_CreateObjCommand( /* If not NULL, gives a function to call when * this command is deleted. */ { - Interp *iPtr = (Interp *) interp; + Interp *iPtr = (Interp *)interp; Namespace *nsPtr; const char *tail; @@ -2772,7 +2778,7 @@ Tcl_CreateObjCommand( * The interpreter is being deleted. Don't create any new commands; * it's not safe to muck with the interpreter anymore. */ - return (Tcl_Command) NULL; + return NULL; } /* @@ -3045,7 +3051,7 @@ TclInvokeObjectCommand( int argc, /* Number of arguments. */ const char **argv) /* Argument strings. */ { - Command *cmdPtr = ( Command *) clientData; + Command *cmdPtr = ( Command *)clientData; Tcl_Obj *objPtr; int i, length, result; Tcl_Obj **objv = (Tcl_Obj **) @@ -9433,7 +9439,7 @@ TclNRTailcallObjCmd( nsObjPtr = Tcl_NewStringObj(nsPtr->fullName, TCL_INDEX_NONE); listPtr = Tcl_NewListObj(objc, objv); - TclListObjSetElement(interp, listPtr, 0, nsObjPtr); + TclListObjSetElement(interp, listPtr, 0, nsObjPtr); iPtr->varFramePtr->tailcallPtr = listPtr; } diff --git a/generic/tclIcu.c b/generic/tclIcu.c new file mode 100644 index 0000000..b6355ee --- /dev/null +++ b/generic/tclIcu.c @@ -0,0 +1,701 @@ +/* + * tclIcu.c -- + * + * tclIcu.c implements various Tcl commands that make use of + * the ICU library if present on the system. + * (Adapted from tkIcu.c) + * + * Copyright © 2021 Jan Nijtmans + * Copyright © 2024 Ashok P. Nadkarni + * + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES. + */ + +#include "tclInt.h" + +/* + * Runtime linking of libicu. + */ +typedef enum UBreakIteratorTypex { + UBRK_CHARACTERX = 0, + UBRK_WORDX = 1 +} UBreakIteratorTypex; + +typedef enum UErrorCodex { + U_AMBIGUOUS_ALIAS_WARNING = -122, + U_ZERO_ERRORZ = 0, /**< No error, no warning. */ +} UErrorCodex; + +#define U_SUCCESS(x) ((x)<=U_ZERO_ERRORZ) +#define U_FAILURE(x) ((x)>U_ZERO_ERRORZ) + +struct UEnumeration; +typedef struct UEnumeration UEnumeration; +struct UCharsetDetector; +typedef struct UCharsetDetector UCharsetDetector; +struct UCharsetMatch; +typedef struct UCharsetMatch UCharsetMatch; + +/* + * Prototypes for ICU functions sorted by category. + */ +typedef void (*fn_u_cleanup)(void); +typedef const char *(*fn_u_errorName)(UErrorCodex); + +typedef uint16_t (*fn_ucnv_countAliases)(const char *, UErrorCodex *); +typedef int32_t (*fn_ucnv_countAvailable)(); +typedef const char *(*fn_ucnv_getAlias)(const char *, uint16_t, UErrorCodex *); +typedef const char *(*fn_ucnv_getAvailableName)(int32_t); + +typedef void *(*fn_ubrk_open)(UBreakIteratorTypex, const char *, + const uint16_t *, int32_t, UErrorCodex *); +typedef void (*fn_ubrk_close)(void *); +typedef int32_t (*fn_ubrk_preceding)(void *, int32_t); +typedef int32_t (*fn_ubrk_following)(void *, int32_t); +typedef int32_t (*fn_ubrk_previous)(void *); +typedef int32_t (*fn_ubrk_next)(void *); +typedef void (*fn_ubrk_setText)(void *, const void *, int32_t, UErrorCodex *); + +typedef UCharsetDetector * (*fn_ucsdet_open)(UErrorCodex *status); +typedef void (*fn_ucsdet_close)(UCharsetDetector *ucsd); +typedef void (*fn_ucsdet_setText)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCodex *status); +typedef const char * (*fn_ucsdet_getName)(const UCharsetMatch *ucsm, UErrorCodex *status); +typedef UEnumeration * (*fn_ucsdet_getAllDetectableCharsets)(UCharsetDetector *ucsd, UErrorCodex *status); +typedef const UCharsetMatch * (*fn_ucsdet_detect)(UCharsetDetector *ucsd, UErrorCodex *status); +typedef const UCharsetMatch ** (*fn_ucsdet_detectAll)(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCodex *status); + +typedef void (*fn_uenum_close)(UEnumeration *); +typedef int32_t (*fn_uenum_count)(UEnumeration *, UErrorCodex *); +typedef const char *(*fn_uenum_next)(UEnumeration *, int32_t *, UErrorCodex *); + +#define FIELD(name) fn_ ## name _ ## name +static struct { + size_t nopen; /* Total number of references to ALL libraries */ + /* + * Depending on platform, ICU symbols may be distributed amongst + * multiple libraries. For current functionality at most 2 needed. + * Order of library loading is not guaranteed. + */ + Tcl_LoadHandle libs[2]; + + FIELD(u_cleanup); + FIELD(u_errorName); + + FIELD(ubrk_open); + FIELD(ubrk_close); + FIELD(ubrk_preceding); + FIELD(ubrk_following); + FIELD(ubrk_previous); + FIELD(ubrk_next); + FIELD(ubrk_setText); + + FIELD(ucnv_countAliases); + FIELD(ucnv_countAvailable); + FIELD(ucnv_getAlias); + FIELD(ucnv_getAvailableName); + + FIELD(ucsdet_close); + FIELD(ucsdet_detect); + FIELD(ucsdet_detectAll); + FIELD(ucsdet_getAllDetectableCharsets); + FIELD(ucsdet_getName); + FIELD(ucsdet_open); + FIELD(ucsdet_setText); + + FIELD(uenum_close); + FIELD(uenum_count); + FIELD(uenum_next); + +} icu_fns = { + 0, {NULL, NULL}, /* Reference count, library handles */ + NULL, NULL, /* u_* */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* ubrk* */ + NULL, NULL, NULL, NULL, /* ucnv_* */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* ucsdet* */ + NULL, NULL, NULL, /* uenum_* */ +}; + +#define u_cleanup icu_fns._u_cleanup +#define u_errorName icu_fns._u_errorName + +#define ubrk_open icu_fns._ubrk_open +#define ubrk_close icu_fns._ubrk_close +#define ubrk_preceding icu_fns._ubrk_preceding +#define ubrk_following icu_fns._ubrk_following +#define ubrk_previous icu_fns._ubrk_previous +#define ubrk_next icu_fns._ubrk_next +#define ubrk_setText icu_fns._ubrk_setText + +#define ucnv_countAliases icu_fns._ucnv_countAliases +#define ucnv_countAvailable icu_fns._ucnv_countAvailable +#define ucnv_getAlias icu_fns._ucnv_getAlias +#define ucnv_getAvailableName icu_fns._ucnv_getAvailableName + +#define ucsdet_close icu_fns._ucsdet_close +#define ucsdet_detect icu_fns._ucsdet_detect +#define ucsdet_detectAll icu_fns._ucsdet_detectAll +#define ucsdet_getAllDetectableCharsets icu_fns._ucsdet_getAllDetectableCharsets +#define ucsdet_getName icu_fns._ucsdet_getName +#define ucsdet_open icu_fns._ucsdet_open +#define ucsdet_setText icu_fns._ucsdet_setText + +#define uenum_next icu_fns._uenum_next +#define uenum_close icu_fns._uenum_close +#define uenum_count icu_fns._uenum_count + + +TCL_DECLARE_MUTEX(icu_mutex); + +static int FunctionNotAvailableError(Tcl_Interp *interp) { + if (interp) { + Tcl_SetObjResult(interp, + Tcl_NewStringObj("ICU function not available", TCL_INDEX_NONE)); + } + return TCL_ERROR; +} + +static int IcuError(Tcl_Interp *interp, const char *message, UErrorCodex code) +{ + if (interp) { + const char *codeMessage = NULL; + if (u_errorName) { + codeMessage = u_errorName(code); + } + Tcl_SetObjResult(interp, + Tcl_ObjPrintf("%s. ICU error (%d): %s", + message, + code, + codeMessage ? codeMessage : "")); + } + return TCL_ERROR; +} + +static int DetectEncoding(Tcl_Interp *interp, Tcl_Obj *objPtr, int all) +{ + Tcl_Size len; + const char *bytes; + const UCharsetMatch *match; + const UCharsetMatch **matches; + int nmatches; + int ret; + + if (ucsdet_open == NULL || ucsdet_setText == NULL || + ucsdet_detect == NULL || ucsdet_detectAll == NULL || + ucsdet_getName == NULL || ucsdet_close == NULL) { + return FunctionNotAvailableError(interp); + } + + bytes = (char *) Tcl_GetBytesFromObj(interp, objPtr, &len); + if (bytes == NULL) { + return TCL_ERROR; + } + UErrorCodex status = U_ZERO_ERRORZ; + + UCharsetDetector* csd = ucsdet_open(&status); + if (U_FAILURE(status)) { + return IcuError(interp, "Could not open charset detector.", status); + } + + ucsdet_setText(csd, bytes, len, &status); + if (U_FAILURE(status)) { + IcuError(interp, "Could not set detection text.", status); + ucsdet_close(csd); + return TCL_ERROR; + } + + if (all) { + matches = ucsdet_detectAll(csd, &nmatches, &status); + } + else { + match = ucsdet_detect(csd, &status); + matches = &match; + nmatches = match ? 1 : 0; + } + + if (U_FAILURE(status) || nmatches == 0) { + ret = IcuError(interp, "Could not detect character set.", status); + } + else { + int i; + Tcl_Obj *resultObj = Tcl_NewListObj(nmatches, NULL); + for (i = 0; i < nmatches; ++i) { + const char *name = ucsdet_getName(matches[i], &status); + if (U_FAILURE(status) || name == NULL) { + name = "unknown"; + status = U_ZERO_ERRORZ; /* Reset on failure */ + } + Tcl_ListObjAppendElement( + NULL, resultObj, Tcl_NewStringObj(name, -1)); + } + Tcl_SetObjResult(interp, resultObj); + ret = TCL_OK; + } + + ucsdet_close(csd); + return ret; +} + +static int DetectableEncodings(Tcl_Interp *interp) +{ + if (ucsdet_open == NULL || ucsdet_getAllDetectableCharsets == NULL || + ucsdet_close == NULL || uenum_next == NULL || uenum_count == NULL || + uenum_close == NULL) { + return FunctionNotAvailableError(interp); + } + UErrorCodex status = U_ZERO_ERRORZ; + + UCharsetDetector* csd = ucsdet_open(&status); + if (U_FAILURE(status)) { + return IcuError(interp, "Could not open charset detector.", status); + } + + int ret; + UEnumeration *enumerator = ucsdet_getAllDetectableCharsets(csd, &status); + if (U_FAILURE(status) || enumerator == NULL) { + IcuError(interp, "Could not get list of detectable encodings.", status); + ret = TCL_ERROR; + } else { + int32_t count; + count = uenum_count(enumerator, &status); + if (U_FAILURE(status)) { + IcuError(interp, "Could not get charset enumerator count.", status); + ret = TCL_ERROR; + } else { + int i; + Tcl_Obj *resultObj = Tcl_NewListObj(0, NULL); + for (i = 0; i < count; ++i) { + const char *name; + int32_t len; + name = uenum_next(enumerator, &len, &status); + if (name == NULL || U_FAILURE(status)) { + name = "unknown"; + len = 7; + status = U_ZERO_ERRORZ; /* Reset on error */ + } + Tcl_ListObjAppendElement( + interp, resultObj, Tcl_NewStringObj(name, len)); + } + Tcl_SetObjResult(interp, resultObj); + ret = TCL_OK; + } + uenum_close(enumerator); + } + + ucsdet_close(csd); + return ret; +} + +/* + *------------------------------------------------------------------------ + * + * EncodingDetectObjCmd -- + * + * Implements the Tcl command EncodingDetect. + * encdetect - returns names of all detectable encodings + * encdetect BYTES ?-all? - return detected encoding(s) + * + * Results: + * TCL_OK - Success. + * TCL_ERROR - Error. + * + * Side effects: + * Interpreter result holds result or error message. + * + *------------------------------------------------------------------------ + */ +static int +IcuDetectObjCmd( + TCL_UNUSED(void *), + Tcl_Interp *interp, + int objc, + Tcl_Obj *const objv[]) +{ + if (objc > 3) { + Tcl_WrongNumArgs(interp, 1 , objv, "?bytes ?-all??"); + return TCL_ERROR; + } + + if (objc == 1) { + return DetectableEncodings(interp); + } + + int all = 0; + if (objc == 3) { + if (strcmp("-all", Tcl_GetString(objv[2]))) { + Tcl_SetObjResult( + interp, + Tcl_ObjPrintf("Invalid option %s, must be \"-all\"", + Tcl_GetString(objv[2]))); + return TCL_ERROR; + } + all = 1; + } + + return DetectEncoding(interp, objv[1], all); +} + +/* + *------------------------------------------------------------------------ + * + * IcuConverterNamesObjCmd -- + * + * Sets interp result to list of available ICU converters. + * + * Results: + * TCL_OK - Success. + * TCL_ERROR - Error. + * + * Side effects: + * Interpreter result holds list of converter names. + * + *------------------------------------------------------------------------ + */ +static int +IcuConverterNamesObjCmd ( + TCL_UNUSED(void *), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + + if (objc != 1) { + Tcl_WrongNumArgs(interp, 1 , objv, ""); + return TCL_ERROR; + } + if (ucnv_countAvailable == NULL || ucnv_getAvailableName == NULL) { + return FunctionNotAvailableError(interp); + } + + int32_t count = ucnv_countAvailable(); + if (count <= 0) { + return TCL_OK; + } + Tcl_Obj *resultObj = Tcl_NewListObj(count, NULL); + int32_t i; + for (i = 0; i < count; ++i) { + const char *name = ucnv_getAvailableName(i); + if (name) { + Tcl_ListObjAppendElement( + NULL, resultObj, Tcl_NewStringObj(name, -1)); + } + } + Tcl_SetObjResult(interp, resultObj); + return TCL_OK; +} + +/* + *------------------------------------------------------------------------ + * + * IcuConverterAliasesObjCmd -- + * + * Sets interp result to list of available ICU converters. + * + * Results: + * TCL_OK - Success. + * TCL_ERROR - Error. + * + * Side effects: + * Interpreter result holds list of converter names. + * + *------------------------------------------------------------------------ + */ +static int +IcuConverterAliasesObjCmd ( + TCL_UNUSED(void *), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + if (objc != 2) { + Tcl_WrongNumArgs(interp, 1 , objv, "convertername"); + return TCL_ERROR; + } + if (ucnv_countAliases == NULL || ucnv_getAlias == NULL) { + return FunctionNotAvailableError(interp); + } + + const char *name = Tcl_GetString(objv[1]); + UErrorCodex status = U_ZERO_ERRORZ; + uint16_t count = ucnv_countAliases(name, &status); + if (status != U_AMBIGUOUS_ALIAS_WARNING && U_FAILURE(status)) { + return IcuError(interp, "Could not get aliases.", status); + } + if (count <= 0) { + return TCL_OK; + } + Tcl_Obj *resultObj = Tcl_NewListObj(count, NULL); + uint16_t i; + for (i = 0; i < count; ++i) { + status = U_ZERO_ERRORZ; /* Reset in case U_AMBIGUOUS_ALIAS_WARNING */ + const char *aliasName = ucnv_getAlias(name, i, &status); + if (status != U_AMBIGUOUS_ALIAS_WARNING && U_FAILURE(status)) { + status = U_ZERO_ERRORZ; /* Reset error for next iteration */ + continue; + } + if (aliasName) { + Tcl_ListObjAppendElement( + NULL, resultObj, Tcl_NewStringObj(aliasName, -1)); + } + } + Tcl_SetObjResult(interp, resultObj); + return TCL_OK; +} + +static void +TclIcuCleanup( + TCL_UNUSED(void *)) +{ + Tcl_MutexLock(&icu_mutex); + if (icu_fns.nopen-- <= 1) { + int i; + if (u_cleanup != NULL) { + u_cleanup(); + } + for (i = 0; i < (int)(sizeof(icu_fns.libs) / sizeof(icu_fns.libs[0])); + ++i) { + if (icu_fns.libs[i] != NULL) { + Tcl_FSUnloadFile(NULL, icu_fns.libs[i]); + } + } + memset(&icu_fns, 0, sizeof(icu_fns)); + } + Tcl_MutexUnlock(&icu_mutex); +} + +static void +TclIcuInit( + Tcl_Interp *interp) +{ + Tcl_MutexLock(&icu_mutex); + char symbol[256]; + char icuversion[4] = "_80"; /* Highest ICU version + 1 */ + + /* + * The initialization below clones the existing one from Tk. May need + * revisiting. + * ICU shared library names as well as function names *may* be versioned. + * See https://unicode-org.github.io/icu/userguide/icu4c/packaging.html + * for the gory details. + */ + if (icu_fns.nopen == 0) { + int i = 0; + Tcl_Obj *nameobj; + static const char *iculibs[] = { +#if defined(_WIN32) +# define DLLNAME "icu%s%s.dll" + "icuuc??.dll", /* Windows, user-provided */ + NULL, + "cygicuuc??.dll", /* When running under Cygwin */ +#elif defined(__CYGWIN__) +# define DLLNAME "cygicu%s%s.dll" + "cygicuuc??.dll", +#elif defined(MAC_OSX_TCL) +# define DLLNAME "libicu%s.%s.dylib" + "libicuuc.??.dylib", +#else +# define DLLNAME "libicu%s.so.%s" + "libicuuc.so.??", +#endif + NULL + }; + + /* Going back down to ICU version 60 */ + while ((icu_fns.libs[0] == NULL) && (icuversion[1] >= '6')) { + if (--icuversion[2] < '0') { + icuversion[1]--; icuversion[2] = '9'; + } +#if defined(__CYGWIN__) + i = 2; +#else + i = 0; +#endif + while (iculibs[i] != NULL) { + Tcl_ResetResult(interp); + nameobj = Tcl_NewStringObj(iculibs[i], TCL_INDEX_NONE); + char *nameStr = Tcl_GetString(nameobj); + char *p = strchr(nameStr, '?'); + if (p != NULL) { + memcpy(p, icuversion+1, 2); + } + Tcl_IncrRefCount(nameobj); + if (Tcl_LoadFile(interp, nameobj, NULL, 0, NULL, &icu_fns.libs[0]) + == TCL_OK) { + if (p == NULL) { + icuversion[0] = '\0'; + } + Tcl_DecrRefCount(nameobj); + break; + } + Tcl_DecrRefCount(nameobj); + ++i; + } + } + if (icu_fns.libs[0] != NULL) { + /* Loaded icuuc, load others with the same version */ + nameobj = Tcl_ObjPrintf(DLLNAME, "i18n", icuversion+1); + Tcl_IncrRefCount(nameobj); + /* Ignore errors. Calls to contained functions will fail. */ + (void) Tcl_LoadFile(interp, nameobj, NULL, 0, NULL, &icu_fns.libs[1]); + Tcl_DecrRefCount(nameobj); + } +#if defined(_WIN32) + /* + * On Windows, if no ICU install found, look for the system's + * (Win10 1703 or later). There are two cases. Newer systems + * have icu.dll containing all functions. Older systems have + * icucc.dll and icuin.dll + */ + if (icu_fns.libs[0] == NULL) { + Tcl_ResetResult(interp); + nameobj = Tcl_NewStringObj("icu.dll", TCL_INDEX_NONE); + Tcl_IncrRefCount(nameobj); + if (Tcl_LoadFile(interp, nameobj, NULL, 0, NULL, &icu_fns.libs[0]) + == TCL_OK) { + /* Reload same for second set of functions. */ + (void) Tcl_LoadFile(interp, nameobj, NULL, 0, NULL, &icu_fns.libs[1]); + /* Functions do NOT have version suffixes */ + icuversion[0] = '\0'; + } + Tcl_DecrRefCount(nameobj); + } + if (icu_fns.libs[0] == NULL) { + /* No icu.dll. Try last fallback */ + Tcl_ResetResult(interp); + nameobj = Tcl_NewStringObj("icuuc.dll", TCL_INDEX_NONE); + Tcl_IncrRefCount(nameobj); + if (Tcl_LoadFile(interp, nameobj, NULL, 0, NULL, &icu_fns.libs[0]) + == TCL_OK) { + Tcl_DecrRefCount(nameobj); + nameobj = Tcl_NewStringObj("icuin.dll", TCL_INDEX_NONE); + Tcl_IncrRefCount(nameobj); + (void) Tcl_LoadFile(interp, nameobj, NULL, 0, NULL, &icu_fns.libs[1]); + /* Functions do NOT have version suffixes */ + icuversion[0] = '\0'; + } + Tcl_DecrRefCount(nameobj); + } +#endif + +#define ICUUC_SYM(name) \ + strcpy(symbol, #name ); \ + strcat(symbol, icuversion); \ + icu_fns._##name = (fn_ ## name) \ + Tcl_FindSymbol(NULL, icu_fns.libs[0], symbol) + if (icu_fns.libs[0] != NULL) { + ICUUC_SYM(u_cleanup); + ICUUC_SYM(u_errorName); + + ICUUC_SYM(ucnv_countAliases); + ICUUC_SYM(ucnv_countAvailable); + ICUUC_SYM(ucnv_getAlias); + ICUUC_SYM(ucnv_getAvailableName); + + ICUUC_SYM(ubrk_open); + ICUUC_SYM(ubrk_close); + ICUUC_SYM(ubrk_preceding); + ICUUC_SYM(ubrk_following); + ICUUC_SYM(ubrk_previous); + ICUUC_SYM(ubrk_next); + ICUUC_SYM(ubrk_setText); + + ICUUC_SYM(uenum_close); + ICUUC_SYM(uenum_count); + ICUUC_SYM(uenum_next); + +#undef ICUUC_SYM + } + +#define ICUIN_SYM(name) \ + strcpy(symbol, #name ); \ + strcat(symbol, icuversion); \ + icu_fns._##name = (fn_ ## name) \ + Tcl_FindSymbol(NULL, icu_fns.libs[1], symbol) + if (icu_fns.libs[1] != NULL) { + ICUIN_SYM(ucsdet_close); + ICUIN_SYM(ucsdet_detect); + ICUIN_SYM(ucsdet_detectAll); + ICUIN_SYM(ucsdet_getName); + ICUIN_SYM(ucsdet_getAllDetectableCharsets); + ICUIN_SYM(ucsdet_open); + ICUIN_SYM(ucsdet_setText); +#undef ICUIN_SYM + } + + } +#undef ICU_SYM + + Tcl_MutexUnlock(&icu_mutex); + + if (icu_fns.libs[0] != NULL) { + /* + * Note refcounts updated BEFORE command definition to protect + * against self redefinition. + */ + if (icu_fns.libs[1] != NULL) { + /* Commands needing both libraries */ + + /* Ref count number of commands */ + icu_fns.nopen += 1; + Tcl_CreateObjCommand(interp, + "::tcl::unsupported::icu::detect", + IcuDetectObjCmd, + 0, + TclIcuCleanup); + } + /* Commands needing only libs[0] (icuuc) */ + + /* Ref count number of commands */ + icu_fns.nopen += 2; + Tcl_CreateObjCommand(interp, + "::tcl::unsupported::icu::converters", + IcuConverterNamesObjCmd, + 0, + TclIcuCleanup); + Tcl_CreateObjCommand(interp, + "::tcl::unsupported::icu::aliases", + IcuConverterAliasesObjCmd, + 0, + TclIcuCleanup); + } +} + +/* + *------------------------------------------------------------------------ + * + * TclLoadIcuObjCmd -- + * + * Loads and initializes ICU + * + * Results: + * TCL_OK - Success. + * TCL_ERROR - Error. + * + * Side effects: + * Interpreter result holds result or error message. + * + *------------------------------------------------------------------------ + */ +int +TclLoadIcuObjCmd ( + TCL_UNUSED(void *), + Tcl_Interp *interp, /* Current interpreter. */ + int objc, /* Number of arguments. */ + Tcl_Obj *const objv[]) /* Argument objects. */ +{ + if (objc != 1) { + Tcl_WrongNumArgs(interp, 1 , objv, ""); + return TCL_ERROR; + } + TclIcuInit(interp); + return TCL_OK; +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 78 + * coding: utf-8 + * End: + */ diff --git a/generic/tclInt.h b/generic/tclInt.h index b34ca35..9e956dc 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3556,6 +3556,7 @@ MODULE_SCOPE int TclDictWithFinish(Tcl_Interp *interp, Var *varPtr, MODULE_SCOPE Tcl_Obj * TclDictWithInit(Tcl_Interp *interp, Tcl_Obj *dictPtr, Tcl_Size pathc, Tcl_Obj *const pathv[]); MODULE_SCOPE Tcl_ObjCmdProc Tcl_DisassembleObjCmd; +MODULE_SCOPE Tcl_ObjCmdProc TclLoadIcuObjCmd; /* Assemble command function */ MODULE_SCOPE Tcl_ObjCmdProc Tcl_AssembleObjCmd; diff --git a/library/icu.tcl b/library/icu.tcl new file mode 100644 index 0000000..827fd04 --- /dev/null +++ b/library/icu.tcl @@ -0,0 +1,145 @@ +#---------------------------------------------------------------------- +# +# icu.tcl -- +# +# This file implements the portions of the [tcl::unsupported::icu] +# ensemble that are coded in Tcl. +# +#---------------------------------------------------------------------- +# +# Copyright © 2024 Ashok P. Nadkarni +# See the file "license.terms" for information on usage and redistribution +# of this file, and for a DISCLAIMER OF ALL WARRANTIES. +# +#---------------------------------------------------------------------- + +::tcl::unsupported::loadIcu + +namespace eval ::tcl::unsupported::icu { + # Map Tcl encoding names to ICU and back. Note ICU has multiple aliases + # for the same encoding. + variable tclToIcu + variable icuToTcl + + proc LogError {message} { + puts stderr $message + } + + proc Init {} { + variable tclToIcu + variable icuToTcl + # There are some special cases where names do not line up + # at all. Map Tcl -> ICU + array set specialCases { + ebcdic ebcdic-cp-us + macCentEuro maccentraleurope + utf16 UTF16_PlatformEndian + utf-16be UnicodeBig + utf-16le UnicodeLittle + utf32 UTF32_PlatformEndian + } + # Ignore all errors. Do not want to hold up Tcl + # if ICU not available + if {[catch { + foreach tclName [encoding names] { + if {[catch { + set icuNames [aliases $tclName] + } erMsg]} { + LogError "Could not get aliases for $tclName: $erMsg" + continue + } + if {[llength $icuNames] == 0} { + # E.g. macGreek -> x-MacGreek + set icuNames [aliases x-$tclName] + if {[llength $icuNames] == 0} { + # Still no joy, check for special cases + if {[info exists specialCases($tclName)]} { + set icuNames [aliases $specialCases($tclName)] + } + } + } + # If the Tcl name is also an ICU name use it else use + # the first name which is the canonical ICU name + set pos [lsearch -exact -nocase $icuNames $tclName] + if {$pos >= 0} { + lappend tclToIcu($tclName) [lindex $icuNames $pos] {*}[lreplace $icuNames $pos $pos] + } else { + set tclToIcu($tclName) $icuNames + } + foreach icuName $icuNames { + lappend icuToTcl($icuName) $tclName + } + } + } errMsg]} { + LogError $errMsg + } + array default set tclToIcu "" + array default set icuToTcl "" + + # Redefine ourselves to no-op. + proc Init {} {} + } + # Primarily used during development + proc MappedIcuNames {{pat *}} { + Init + variable icuToTcl + return [array names icuToTcl $pat] + } + # Primarily used during development + proc UnmappedIcuNames {{pat *}} { + Init + variable icuToTcl + set unmappedNames {} + foreach icuName [converters] { + if {[llength [icuToTcl $icuName]] == 0} { + lappend unmappedNames $icuName + } + foreach alias [aliases $icuName] { + if {[llength [icuToTcl $alias]] == 0} { + lappend unmappedNames $alias + } + } + } + # Aliases can be duplicates. Remove + return [lsort -unique [lsearch -inline -all $unmappedNames $pat]] + } + # Primarily used during development + proc UnmappedTclNames {{pat *}} { + Init + variable tclToIcu + set unmappedNames {} + foreach tclName [encoding names] { + # Note entry will always exist. Check if empty + if {[llength [tclToIcu $tclName]] == 0} { + lappend unmappedNames $tclName + } + } + return [lsearch -inline -all $unmappedNames $pat] + } + + # Returns the Tcl equivalent of an ICU encoding name or + # the empty string in case not found. + proc icuToTcl {icuName} { + Init + proc icuToTcl {icuName} { + variable icuToTcl + return [lindex $icuToTcl($icuName) 0] + } + icuToTcl $icuName + } + + # Returns the ICU equivalent of an Tcl encoding name or + # the empty string in case not found. + proc tclToIcu {tclName} { + Init + proc tclToIcu {tclName} { + variable tclToIcu + return [lindex $tclToIcu($tclName) 0] + } + tclToIcu $tclName + } + + + namespace export {[a-z]*} + namespace ensemble create +} diff --git a/library/tclIndex b/library/tclIndex index 871298f..2b5619f 100644 --- a/library/tclIndex +++ b/library/tclIndex @@ -109,3 +109,4 @@ set auto_index(tcl_endOfWord) [list ::tcl::Pkg::source [file join $dir word.tcl] set auto_index(tcl_startOfNextWord) [list ::tcl::Pkg::source [file join $dir word.tcl]] set auto_index(tcl_startOfPreviousWord) [list ::tcl::Pkg::source [file join $dir word.tcl]] set auto_index(writeFile) [list ::tcl::Pkg::source [file join $dir writefile.tcl]] +set auto_index(::tcl::unsupported::icu) [list ::tcl::Pkg::source [file join $dir icu.tcl]] \ No newline at end of file diff --git a/tests/icu.test b/tests/icu.test new file mode 100644 index 0000000..522ed53 --- /dev/null +++ b/tests/icu.test @@ -0,0 +1,51 @@ +# Tests for tcl::unsupported::icu + +if {"::tcltest" ni [namespace children]} { + package require tcltest + namespace import -force ::tcltest::* +} + +# Force late loading of ICU if present +catch {::tcl::unsupported::icu} +testConstraint icu [expr {[info commands ::tcl::unsupported::icu::detect] ne ""}] + +namespace eval icu { + test icu-detect-0 {Return list of ICU encodings} -constraints icu -body { + set encoders [::tcl::unsupported::icu detect] + list [::tcl::mathop::in UTF-8 $encoders] [::tcl::mathop::in ISO-8859-1 $encoders] + } -result {1 1} + + test icu-detect-1 {Guess encoding} -constraints icu -body { + ::tcl::unsupported::icu detect [readFile [info script]] + } -result ISO-8859-1 + + test icu-detect-2 {Get all possible encodings} -constraints icu -body { + set encodings [::tcl::unsupported::icu detect [readFile [info script]] -all] + list [::tcl::mathop::in UTF-8 $encodings] [::tcl::mathop::in ISO-8859-1 $encodings] + } -result {1 1} + + test icu-tclToIcu-0 {Map Tcl encoding} -constraints icu -body { + # tis-620 because it is ambiguous in ICU on some platforms + # but should return the preferred encoding + list [::tcl::unsupported::icu tclToIcu utf-8] [::tcl::unsupported::icu tclToIcu tis-620] [::tcl::unsupported::icu tclToIcu shiftjis] + } -result {UTF-8 TIS-620 ibm-943_P15A-2003} + + test icu-tclToIcu-1 {Map Tcl encoding - no map} -constraints icu -body { + # Should not raise an error + ::tcl::unsupported::icu tclToIcu dummy + } -result {} + + test icu-icuToTcl-0 {Map ICU encoding} -constraints icu -body { + list [::tcl::unsupported::icu icuToTcl UTF-8] [::tcl::unsupported::icu icuToTcl TIS-620] [::tcl::unsupported::icu icuToTcl ibm-943_P15A-2003] + } -result {utf-8 tis-620 cp932} + + test icu-icuToTcl-1 {Map ICU encoding - no map} -constraints icu -body { + # Should not raise an error + ::tcl::unsupported::icu icuToTcl dummy + } -result {} + +} + + +namespace delete icu +::tcltest::cleanupTests diff --git a/unix/Makefile.in b/unix/Makefile.in index 9610165..71c7c93 100644 --- a/unix/Makefile.in +++ b/unix/Makefile.in @@ -305,7 +305,8 @@ GENERIC_OBJS = regcomp.o regexec.o regfree.o regerror.o tclAlloc.o \ tclCompile.o tclConfig.o tclDate.o tclDictObj.o tclDisassemble.o \ tclEncoding.o tclEnsemble.o \ tclEnv.o tclEvent.o tclExecute.o tclFCmd.o tclFileName.o tclGet.o \ - tclHash.o tclHistory.o tclIndexObj.o tclInterp.o tclIO.o tclIOCmd.o \ + tclHash.o tclHistory.o \ + tclIcu.o tclIndexObj.o tclInterp.o tclIO.o tclIOCmd.o \ tclIORChan.o tclIORTrans.o tclIOGT.o tclIOSock.o tclIOUtil.o \ tclLink.o tclListObj.o \ tclLiteral.o tclLoad.o tclMain.o tclNamesp.o tclNotify.o \ @@ -431,6 +432,7 @@ GENERIC_SRCS = \ $(GENERIC_DIR)/tclGet.c \ $(GENERIC_DIR)/tclHash.c \ $(GENERIC_DIR)/tclHistory.c \ + $(GENERIC_DIR)/tclIcu.c \ $(GENERIC_DIR)/tclIndexObj.c \ $(GENERIC_DIR)/tclInterp.c \ $(GENERIC_DIR)/tclIO.c \ @@ -1371,6 +1373,9 @@ tclHash.o: $(GENERIC_DIR)/tclHash.c tclHistory.o: $(GENERIC_DIR)/tclHistory.c $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/tclHistory.c +tclIcu.o: $(GENERIC_DIR)/tclIcu.c + $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/tclIcu.c + tclIndexObj.o: $(GENERIC_DIR)/tclIndexObj.c $(CC) -c $(CC_SWITCHES) $(GENERIC_DIR)/tclIndexObj.c diff --git a/win/Makefile.in b/win/Makefile.in index 391ec58..69421f0 100644 --- a/win/Makefile.in +++ b/win/Makefile.in @@ -313,6 +313,7 @@ GENERIC_OBJS = \ tclGet.$(OBJEXT) \ tclHash.$(OBJEXT) \ tclHistory.$(OBJEXT) \ + tclIcu.$(OBJEXT) \ tclIndexObj.$(OBJEXT) \ tclInterp.$(OBJEXT) \ tclIO.$(OBJEXT) \ diff --git a/win/makefile.vc b/win/makefile.vc index 74a594c..c02465ff 100644 --- a/win/makefile.vc +++ b/win/makefile.vc @@ -278,6 +278,7 @@ COREOBJS = \ $(TMP_DIR)\tclGet.obj \ $(TMP_DIR)\tclHash.obj \ $(TMP_DIR)\tclHistory.obj \ + $(TMP_DIR)\tclIcu.obj \ $(TMP_DIR)\tclIndexObj.obj \ $(TMP_DIR)\tclInterp.obj \ $(TMP_DIR)\tclIO.obj \ -- cgit v0.12