diff options
Diffstat (limited to 'generic/tkTextLineBreak.c')
-rw-r--r-- | generic/tkTextLineBreak.c | 1010 |
1 files changed, 1010 insertions, 0 deletions
diff --git a/generic/tkTextLineBreak.c b/generic/tkTextLineBreak.c new file mode 100644 index 0000000..cc51dd8 --- /dev/null +++ b/generic/tkTextLineBreak.c @@ -0,0 +1,1010 @@ +/* + * tkTextLineBreak.c -- + * + * This module provides line break computation for line wrapping. + * It uses the library "libunibreak" (from Wu Yongwei) for the + * computation, but only if available (currently only UNIX), and if + * the language support is enabled, otherwise our own line break + * algorithm is used (it's a simplified version of the recommendation + * at http://www.unicode.org/reports/tr14/tr14-26.html). + * + * The alternative is the use of ICU library (http://site.icu-project.org/), + * instead of libunibreak, but this would require to support a very + * complex interface of a dynamic load library, with other words, we + * would need dozens of functions pointers. This is not really a drawback, + * and probably the ICU library is the better choice, but I think that a + * change to the ICU library is reasonable only if the Tcl/Tk developer team + * is deciding to use this library also for complete Unicode support (character + * conversion, for instance). + * + * Copyright (c) 2015-2017 Gregor Cramer + * + * See the file "license.terms" for information on usage and redistribution of + * this file, and for a DISCLAIMER OF ALL WARRANTIES. + */ + +#include "tkText.h" + +#include <ctype.h> +#include <assert.h> + +#ifndef MAX +# define MAX(a,b) (((int) a) < ((int) b) ? b : a) +#endif + + +typedef void (*ComputeBreakLocationsFunc)( + const unsigned char *text, size_t len, const char *lang, char *brks); + +static void ComputeBreakLocations( + const unsigned char *text, size_t len, const char *lang, char *brks); + +static ComputeBreakLocationsFunc libLinebreakFunc = ComputeBreakLocations; + +/* + *---------------------------------------------------------------------- + * + * GetLineBreakFunc -- + * + * Return the appropriate line break function. If argument 'lang' + * is NULL, then our own line break alorithm will be used (fast, + * but a bit simple). If 'lang' is not NULL, then this function + * tries to load the library "libunibreak" (currently only UNIX). + * If the load succeeds, then set_linebreaks_utf8 will be returned, + * otherwise ComputeBreakLocations will be returned. + * + * Note that "libunibreak" has language specific support, but + * currently only for zh, ja, and ko. Nethertheless any non-NULL + * value for 'lang' tries to use this library. + * + * Results: + * None. + * + * Side effects: + * The "libunibreak" library may be loaded, if available. + * + *---------------------------------------------------------------------- + */ + +#ifdef __UNIX__ + +static int +LoadFile( + Tcl_Interp *interp, + Tcl_Obj *pathPtr, + Tcl_LoadHandle *handle, + char const **symbols, + void **funcs) +{ + /* Keep backward compatibility to 8.5 */ +# if TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION == 5 + return Tcl_FSLoadFile(interp, pathPtr, symbols[0], symbols[1], + (void *) &funcs[0], (void *) &funcs[1], handle, NULL); +# else + return Tcl_LoadFile(interp, pathPtr, symbols, TCL_LOAD_GLOBAL, funcs, handle); +# endif +} + +static void +LoadLibUnibreak( + Tcl_Interp *interp) +{ + typedef void *VoidP; + typedef void (*InitFunc)(); + + static char const *Symbols[3] = { + "init_linebreak", + "set_linebreaks_utf8", + NULL + }; + + VoidP Funcs[sizeof(Symbols)/sizeof(Symbols[0])]; + Tcl_LoadHandle handle; + Tcl_Obj *pathPtr = Tcl_NewStringObj("libunibreak.so.1", -1); + bool rc; + + Tcl_IncrRefCount(pathPtr); + rc = LoadFile(interp, pathPtr, &handle, Symbols, Funcs); + if (rc != TCL_OK) { + /* + * We couldn't find "libunibreak.so.1", so try the predecessor "liblinebreak.so.2". + */ + + Tcl_ResetResult(interp); + Tcl_DecrRefCount(pathPtr); + Tcl_IncrRefCount(pathPtr = Tcl_NewStringObj("liblinebreak.so.2", -1)); + rc = LoadFile(interp, pathPtr, &handle, Symbols, Funcs); + } + Tcl_DecrRefCount(pathPtr); + if (rc == TCL_OK) { + ((InitFunc) Funcs[0])(); + libLinebreakFunc = Funcs[1]; + } else { + Tcl_ResetResult(interp); + } +} + +#endif /* __UNIX__ */ + +static ComputeBreakLocationsFunc +GetLineBreakFunc( + Tcl_Interp *interp, + char const *lang) +{ +#if TCL_UTF_MAX > 4 /* exclude non-standard encodings */ + /* + * IMPORTANT NOTE: + * + * TCL_UTF_MAX > 4 is a severe violation of the standard. It's not possible + * anymore to use external libraries. + * + * Furthermore it is causing security problems for all systems, even for + * TCL themself. Example: an application is storing a pseudo UTF-8 string + * (generated by a TCL library with TCL_UTF_MAX > 4) into a database. Later + * the reader of the database fails to read/display this string, or is even + * crashing, because the stored string is not UTF-8 conform. With other words: + * the database is damaged. It is possible that with the introduction of + * TCL_UTF_MAX > 4 the TCL library will get the general assesment to be a + * safety-endagering system. + * + * Another fact: the whole UTF-8 string support of the TCL library is not + * reliable anymore. Example: + * + * int ch, isWordchar; + * TkUtfToUniChar(string, &ch); + * isWordchar = Tcl_UniCharIsWordChar(ch); + * + * The result of Tcl_UniCharIsWordChar() might be incorrect, because type + * 'int' (32 bit) will be truncated to type 'Tcl_UniChar' (16 bit). With + * the introduction of TCL_UTF_MAX > 4 the string handling is not reliable + * anymore. + * + * The whole thing with TCL_UTF_MAX > 4 is nothing else than ignorance of + * standards, and the willingness to endager the safety of data and applications. + * In general it's even not possible anymore to work with UTF-8 string in a + * proper way. + */ + + return NULL; +#endif /* TCL_UTF_MAX > 4 */ +#ifdef __UNIX__ + if (lang) { + static bool loaded = false; + + if (!loaded) { + LoadLibUnibreak(interp); + } + } +#endif + return libLinebreakFunc; +} + +/* + *---------------------------------------------------------------------- + * + * TkTextComputeBreakLocations -- + * + * Compute break locations in UTF-8 text. This function expects + * a nul-terminated string (this mean that the character at position + * 'len' must be NUL). Thus it is also required that the break buffer + * 'brks' has at least size 'len+1'. If 'lang' is not NULL, then the + * external library linunibreak will be used for the line break + * computation, but only if this library is loadable, otherwise the + * internal algorithm will be used. + * + * Results: + * The computed break locations. This function returns 'true' if + * the external linebreak library has been used for the computation, + * otherwise 'false' will be returned. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +bool +TkTextComputeBreakLocations( + Tcl_Interp *interp, + const char *text, /* must be nul-terminated */ + unsigned len, /* without trailing nul byte */ + const char *lang, /* can be NULL */ + char *brks) +{ + ComputeBreakLocationsFunc func; + int lastBreakablePos = -1; + unsigned i; + + assert(text); + assert(brks); + assert(text[len] == '\0'); + assert(!lang || (isalpha(lang[0]) && isalpha(lang[1]) && !lang[2])); + + func = GetLineBreakFunc(interp, lang); + + /* + * The algorithm don't give us a break value for the last character if we do + * not include the final nul char into the computation. + */ + + len += 1; + (*func)((const unsigned char *) text, len, lang, brks); + len -= 1; + + for (i = 0; i < len; ++i) { + switch (brks[i]) { + case LINEBREAK_MUSTBREAK: + break; + case LINEBREAK_ALLOWBREAK: + if (text[i] == '-') { + if (brks[i] == LINEBREAK_ALLOWBREAK) { + /* + * Fix the problem with the contextual hyphen-minus sign, the implementation of + * libunibreak has (possibly) forgotten this case. + * + * The HYPHEN-MINUS (U+002D) needs special context treatment. For simplicity we + * will only check whether we have two preceding, and two succeeding letters. + * TODO: Is there a better method for the decision? + */ + + const char *r = text + i; + const char *p, *q, *s; + Tcl_UniChar uc; + bool allow = false; + + q = Tcl_UtfPrev(r, text); + if (q != r) { + Tcl_UtfToUniChar(q, &uc); + if (Tcl_UniCharIsAlpha(uc)) { + p = Tcl_UtfPrev(q, text); + if (p != q) { + Tcl_UtfToUniChar(p, &uc); + if (Tcl_UniCharIsAlpha(uc)) { + s = r + 1; + s += Tcl_UtfToUniChar(s, &uc); + if (Tcl_UniCharIsAlpha(uc)) { + Tcl_UtfToUniChar(s, &uc); + if (Tcl_UniCharIsAlpha(uc)) { + allow = true; + } + } + } + } + } + } + + if (!allow) { + brks[i] = LINEBREAK_NOBREAK; + } + } + } else if (text[i] == '/' && i > 8) { + /* + * Ignore the breaking chance if there is a chance immediately before: + * no break inside "c/o", and no break after "http://" in a long line + * (a suggestion from Wu Yongwei). + */ + + if (lastBreakablePos >= (int) i - 2 + || (i > 40u && lastBreakablePos >= (int) i - 7 && text[i - 1] == '/')) { + continue; + } + + /* + * Special rule to treat Unix paths more nicely (a suggestion from Wu Yongwei). + */ + + if (i < len - 1 && text[i + 1] != ' ' && text[i - 1] == ' ') { + lastBreakablePos = i - 1; + continue; + } + } + lastBreakablePos = i; + break; + case LINEBREAK_INSIDEACHAR: + break; + } + } + + return func != ComputeBreakLocations; +} + +/* + * The following is implementing the recommendations at + * http://www.unicode.org/reports/tr14/tr14-26.html, but simplified - + * no language specific support, not all the rules (especially no + * combining marks), and mostly restricted to Latin-1 and relevant + * letters not belonging to specific languages. For a more sophisticated + * line break algorithm the library "libunibreak" (from Wu Yongwei) + * should be used. + */ + +typedef enum { + /* Note that CR, LF, and NL will be interpreted as BK, so only BK is used. */ + AI, AL, B2, BA, BB, BK, CL, CP, EX, GL, HY, IN, IS, NS, NU, OP, PO, PR, QU, SP, SY, WJ, ZW +} LBClass; + +#define __ AI + +/* + * Changes in table below (different from Unicode recommendation): + * + * 0a: CB -> BK (LINE FEED) + * 0d: CR -> BK (CARRIAGE RETURN) + * 0e: XX -> BK (SHIFT OUT) + * 23: AL -> IN (NUMBER SIGN) + * 26: AL -> BB (AMPERSAND) + * 3d: AL -> GL (EQUALS SIGN) + * 60: CM -> AL (GRAVE ACCENT) + */ + +static const char Table_0000[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, BA, BK, BK, BK, BK, BK, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ SP, EX, QU, IN, PR, PO, BB, QU, OP, CP, AL, PR, IS, HY, IS, SY, /* 20 - 2f */ +/* 3 */ NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, IS, IS, AL, GL, AL, EX, /* 30 - 3f */ +/* 4 */ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, /* 40 - 4f */ +/* 5 */ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, PR, CP, AL, AL, /* 50 - 5f */ +/* 6 */ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, /* 60 - 6f */ +/* 7 */ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, BA, CL, AL, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ GL, OP, PO, PR, PR, PR, AL, AL, AL, AL, __, QU, __, __, AL, AL, /* a0 - af */ +/* b */ PO, PR, AL, AL, BB, __, AL, AL, AL, AL, __, __, AL, AL, AL, OP, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +/* + * Changes in table below (different from Unicode recommendation): + * + * e2 80 89: BA -> WJ (THIN SPACE) + * e2 80 0a: BA -> WJ (HAIR SPACE) + */ + +static const char Table_E280[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ BA, BA, BA, BA, BA, BA, BA, GL, BA, __, __, ZW, __, __, __, __, /* 80 - 8f */ +/* 9 */ BA, AL, BA, BA, B2, AL, AL, AL, QU, QU, OP, QU, QU, QU, OP, QU, /* 90 - 9f */ +/* a */ AL, AL, AL, AL, IN, IN, IN, BA, BK, BK, __, __, __, __, __, GL, /* a0 - af */ +/* b */ PO, PO, PO, PO, PO, PO, PO, PO, AL, QU, QU, AL, NS, NS, AL, AL, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E281[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ AL, AL, AL, AL, IS, OP, CL, NS, NS, NS, AL, AL, AL, AL, AL, AL, /* 80 - 8f */ +/* 9 */ AL, AL, __, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, __, /* 90 - 9f */ +/* a */ WJ, AL, AL, AL, AL, __, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, __, OP, CL, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E282[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, __, __, __, __, __, CL, CL, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ PR, PR, PR, PR, PR, PR, PR, PO, PR, PR, PR, PR, PR, PR, PR, PR, /* a0 - af */ +/* b */ PR, PR, PR, PR, PR, PR, PR, PR, PR, PR, PR, PR, PR, PR, PR, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E28C[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, OP, CL, OP, CL, __, __, __, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, __, __, __, OP, CL, __, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E29D[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, __, __, OP, CL, OP, CL, OP, CL, OP, CL, /* a0 - af */ +/* b */ OP, CL, OP, CL, OP, CL, __, __, __, __, __, __, __, __, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E29F[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, OP, CL, __, __, __, __, __, __, __, __, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, OP, CL, OP, CL, OP, CL, OP, CL, OP, CL, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E2A6[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, OP, CL, OP, CL, OP, CL, OP, CL, OP, CL, OP, CL, OP, /* 80 - 8f */ +/* 9 */ CL, OP, CL, OP, CL, OP, CL, OP, CL, __, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E2A7[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, OP, CL, OP, CL, __, __, __, __, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, OP, CL, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E2B8[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ AL, AL, QU, QU, QU, QU, AL, AL, AL, QU, QU, AL, QU, QU, AL, AL, /* 80 - 8f */ +/* 9 */ AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, AL, AL, QU, QU, AL, AL, /* 90 - 9f */ +/* a */ QU, QU, OP, CL, OP, CL, OP, CL, OP, CL, AL, AL, AL, AL, AL, __, /* a0 - af */ +/* b */ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, B2, B2, AL, AL, AL, AL, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_E380[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, CL, CL, AL, __, NS, __, __, OP, CL, OP, CL, OP, CL, OP, CL, /* 80 - 8f */ +/* 9 */ OP, CL, __, __, OP, CL, OP, CL, OP, CL, OP, CL, NS, OP, CL, CL, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ AL, __, __, __, __, __, __, __, __, __, __, NS, NS, AL, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_EFB8[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 80 - 8f */ +/* 9 */ IS, CL, CL, IS, IS, AL, AL, OP, CL, IN, __, __, __, __, __, __, /* 90 - 9f */ +/* a */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ AL, AL, AL, AL, AL, OP, CL, OP, CL, OP, CL, OP, CL, OP, CL, OP, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_EFB9[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ CL, OP, CL, OP, CL, AL, AL, OP, CL, AL, AL, AL, AL, AL, AL, AL, /* 80 - 8f */ +/* 9 */ CL, CL, CL, __, NS, NS, AL, AL, B2, OP, CL, OP, CL, OP, CL, AL, /* 90 - 9f */ +/* a */ AL, AL, __, B2, __, __, __, __, AL, PR, PO, AL, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_EFBC[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, AL, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, EX, AL, AL, PR, PO, AL, AL, OP, CL, AL, __, CL, B2, CL, AL, /* 80 - 8f */ +/* 9 */ NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NS, NS, __, __, __, EX, /* 90 - 9f */ +/* a */ AL, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, OP, AL, CL, __, __, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +static const char Table_EFBD[256] = { +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +/* 0 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 00 - 0f */ +/* 1 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 10 - 1f */ +/* 2 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 20 - 2f */ +/* 3 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 30 - 3f */ +/* 4 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 40 - 4f */ +/* 5 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 50 - 5f */ +/* 6 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 60 - 6f */ +/* 7 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 70 - 7f */ +/* 8 */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* 80 - 8f */ +/* 9 */ __, __, __, __, __, __, __, __, __, __, __, OP, __, CL, __, OP, /* 90 - 9f */ +/* a */ CL, CL, OP, CL, CL, AL, __, __, __, __, __, __, __, __, __, __, /* a0 - af */ +/* b */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, WJ, /* b0 - bf */ +/* c */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* c0 - cf */ +/* d */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* d0 - df */ +/* e */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* e0 - ef */ +/* f */ __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, /* f0 - ff */ +/* 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f */ +}; + +#undef __ + +#define PROHIBITED LINEBREAK_NOBREAK +#define DIRECT LINEBREAK_ALLOWBREAK +#define INDIRECT ((char) (~LINEBREAK_NOBREAK & ~LINEBREAK_ALLOWBREAK & 0x7f)) + +#define X PROHIBITED /* B ^ A === B SP* × A */ +#define i INDIRECT /* B % A === B × A and B SP+ ÷ A */ +#define _ DIRECT /* B ÷ A */ + +/* Note that BK, SP will no be used for lookup. */ +static const char BrkPairTable[23][23] = { +/* AI AL B2 BA BB BK CL CP EX GL HY IN IS NS NU OP PO PR QU SP SY WJ ZW */ +/* AI */ { X, X, _, i, _, _, X, X, X, i, i, i, X, i, i, i, _, _, i, _, X, X, X }, /* AI */ +/* AL */ { i, i, _, i, _, _, X, X, X, i, i, i, X, i, i, i, _, _, i, _, X, X, X }, /* AL */ +/* B2 */ { _, _, _, i, _, _, X, X, X, i, i, _, X, i, _, _, _, _, i, _, X, X, X }, /* B2 */ +/* BA */ { _, _, _, i, _, _, X, X, X, i, i, _, X, i, _, _, _, _, i, _, X, X, X }, /* BA */ +/* BB */ { i, i, i, i, i, _, X, X, X, _, i, i, X, i, i, i, i, i, i, _, X, X, X }, /* BB */ +/* BK */ { _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ }, /* BK */ +/* CL */ { _, _, _, i, _, _, X, X, X, i, i, _, X, X, _, _, i, i, i, _, X, X, X }, /* CL */ +/* CP */ { i, i, _, i, _, _, X, X, X, i, i, _, X, X, i, _, i, i, i, _, X, X, X }, /* CP */ +/* EX */ { _, _, _, i, _, _, X, X, X, i, i, _, X, i, _, _, _, _, i, _, X, X, X }, /* EX */ +/* GL */ { i, i, i, i, i, _, X, X, X, i, i, i, X, i, i, i, i, i, i, _, X, X, X }, /* GL */ +/* HY */ { _, _, _, i, _, _, X, X, X, _, i, _, X, i, i, _, _, _, i, _, X, X, X }, /* HY */ +/* IN */ { _, _, _, i, _, _, X, X, X, i, i, i, X, i, _, _, _, _, i, _, X, X, X }, /* IN */ +/* IS */ { i, i, _, i, _, _, X, X, X, i, i, _, X, i, i, _, _, _, i, _, X, X, X }, /* IS */ +/* NS */ { _, _, _, i, _, _, X, X, X, i, i, _, X, i, _, _, _, _, i, _, X, X, X }, /* NS */ +/* NU */ { i, i, _, i, _, _, X, X, X, i, i, i, X, i, i, i, i, i, i, _, X, X, X }, /* NU */ +/* OP */ { X, X, X, X, X, _, X, X, X, X, X, X, X, X, X, X, X, X, X, _, X, X, X }, /* OP */ +/* PO */ { i, i, _, i, _, _, X, X, X, i, i, _, X, i, i, i, _, _, i, _, X, X, X }, /* PO */ +/* PR */ { _, i, _, i, _, _, X, X, X, i, i, _, X, i, i, i, _, _, i, _, X, X, X }, /* PR */ +/* QU */ { i, i, i, i, i, _, X, X, X, i, i, i, X, i, i, X, i, i, i, _, X, X, X }, /* QU */ +/* SP */ { _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ }, /* SP */ +/* SY */ { _, _, _, i, _, _, X, X, X, i, i, _, X, i, i, _, _, _, i, _, X, X, X }, /* SY */ +/* WJ */ { i, i, i, i, i, _, X, X, X, i, i, i, X, i, i, i, i, i, i, _, X, X, X }, /* WJ */ +/* ZW */ { _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, X }, /* ZW */ +/* AI AL B2 BA BB BK CL CP EX GL HY IN IS NS NU OP PO PR QU SP SY WJ ZW */ +}; + +#undef _ +#undef i +#undef X + +/* + *---------------------------------------------------------------------- + * + * ComputeBreakLocations -- + * + * Compute break locations in UTF-8 text. This function is doing + * the same as set_linebreaks_utf8 (from "libunibreak"), but this + * function is using a simplified line break algorithm, although + * it is following the recommendations at + * http://www.unicode.org/reports/tr14/tr14-26.html. + * + * Note that this functions expects that the whole line will be + * parsed at once. This interface corresponds to the interface + * of the linebreak library. Of course, such a design is a bit + * unluckily. + * + * Results: + * The computed break locations, in 'brks'. This array must be as + * large as the input length (specified by 'len'). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static void +ComputeBreakLocations( + const unsigned char *text, + size_t len, + const char *lang, + char *brks) +{ + size_t i; + size_t nbytes; + size_t nletters; + size_t brkIndex; + LBClass cls; + LBClass prevCls; + + if (len == 0) { + return; + } + + i = 0; + nletters = 0; + brkIndex = 0; + cls = BK; + prevCls = WJ; + brks[len - 1] = LINEBREAK_MUSTBREAK; + + while (i < len) { + unsigned char ch; + LBClass pcls; + + ch = text[i]; + + if (ch < 0x80) { + pcls = Table_0000[ch]; + nbytes = 1; + } else if ((ch & 0xe0) == 0xc0) { + pcls = AI; + switch (ch) { + case 0xc2: + switch (UCHAR(text[i + 1])) { + case 0x85: pcls = BK; break; /* NL */ + case 0xac: pcls = AL; break; + case 0xad: pcls = BA; break; + case 0xb1: pcls = AL; break; + case 0xbb: pcls = QU; break; + } + break; + case 0xc3: + case 0xc4: + case 0xc5: + case 0xc6: + case 0xc7: + case 0xc8: + case 0xc9: + ch = text[i + 1]; + if (0x80 <= ch && ch <= 0xbf) { + pcls = AL; + } + break; + case 0xca: + ch = text[i + 1]; + if (0x80 <= ch && ch <= 0xaf) { + pcls = AL; + } + break; + case 0xcb: + switch (UCHAR(text[i + 1])) { + case 0x88: /* fallthru */ + case 0x8c: /* fallthru */ + case 0x9f: pcls = BB; break; + } + break; + case 0xcd: + if (UCHAR(text[i + 1]) == 0x8f) { + pcls = GL; + } + break; + case 0xd7: + if (UCHAR(text[i + 1]) == 0x86) { + pcls = EX; + } + break; + case 0xdf: + if (UCHAR(text[i + 1]) == 0xb8) { + pcls = IS; + } + break; + } + nbytes = 2; + brks[i] = LINEBREAK_INSIDEACHAR; + } else if ((ch & 0xf0) == 0xe0) { + pcls = AI; + switch (ch) { + case 0xe2: + switch (UCHAR(text[i + 1])) { + case 0x80: pcls = Table_E280[UCHAR(text[i + 2])]; break; + case 0x81: pcls = Table_E281[UCHAR(text[i + 2])]; break; + case 0x82: pcls = Table_E282[UCHAR(text[i + 2])]; break; + case 0x8c: pcls = Table_E28C[UCHAR(text[i + 2])]; break; + case 0x9d: pcls = Table_E29D[UCHAR(text[i + 2])]; break; + case 0x9f: pcls = Table_E29F[UCHAR(text[i + 2])]; break; + case 0xa6: pcls = Table_E2A6[UCHAR(text[i + 2])]; break; + case 0xa7: pcls = Table_E2A7[UCHAR(text[i + 2])]; break; + case 0xb8: pcls = Table_E2B8[UCHAR(text[i + 2])]; break; + case 0x84: + switch (UCHAR(text[i + 2])) { + case 0x83: /* fallthru */ + case 0x89: pcls = PO; break; + case 0x96: pcls = PR; break; + } + break; + case 0x88: + switch (UCHAR(text[i + 2])) { + case 0x92: /* fallthru */ + case 0x93: pcls = PR; break; + } + break; + case 0xb9: + switch (UCHAR(text[i + 2])) { + case 0x80: pcls = B2; break; + case 0x81: pcls = AL; break; + case 0x82: pcls = OP; break; + } + break; + } + break; + case 0xe3: + if (UCHAR(text[i + 1]) == 0x80) { + pcls = Table_E380[UCHAR(text[i + 2])]; + } + break; + case 0xef: + switch (UCHAR(text[i + 1])) { + case 0xb8: pcls = Table_EFB8[UCHAR(text[i + 2])]; break; + case 0xb9: pcls = Table_EFB9[UCHAR(text[i + 2])]; break; + case 0xbc: pcls = Table_EFBC[UCHAR(text[i + 2])]; break; + case 0xbd: pcls = Table_EFBD[UCHAR(text[i + 2])]; break; + case 0xb4: + switch (UCHAR(text[i + 2])) { + case 0xbe: pcls = CL; break; + case 0xbf: pcls = OP; break; + } + break; + case 0xbb: + if (UCHAR(text[i + 2]) == 0xbf) { + pcls = WJ; /* ZWNBSP (deprecated word joiner) */ + } + break; + case 0xbf: + switch (UCHAR(text[i + 2])) { + case 0xa0: pcls = PO; break; + case 0xa1: /* fallthru */ + case 0xa5: /* fallthru */ + case 0xa6: pcls = PR; break; + } + break; + } + break; + } + nbytes = 3; + brks[i + 0] = LINEBREAK_INSIDEACHAR; + brks[i + 1] = LINEBREAK_INSIDEACHAR; + } else if ((ch & 0xf8) == 0xf0) { + pcls = AI; + nbytes = 4; + brks[i + 0] = LINEBREAK_INSIDEACHAR; + brks[i + 1] = LINEBREAK_INSIDEACHAR; + brks[i + 2] = LINEBREAK_INSIDEACHAR; + } else { +#if TCL_UTF_MAX > 4 + /* + * NOTE: For any reason newer TCL versions will allow > 4 bytes. I cannot + * understand this decision, this is not conform to UTF-8 standard. + * Moreover this decision is introducing severe compatibility problems. + * BTW: TCL_UTF_MAX>4 nothing else than a bad hack. If TCL want's to support + * full Unicode range, a proper implementation is required. + * This corrupted encoding will use the fallback handling. + */ +#endif /* TCL_UTF_MAX > 4 */ + /* + * This fallback is required, because ths current character conversion + * algorithm in Tcl library is producing overlong sequences (a violation + * of the UTF-8 standard). This observation has been reported to the + * Tcl/Tk team, but the response was ignorance. + */ + + unsigned k; + const char *p = (const char *) text + i; + + pcls = AI; + nbytes = Tcl_UtfNext(p) - p; + for (k = 0; k < nbytes; ++k) { + brks[i + k] = LINEBREAK_INSIDEACHAR; + } + } + + if (i == 0) { + if ((cls = pcls) == SP) { + /* treat SP at start of input as if it followed a WJ */ + prevCls = cls = WJ; + } + } else { + switch (pcls) { + case BK: + brks[i - 1] = LINEBREAK_NOBREAK; + brks[i] = LINEBREAK_MUSTBREAK; + prevCls = WJ; + return; + case SP: + /* handle spaces explicitly; do not update cls */ + if (i > 0) { + brks[i - 1] = LINEBREAK_NOBREAK; + prevCls = SP; + } else { + prevCls = WJ; + } + nletters = 0; + break; + case HY: { + char brk = BrkPairTable[cls][HY]; + + /* + * The HYPHEN-MINUS (U+002D) needs special context treatment. For simplicity we + * will only check whether we have two preceding, and two succeeding letters. + * TODO: Is there a better method for the decision? + */ + + brks[i - 1] = LINEBREAK_NOBREAK; + cls = pcls; + + if (brk == INDIRECT) { + prevCls = pcls; + } else { + prevCls = WJ; + + if (brk == LINEBREAK_ALLOWBREAK && nletters >= 2) { + brkIndex = i - 1; + } + } + nletters = 0; + break; + } + default: { + char brk = BrkPairTable[cls][pcls]; + + if (brk == INDIRECT) { + brk = (prevCls == SP) ? LINEBREAK_ALLOWBREAK : LINEBREAK_NOBREAK; + prevCls = pcls; + } else { + prevCls = WJ; + } + brks[i - 1] = brk; + cls = pcls; + + if (pcls == AL) { + nletters += 1; + + if (brkIndex && nletters >= 2) { + brks[brkIndex] = LINEBREAK_ALLOWBREAK; + brkIndex = 0; + } + } else { + nletters = 0; + } + break; + } + } + } + + i += nbytes; + } +} + +/* + * Local Variables: + * mode: c + * c-basic-offset: 4 + * fill-column: 105 + * End: + * vi:set ts=8 sw=4: + */ |