diff options
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- | Modules/unicodedata.c | 133 |
1 files changed, 131 insertions, 2 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index d42c3b6..269ba57 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1,11 +1,12 @@
 /* ------------------------------------------------------------------------
 
-   unicodedata -- Provides access to the Unicode 3.0 data base.
+   unicodedata -- Provides access to the Unicode 3.2 data base.
 
-   Data was extracted from the Unicode 3.0 UnicodeData.txt file.
+   Data was extracted from the Unicode 3.2 UnicodeData.txt file.
 
    Written by Marc-Andre Lemburg (mal@lemburg.com).
    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
+   Modified by Martin v. Löwis (martin@v.loewis.de)
 
    Copyright (c) Corporation for National Research Initiatives.
 
@@ -276,6 +277,54 @@ _gethash(const char *s, int len, int scale)
     return h;
 }
 
+/* Constants of the arithmetic Hangul syllable algorithm from the Unicode
+   Standard: SBase is the first precomposed syllable; LCount, VCount and
+   TCount are the numbers of leading consonants, vowels and trailing
+   consonants (the latter including "no trailing consonant"). */
+#define SBase   0xAC00
+#define LBase   0x1100
+#define VBase   0x1161
+#define TBase   0x11A7
+#define LCount  19
+#define VCount  21
+#define TCount  28
+#define NCount  (VCount*TCount)
+#define SCount  (LCount*NCount)
+
+/* Short jamo names used to build syllable names.  Row i holds the name of
+   leading consonant i, vowel i and trailing consonant i; a 0 entry means
+   the column has no element with that index (only column 2 is full). */
+static const char *hangul_syllables[][3] = {
+    { "G",  "A",   ""   },
+    { "GG", "AE",  "G"  },
+    { "N",  "YA",  "GG" },
+    { "D",  "YAE", "GS" },
+    { "DD", "EO",  "N"  },
+    { "R",  "E",   "NJ" },
+    { "M",  "YEO", "NH" },
+    { "B",  "YE",  "D"  },
+    { "BB", "O",   "L"  },
+    { "S",  "WA",  "LG" },
+    { "SS", "WAE", "LM" },
+    { "",   "OE",  "LB" },
+    { "J",  "YO",  "LS" },
+    { "JJ", "U",   "LT" },
+    { "C",  "WEO", "LP" },
+    { "K",  "WE",  "LH" },
+    { "T",  "WI",  "M"  },
+    { "P",  "YU",  "B"  },
+    { "H",  "EU",  "BS" },
+    { 0,    "YI",  "S"  },
+    { 0,    "I",   "SS" },
+    { 0,    0,     "NG" },
+    { 0,    0,     "J"  },
+    { 0,    0,     "C"  },
+    { 0,    0,     "K"  },
+    { 0,    0,     "T"  },
+    { 0,    0,     "P"  },
+    { 0,    0,     "H"  }
+};
+
 static int
 _getucname(Py_UCS4 code, char* buffer, int buflen)
 {
@@ -284,6 +333,32 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
     int word;
     unsigned char* w;
 
+    /* Hangul syllables occupy [SBase, SBase+SCount); the upper bound is
+       exclusive, otherwise SIndex == SCount would yield L == LCount and
+       read a null pointer from hangul_syllables. */
+    if (SBase <= code && code < SBase+SCount) {
+        /* Hangul syllable: the name is derived arithmetically from the
+           leading consonant (L), vowel (V) and trailing consonant (T). */
+        int SIndex = code - SBase;
+        int L = SIndex / NCount;
+        int V = (SIndex % NCount) / TCount;
+        int T = SIndex % TCount;
+
+        if (buflen < 27)
+            /* Worst case: HANGUL SYLLABLE <10chars>. */
+            return 0;
+        strcpy(buffer, "HANGUL SYLLABLE ");
+        buffer += 16;
+        strcpy(buffer, hangul_syllables[L][0]);
+        buffer += strlen(hangul_syllables[L][0]);
+        strcpy(buffer, hangul_syllables[V][1]);
+        buffer += strlen(hangul_syllables[V][1]);
+        strcpy(buffer, hangul_syllables[T][2]);
+        buffer += strlen(hangul_syllables[T][2]);
+        *buffer = '\0';
+        return 1;
+    }
+
     if (code >= 0x110000)
         return 0;
 
@@ -343,6 +418,34 @@ _cmpname(int code, const char* name, int namelen)
     return buffer[namelen] == '\0';
 }
 
+/* Find the longest entry in the given column of hangul_syllables (rows
+   0..count-1) that is a prefix of str.
+
+   On return *len is the length of that entry and *pos its row index.  If
+   no entry matches, *len is 0 and *pos is -1.  An empty-string entry
+   counts as a match of length 0. */
+static void
+find_syllable(const char *str, int *len, int *pos, int count, int column)
+{
+    int i, len1;
+    *len = -1;
+    for (i = 0; i < count; i++) {
+        const char *s = hangul_syllables[i][column];
+        len1 = strlen(s);
+        /* Only a strictly longer match can replace the current best. */
+        if (len1 <= *len)
+            continue;
+        if (strncmp(str, s, len1) == 0) {
+            *len = len1;
+            *pos = i;
+        }
+    }
+    if (*len == -1) {
+        *len = 0;
+        *pos = -1;
+    }
+}
+
 static int
 _getcode(const char* name, int namelen, Py_UCS4* code)
 {
@@ -350,6 +453,26 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
     unsigned int mask = code_size-1;
     unsigned int i, incr;
 
+    /* Check for hangul syllables. */
+    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
+        int L, V, T, len;
+        const char *pos = name + 16;
+        /* Greedily strip the leading consonant, vowel and trailing
+           consonant names, in that order. */
+        find_syllable(pos, &len, &L, LCount, 0);
+        pos += len;
+        find_syllable(pos, &len, &V, VCount, 1);
+        pos += len;
+        find_syllable(pos, &len, &T, TCount, 2);
+        pos += len;
+        /* All three parts must be found and the whole name consumed. */
+        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
+            *code = SBase + (L*VCount+V)*TCount + T;
+            return 1;
+        }
+        /* Not a valid syllable name: fall through to the table lookup. */
+    }
+
     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */
@@ -475,3 +598,9 @@ initunicodedata(void)
     if (v != NULL)
         PyModule_AddObject(m, "ucnhash_CAPI", v);
 }
+
+/*
+Local variables:
+c-basic-offset: 4
+End:
+*/