From 677bde2dd14ac2c8f170779adcc732f991db8bd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?=
Date: Sat, 23 Nov 2002 22:08:15 +0000
Subject: Patch #626485: Support Unicode normalization.

---
 Doc/lib/libunicodedata.tex       |  40 ++-
 Lib/test/test_normalization.py   |  68 +++++
 Misc/NEWS                        |   4 +-
 Modules/unicodedata.c            | 294 +++++++++++++++++++-
 Modules/unicodedata_db.h         | 577 +++++++++++++++++++++++++++++++++++++++
 Tools/unicode/makeunicodedata.py |  93 ++++++-
 6 files changed, 1053 insertions(+), 23 deletions(-)
 create mode 100644 Lib/test/test_normalization.py

diff --git a/Doc/lib/libunicodedata.tex b/Doc/lib/libunicodedata.tex
index 5096652..add00c9 100644
--- a/Doc/lib/libunicodedata.tex
+++ b/Doc/lib/libunicodedata.tex
@@ -5,7 +5,7 @@
 \modulesynopsis{Access the Unicode Database.}
 \moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
 \sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
-
+\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
 
 \index{Unicode}
 \index{character}
@@ -14,10 +14,10 @@
 This module provides access to the Unicode Character Database which
 defines character properties for all Unicode characters. The data in
 this database is based on the \file{UnicodeData.txt} file version
-3.0.0 which is publically available from \url{ftp://ftp.unicode.org/}.
+3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
 
 The module uses the same names and symbols as defined by the
-UnicodeData File Format 3.0.0 (see
+UnicodeData File Format 3.2.0 (see
 \url{http://www.unicode.org/Public/UNIDATA/UnicodeData.html}).  It
 defines the following functions:
 
@@ -83,3 +83,37 @@ defines the following functions:
 character \var{unichr} as string. An empty string is returned in case
 no such mapping is defined.
 \end{funcdesc}
+
+\begin{funcdesc}{normalize}{form, unistr}
+
+Return the normal form \var{form} for the Unicode string \var{unistr}.
+Valid values for \var{form} are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+
+The Unicode standard defines various normalization forms of a Unicode
+string, based on the definition of canonical equivalence and
+compatibility equivalence. In Unicode, several characters can be
+expressed in various ways. For example, the character U+00C7 (LATIN
+CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
+U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
+
+For each character, there are two normal forms: normal form C and
+normal form D. Normal form D (NFD) is also known as canonical
+decomposition, and translates each character into its decomposed form.
+Normal form C (NFC) first applies a canonical decomposition, then
+composes pre-combined characters again.
+
+In addition to these two forms, there are two additional normal forms
+based on compatibility equivalence. In Unicode, certain characters are
+supported which normally would be unified with other characters. For
+example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
+(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
+compatibility with existing character sets (e.g. gb2312).
+
+The normal form KD (NFKD) will apply the compatibility decomposition,
+i.e. replace all compatibility characters with their equivalents. The
+normal form KC (NFKC) first applies the compatibility decomposition,
+followed by the canonical composition.
+
+\versionadded{2.3}
+\end{funcdesc}
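For concreteness, the documented behavior can be checked interactively; a
minimal sketch, assuming an interpreter built with this patch (Python 2.3's
unicodedata):

    import unicodedata

    # U+00C7 decomposes into U+0043 U+0327 under NFD, and composes back:
    assert unicodedata.normalize("NFD", u"\u00c7") == u"\u0043\u0327"
    assert unicodedata.normalize("NFC", u"\u0043\u0327") == u"\u00c7"
    # Compatibility characters fold only under the K forms:
    assert unicodedata.normalize("NFC", u"\u2160") == u"\u2160"
    assert unicodedata.normalize("NFKC", u"\u2160") == u"I"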
diff --git a/Lib/test/test_normalization.py b/Lib/test/test_normalization.py
new file mode 100644
index 0000000..a263fc5
--- /dev/null
+++ b/Lib/test/test_normalization.py
@@ -0,0 +1,68 @@
+from test.test_support import verbose, TestFailed, TestSkipped, verify
+import sys
+from unicodedata import normalize
+
+try:
+    data = open("NormalizationTest.txt","r").readlines()
+except IOError:
+    raise TestSkipped("NormalizationTest.txt not found, download from http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")
+
+class RangeError:
+    pass
+
+def NFC(str):
+    return normalize("NFC", str)
+
+def NFKC(str):
+    return normalize("NFKC", str)
+
+def NFD(str):
+    return normalize("NFD", str)
+
+def NFKD(str):
+    return normalize("NFKD", str)
+
+def unistr(data):
+    data = [int(x, 16) for x in data.split(" ")]
+    for x in data:
+        if x > sys.maxunicode:
+            raise RangeError
+    return u"".join([unichr(x) for x in data])
+
+part1_data = {}
+for line in data:
+    if '#' in line:
+        line = line.split('#')[0]
+    line = line.strip()
+    if not line:
+        continue
+    if line.startswith("@Part"):
+        part = line
+        continue
+    try:
+        c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
+    except RangeError:
+        # Skip unsupported characters
+        continue
+
+    if verbose:
+        print line
+
+    # Perform tests
+    verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
+    verify(c4 ==  NFC(c4) ==  NFC(c5), line)
+    verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
+    verify(c5 ==  NFD(c4) ==  NFD(c5), line)
+    verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), line)
+    verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), line)
+
+    # Record part 1 data
+    if part == "@Part1":
+        part1_data[c1] = 1
+
+# Perform tests for all other data
+for c in range(sys.maxunicode+1):
+    X = unichr(c)
+    if X in part1_data:
+        continue
+    assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c

diff --git a/Misc/NEWS b/Misc/NEWS
index 49977dc..d1fa3f9 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -317,8 +317,8 @@ Extension modules
    available in source code, but not built automatically anymore, and
    is now named bsddb185.
 
--  unicodedata was updated to Unicode 3.2. In now also supports names
-   for Hangul syllables and CJK unified ideographs.
+-  unicodedata was updated to Unicode 3.2. It supports normalization
+   and names for Hangul syllables and CJK unified ideographs.
 
 -  resource.getrlimit() now returns longs instead of ints.
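Each data line in NormalizationTest.txt carries five semicolon-terminated
columns: the source string followed by its NFC, NFD, NFKC, and NFKD forms.
The five verify() chains above encode exactly the conformance invariants the
Unicode standard states for those columns. A representative line for U+00C7,
reconstructed here for illustration (consult the real file for exact entries),
parsed the way the test does it:

    line = "00C7;00C7;0043 0327;00C7;0043 0327;"
    c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
    # c2 == NFC(c1), c3 == NFD(c1), c4 == NFKC(c1), c5 == NFKD(c1)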
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 3620936..502b5bd 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -30,13 +30,9 @@ typedef struct {
 #include "unicodedata_db.h"
 
 static const _PyUnicode_DatabaseRecord*
-_getrecord(PyUnicodeObject* v)
+_getrecord_ex(Py_UCS4 code)
 {
-    int code;
     int index;
-
-    code = (int) *PyUnicode_AS_UNICODE(v);
-
     if (code < 0 || code >= 0x110000)
         index = 0;
     else {
@@ -47,6 +43,12 @@ _getrecord(PyUnicodeObject* v)
     return &_PyUnicode_Database_Records[index];
 }
 
+static const _PyUnicode_DatabaseRecord*
+_getrecord(PyUnicodeObject* v)
+{
+    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
+}
+
 /* --- Module API --------------------------------------------------------- */
 
 static PyObject *
@@ -253,6 +255,276 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
     return PyString_FromString(decomp);
 }
 
+void
+get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
+{
+    if (code < 0 || code >= 0x110000) {
+        *index = 0;
+    }
+    else {
+        *index = decomp_index1[(code>>DECOMP_SHIFT)];
+        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
+                               (code&((1<<DECOMP_SHIFT)-1))];
+    }
+
+    /* High byte is the length of the decomposition,
+       low byte is the prefix code. */
+    *count = decomp_data[*index] >> 8;
+    *prefix = decomp_data[*index] & 255;
+
+    (*index)++;
+}
+
+#define SBase   0xAC00
+#define LBase   0x1100
+#define VBase   0x1161
+#define TBase   0x11A7
+#define LCount  19
+#define VCount  21
+#define TCount  28
+#define NCount  (VCount*TCount)
+#define SCount  (LCount*NCount)
+
+static PyObject*
+nfd_nfkd(PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *end, *o;
+    /* Longest decomposition in Unicode 3.2: U+FDFA */
+    Py_UNICODE stack[20];
+    int space, stackptr, isize;
+    int index, prefix, count;
+    unsigned char prev, cur;
+
+    stackptr = 0;
+    isize = PyUnicode_GET_SIZE(input);
+    /* Overallocate by at most 10 characters. */
+    space = (isize > 10 ? 10 : isize) + isize;
+    result = PyUnicode_FromUnicode(NULL, space);
+    if (!result)
+        return NULL;
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + isize;
+    o = PyUnicode_AS_UNICODE(result);
+
+    while (i < end) {
+        stack[stackptr++] = *i++;
+        while(stackptr) {
+            Py_UNICODE code = stack[--stackptr];
+            if (!space) {
+                space = PyUnicode_GET_SIZE(result) + 10;
+                if (PyUnicode_Resize(&result, space) == -1)
+                    return NULL;
+                o = PyUnicode_AS_UNICODE(result) + space - 10;
+                space = 10;
+            }
+            /* Hangul Decomposition. */
+            if (SBase <= code && code < (SBase+SCount)) {
+                int SIndex = code - SBase;
+                int L = LBase + SIndex / NCount;
+                int V = VBase + (SIndex % NCount) / TCount;
+                int T = TBase + SIndex % TCount;
+                *o++ = L;
+                *o++ = V;
+                space -= 2;
+                if (T != TBase) {
+                    *o++ = T;
+                    space--;
+                }
+                continue;
+            }
+            /* Other decompositions. */
+            get_decomp_record(code, &index, &prefix, &count);
+
+            /* Copy character if it is not decomposable, or has a
+               compatibility decomposition, but we do NFD. */
+            if (!count || (prefix && !k)) {
+                *o++ = code;
+                space--;
+                continue;
+            }
+            /* Copy decomposition onto the stack, in reverse
+               order. */
+            while(count) {
+                code = decomp_data[index + (--count)];
+                stack[stackptr++] = code;
+            }
+        }
+    }
+
+    /* Drop overallocation. Cannot fail. */
+    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
+
+    /* Sort canonically. */
+    i = PyUnicode_AS_UNICODE(result);
+    prev = _getrecord_ex(*i)->combining;
+    end = i + PyUnicode_GET_SIZE(result);
+    for (i++; i < end; i++) {
+        cur = _getrecord_ex(*i)->combining;
+        if (prev == 0 || cur == 0 || prev <= cur) {
+            prev = cur;
+            continue;
+        }
+        /* Non-canonical order. Need to switch *i with previous. */
+        o = i - 1;
+        while (1) {
+            Py_UNICODE tmp = o[1];
+            o[1] = o[0];
+            o[0] = tmp;
+            o--;
+            if (o < PyUnicode_AS_UNICODE(result))
+                break;
+            prev = _getrecord_ex(*o)->combining;
+            if (prev == 0 || prev <= cur)
+                break;
+        }
+        prev = _getrecord_ex(*i)->combining;
+    }
+    return result;
+}
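The Hangul branch of nfd_nfkd() needs no table data: the 11,172 precomposed
syllables decompose by pure arithmetic, per the algorithm in the Unicode
standard (UAX #15). The same computation as a standalone Python sketch
(decompose_hangul is a hypothetical name used here for illustration only):

    SBase, LBase, VBase, TBase = 0xAC00, 0x1100, 0x1161, 0x11A7
    LCount, VCount, TCount = 19, 21, 28
    NCount = VCount * TCount    # 588
    SCount = LCount * NCount    # 11172

    def decompose_hangul(code):
        SIndex = code - SBase
        if SIndex < 0 or SIndex >= SCount:
            return [code]                        # not a precomposed syllable
        L = LBase + SIndex / NCount              # leading consonant jamo
        V = VBase + (SIndex % NCount) / TCount   # vowel jamo
        T = TBase + SIndex % TCount              # trailing consonant, if any
        if T == TBase:                           # TIndex 0 means no trailing jamo
            return [L, V]
        return [L, V, T]

    # decompose_hangul(0xAC01) == [0x1100, 0x1161, 0x11A8]   (GAG -> G, A, G)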
+static int
+find_nfc_index(struct reindex* nfc, Py_UNICODE code)
+{
+    int index;
+    for (index = 0; nfc[index].start; index++) {
+        int start = nfc[index].start;
+        if (code < start)
+            return -1;
+        if (code <= start + nfc[index].count) {
+            int delta = code - start;
+            return nfc[index].index + delta;
+        }
+    }
+    return -1;
+}
+
+static PyObject*
+nfc_nfkc(PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *i1, *o, *end;
+    int f,l,index,index1,comb;
+    Py_UNICODE code;
+    Py_UNICODE *skipped[20];
+    int cskipped = 0;
+
+    result = nfd_nfkd(input, k);
+    if (!result)
+        return NULL;
+
+    /* We are going to modify result in-place.
+       If nfd_nfkd is changed to sometimes return the input,
+       this code needs to be reviewed. */
+    assert(result != input);
+
+    i = PyUnicode_AS_UNICODE(result);
+    end = i + PyUnicode_GET_SIZE(result);
+    o = PyUnicode_AS_UNICODE(result);
+
+  again:
+    while (i < end) {
+        for (index = 0; index < cskipped; index++) {
+            if (skipped[index] == i) {
+                /* *i character is skipped.
+                   Remove from list. */
+                skipped[index] = skipped[cskipped-1];
+                cskipped--;
+                i++;
+                goto again; /* continue while */
+            }
+        }
+        /* Hangul Composition. We don't need to check for <LV,T>
+           pairs, since we always have decomposed data. */
+        if (LBase <= *i && *i < (LBase+LCount) &&
+            i + 1 < end &&
+            VBase <= i[1] && i[1] < (VBase+VCount)) {
+            int LIndex, VIndex;
+            LIndex = i[0] - LBase;
+            VIndex = i[1] - VBase;
+            code = SBase + (LIndex*VCount+VIndex)*TCount;
+            i+=2;
+            if (i < end &&
+                TBase < *i && *i < (TBase+TCount)) {
+                code += *i-TBase;
+                i++;
+            }
+            *o++ = code;
+            continue;
+        }
+
+        f = find_nfc_index(nfc_first, *i);
+        if (f == -1) {
+            *o++ = *i++;
+            continue;
+        }
+        /* Find next unblocked character. */
+        i1 = i+1;
+        comb = 0;
+        while (i1 < end) {
+            int comb1 = _getrecord_ex(*i1)->combining;
+            if (comb1 && comb == comb1) {
+                /* Character is blocked. */
+                i1++;
+                continue;
+            }
+            l = find_nfc_index(nfc_last, *i1);
+            /* *i1 cannot be combined with *i. If *i1
+               is a starter, we don't need to look further.
+               Otherwise, record the combining class. */
+            if (l == -1) {
+              not_combinable:
+                if (comb1 == 0)
+                    break;
+                comb = comb1;
+                i1++;
+                continue;
+            }
+            index = f*TOTAL_LAST + l;
+            index1 = comp_index[index >> COMP_SHIFT];
+            code = comp_data[(index1<<COMP_SHIFT)+
+                             (index&((1<<COMP_SHIFT)-1))];
+            if (code == 0)
+                goto not_combinable;
+
+            /* Replace the original character,
+               and mark the second one as unused. */
+            *i = code;
+            skipped[cskipped++] = i1;
+            i1++;
+            f = find_nfc_index(nfc_first, *i);
+            if (f == -1)
+                break;
+        }
+        *o++ = *i++;
+    }
+    if (o != end)
+        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
+    return result;
+}
+
+static PyObject*
+unicodedata_normalize(PyObject *self, PyObject *args)
+{
+    char *form;
+    PyObject *input;
+
+    if(!PyArg_ParseTuple(args, "sO!:normalize",
+                         &form, &PyUnicode_Type, &input))
+        return NULL;
+
+    if (strcmp(form, "NFC") == 0)
+        return nfc_nfkc(input, 0);
+    if (strcmp(form, "NFKC") == 0)
+        return nfc_nfkc(input, 1);
+    if (strcmp(form, "NFD") == 0)
+        return nfd_nfkd(input, 0);
+    if (strcmp(form, "NFKD") == 0)
+        return nfd_nfkd(input, 1);
+    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
+    return NULL;
+}
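nfc_nfkc() composes Hangul arithmetically as well, and consults the generated
comp_index/comp_data pair tables (emitted into unicodedata_db.h by the script
changes below) for everything else. The composition formula is the inverse of
the decomposition sketch above; compose_hangul is again a hypothetical name,
and the inputs are assumed to be a valid leading jamo, vowel jamo, and
optional trailing jamo:

    SBase, LBase, VBase, TBase = 0xAC00, 0x1100, 0x1161, 0x11A7
    VCount, TCount = 21, 28

    def compose_hangul(L, V, T=TBase):
        # Each (L,V) pair starts a run of TCount syllables; T picks one of them.
        LIndex, VIndex, TIndex = L - LBase, V - VBase, T - TBase
        return SBase + (LIndex * VCount + VIndex) * TCount + TIndex

    # compose_hangul(0x1100, 0x1161) == 0xAC00            (GA)
    # compose_hangul(0x1100, 0x1161, 0x11A8) == 0xAC01    (GAG)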
*/" + print >>fp, "#define TOTAL_FIRST",total_first + print >>fp, "#define TOTAL_LAST",total_last + print >>fp, "struct reindex{int start;short count,index;};" + print >>fp, "struct reindex nfc_first[] = {" + for start,end in comp_first_ranges: + print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start]) + print >>fp," {0,0,0}" + print >>fp,"};\n" + print >>fp, "struct reindex nfc_last[] = {" + for start,end in comp_last_ranges: + print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start]) + print >>fp," {0,0,0}" + print >>fp,"};\n" + # FIXME: the following tables could be made static, and # the support code moved into unicodedatabase.c @@ -185,6 +255,12 @@ def makeunicodedata(unicode, trace): Array("decomp_index1", index1).dump(fp, trace) Array("decomp_index2", index2).dump(fp, trace) + index, index2, shift = splitbins(comp_data, trace) + print >>fp, "/* NFC pairs */" + print >>fp, "#define COMP_SHIFT", shift + Array("comp_index", index).dump(fp, trace) + Array("comp_data", index2).dump(fp, trace) + fp.close() # -------------------------------------------------------------------- @@ -454,7 +530,7 @@ import sys class UnicodeData: - def __init__(self, filename, expand=1): + def __init__(self, filename, exclusions, expand=1): file = open(filename) table = [None] * 0x110000 while 1: @@ -486,6 +562,17 @@ class UnicodeData: self.table = table self.chars = range(0x110000) # unicode 3.2 + file = open(exclusions) + self.exclusions = {} + for s in file: + s = s.strip() + if not s: + continue + if s[0] == '#': + continue + char = int(s.split()[0],16) + self.exclusions[char] = 1 + def uselatin1(self): # restrict character range to ISO Latin 1 self.chars = range(256) -- cgit v0.12