From ce9b5a55e164f1128756478b6a2bb548abec1980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Wed, 27 Jun 2001 06:28:56 +0000 Subject: Encode surrogates in UTF-8 even for a wide Py_UNICODE. Implement sys.maxunicode. Explicitly wrap around upper/lower computations for wide Py_UNICODE. When decoding large characters with UTF-8, represent expected test results using the \U notation. --- Include/unicodeobject.h | 3 +++ Lib/test/test_unicode.py | 4 ++-- Objects/unicodectype.c | 35 +++++++++++++++++++++++++++-------- Objects/unicodeobject.c | 19 ++++++++++++------- Python/sysmodule.c | 4 ++++ 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 87e01af..d89537f 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -274,6 +274,9 @@ extern DL_IMPORT(int) PyUnicode_GetSize( PyObject *unicode /* Unicode object */ ); +/* Get the maximum ordinal for a Unicode character. */ +extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void); + /* Resize an already allocated Unicode object to the new size length. *unicode is modified to point to the new (resized) object and 0 diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index c82ac69..c9732d6 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -386,9 +386,9 @@ verify(u'\ud84d\udc56'.encode('utf-8') == \ ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) ) # UTF-8 specific decoding tests verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))), - 'utf-8') == u'\ud84d\udc56' ) + 'utf-8') == u'\U00023456' ) verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))), - 'utf-8') == u'\ud800\udc02' ) + 'utf-8') == u'\U00010002' ) verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))), 'utf-8') == u'\u20ac' ) diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 3bc19b2..13fc612 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) /* Returns the titlecase Unicode characters corresponding to ch or just ch if no titlecase mapping is known. */ -Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch) +Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->title) - return ch + ctype->title; - - return ch + ctype->upper; + ch += ctype->title; + else + ch += ctype->upper; + +#ifdef USE_UCS4_STORAGE + /* The database assumes that the values wrap around at 0x10000. */ + if (ch > 0x10000) + ch -= 0x10000; +#endif + return ch; } /* Returns 1 for Unicode characters having the category 'Lt', 0 @@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch) /* Returns the uppercase Unicode characters corresponding to ch or just ch if no uppercase mapping is known. */ -Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch) +Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - return ch + ctype->upper; + ch += ctype->upper; +#ifdef USE_UCS4_STORAGE + /* The database assumes that the values wrap around at 0x10000. */ + if (ch > 0x10000) + ch -= 0x10000; +#endif + return ch; } /* Returns the lowercase Unicode characters corresponding to ch or just ch if no lowercase mapping is known. */ -Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch) +Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - return ch + ctype->lower; + ch += ctype->lower; +#ifdef USE_UCS4_STORAGE + /* The database assumes that the values wrap around at 0x10000. */ + if (ch > 0x10000) + ch -= 0x10000; +#endif + return ch; } /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ffac371..2f66c3c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256]; */ static char unicode_default_encoding[100]; +Py_UNICODE +PyUnicode_GetMax() +{ +#ifdef USE_UCS4_STORAGE + return 0x10FFFF; +#else + /* This is actually an illegal character, so it should + not be passed to unichr. */ + return 0xFFFF; +#endif +} + /* --- Unicode Object ----------------------------------------------------- */ static @@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, cbWritten += 2; } else if (ch < 0x10000) { -#if Py_UNICODE_SIZE == 4 - *p++ = 0xe0 | (ch>>12); - *p++ = 0x80 | ((ch>>6) & 0x3f); - *p++ = 0x80 | (ch & 0x3f); - cbWritten += 3; -#else /* Check for high surrogate */ if (0xD800 <= ch && ch <= 0xDBFF) { if (i != size) { @@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, } *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); -#endif } else { *p++ = 0xf0 | (ch>>18); *p++ = 0x80 | ((ch>>12) & 0x3f); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 62e0841..fe880d5 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -533,6 +533,7 @@ exc_traceback -- traceback of exception currently being handled\n\ Static objects:\n\ \n\ maxint -- the largest supported integer (the smallest is -maxint-1)\n\ +maxunicode -- the largest supported character\n\ builtin_module_names -- tuple of module names built into this intepreter\n\ version -- the version of this interpreter as a string\n\ version_info -- version information as a tuple\n\ @@ -643,6 +644,9 @@ _PySys_Init(void) PyDict_SetItemString(sysdict, "maxint", v = PyInt_FromLong(PyInt_GetMax())); Py_XDECREF(v); + PyDict_SetItemString(sysdict, "maxunicode", + v = PyInt_FromLong(PyUnicode_GetMax())); + Py_XDECREF(v); PyDict_SetItemString(sysdict, "builtin_module_names", v = list_builtin_module_names()); Py_XDECREF(v); -- cgit v0.12