summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2001-06-27 06:28:56 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2001-06-27 06:28:56 (GMT)
commit: ce9b5a55e164f1128756478b6a2bb548abec1980 (patch)
tree: 0b616e0fae5ec7204f723235d196ae2b7c124d78 /Objects
parent: 236d8b79748fec890d57ad0dd99ea3f1c3ba57df (diff)
download: cpython-ce9b5a55e164f1128756478b6a2bb548abec1980.zip
cpython-ce9b5a55e164f1128756478b6a2bb548abec1980.tar.gz
cpython-ce9b5a55e164f1128756478b6a2bb548abec1980.tar.bz2
Encode surrogates in UTF-8 even for a wide Py_UNICODE.
Implement sys.maxunicode. Explicitly wrap around upper/lower computations for wide Py_UNICODE. When decoding large characters with UTF-8, represent expected test results using the \U notation.
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/unicodectype.c 35
-rw-r--r-- Objects/unicodeobject.c 19
2 files changed, 39 insertions, 15 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
index 3bc19b2..13fc612 100644
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
/* Returns the titlecase Unicode characters corresponding to ch or just
ch if no titlecase mapping is known. */
-Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->title)
- return ch + ctype->title;
-
- return ch + ctype->upper;
+ ch += ctype->title;
+ else
+ ch += ctype->upper;
+
+#ifdef USE_UCS4_STORAGE
+ /* The database assumes that the values wrap around at 0x10000. */
+ if (ch > 0x10000)
+ ch -= 0x10000;
+#endif
+ return ch;
}
/* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
/* Returns the uppercase Unicode characters corresponding to ch or just
ch if no uppercase mapping is known. */
-Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- return ch + ctype->upper;
+ ch += ctype->upper;
+#ifdef USE_UCS4_STORAGE
+ /* The database assumes that the values wrap around at 0x10000. */
+ if (ch > 0x10000)
+ ch -= 0x10000;
+#endif
+ return ch;
}
/* Returns the lowercase Unicode characters corresponding to ch or just
ch if no lowercase mapping is known. */
-Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- return ch + ctype->lower;
+ ch += ctype->lower;
+#ifdef USE_UCS4_STORAGE
+ /* The database assumes that the values wrap around at 0x10000. */
+ if (ch > 0x10000)
+ ch -= 0x10000;
+#endif
+ return ch;
}
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ffac371..2f66c3c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256];
*/
static char unicode_default_encoding[100];
+Py_UNICODE
+PyUnicode_GetMax()
+{
+#ifdef USE_UCS4_STORAGE
+ return 0x10FFFF;
+#else
+ /* This is actually an illegal character, so it should
+ not be passed to unichr. */
+ return 0xFFFF;
+#endif
+}
+
/* --- Unicode Object ----------------------------------------------------- */
static
@@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
cbWritten += 2;
}
else if (ch < 0x10000) {
-#if Py_UNICODE_SIZE == 4
- *p++ = 0xe0 | (ch>>12);
- *p++ = 0x80 | ((ch>>6) & 0x3f);
- *p++ = 0x80 | (ch & 0x3f);
- cbWritten += 3;
-#else
/* Check for high surrogate */
if (0xD800 <= ch && ch <= 0xDBFF) {
if (i != size) {
@@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
}
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
-#endif
} else {
*p++ = 0xf0 | (ch>>18);
*p++ = 0x80 | ((ch>>12) & 0x3f);