1 files changed, 59 insertions, 43 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 95bdf3c..4db5087 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1,12 +1,13 @@
 /* ------------------------------------------------------------------------
 
-   unicodedata -- Provides access to the Unicode 5.2 data base.
+   unicodedata -- Provides access to the Unicode database.
 
-   Data was extracted from the Unicode 5.2 UnicodeData.txt file.
+   Data was extracted from the UnicodeData.txt file.
+   The current version number is reported in the unidata_version constant.
 
    Written by Marc-Andre Lemburg (mal@lemburg.com).
    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
-   Modified by Martin v. Löwis (martin@v.loewis.de)
+   Modified by Martin v. LÃ¶wis (martin@v.loewis.de)
 
    Copyright (c) Corporation for National Research Initiatives.
 
@@ -73,6 +74,7 @@ static PyMemberDef DB_members[] = {
 
 /* forward declaration */
 static PyTypeObject UCD_Type;
+#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
 
 static PyObject*
 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
@@ -130,7 +132,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
 
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0) {
             /* unassigned */
@@ -156,7 +158,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
             return defobj;
         }
     }
-    return PyInt_FromLong(rc);
+    return PyLong_FromLong(rc);
 }
 
 PyDoc_STRVAR(unicodedata_digit__doc__,
@@ -190,7 +192,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
             return defobj;
         }
     }
-    return PyInt_FromLong(rc);
+    return PyLong_FromLong(rc);
 }
 
 PyDoc_STRVAR(unicodedata_numeric__doc__,
@@ -215,7 +217,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
 
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0) {
             /* unassigned */
@@ -263,12 +265,12 @@ unicodedata_category(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
     index = (int) _getrecord_ex(c)->category;
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed != 0xFF)
             index = old->category_changed;
     }
-    return PyString_FromString(_PyUnicode_CategoryNames[index]);
+    return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
 }
 
 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
@@ -292,14 +294,14 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
     index = (int) _getrecord_ex(c)->bidirectional;
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
         else if (old->bidir_changed != 0xFF)
             index = old->bidir_changed;
     }
-    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
+    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
 }
 
 PyDoc_STRVAR(unicodedata_combining__doc__,
@@ -323,12 +325,12 @@ unicodedata_combining(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
     index = (int) _getrecord_ex(c)->combining;
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
     }
-    return PyInt_FromLong(index);
+    return PyLong_FromLong(index);
 }
 
 PyDoc_STRVAR(unicodedata_mirrored__doc__,
@@ -352,14 +354,14 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
     index = (int) _getrecord_ex(c)->mirrored;
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
         else if (old->mirrored_changed != 0xFF)
             index = old->mirrored_changed;
     }
-    return PyInt_FromLong(index);
+    return PyLong_FromLong(index);
 }
 
 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
@@ -382,12 +384,12 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
     if (c == (Py_UCS4)-1)
         return NULL;
     index = (int) _getrecord_ex(c)->east_asian_width;
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
     }
-    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
+    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
 }
 
 PyDoc_STRVAR(unicodedata_decomposition__doc__,
@@ -402,7 +404,8 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
 {
     PyUnicodeObject *v;
     char decomp[256];
-    int code, index, count, i;
+    int code, index, count;
+    size_t i;
     unsigned int prefix_index;
     Py_UCS4 c;
 
@@ -415,10 +418,10 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
 
     code = (int)c;
 
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
-            return PyString_FromString(""); /* unassigned */
+            return PyUnicode_FromString(""); /* unassigned */
     }
 
     if (code < 0 || code >= 0x110000)
@@ -449,15 +452,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
     while (count-- > 0) {
         if (i)
             decomp[i++] = ' ';
-        assert((size_t)i < sizeof(decomp));
+        assert(i < sizeof(decomp));
         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                       decomp_data[++index]);
         i += strlen(decomp + i);
     }
-
-    decomp[i] = '\0';
-
-    return PyString_FromString(decomp);
+    return PyUnicode_FromStringAndSize(decomp, i);
 }
 
 static void
@@ -465,7 +465,8 @@ get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *co
 {
     if (code >= 0x110000) {
         *index = 0;
-    } else if (self && get_old_record(self, code)->category_changed==0) {
+    } else if (self && UCD_Check(self) &&
+               get_old_record(self, code)->category_changed==0) {
         /* unassigned in old version */
         *index = 0;
     }
@@ -522,7 +523,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
             /* Hangul Decomposition adds three characters in
                a single step, so we need atleast that much room. */
             if (space < 3) {
-                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
+                Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
                 space += 10;
                 if (PyUnicode_Resize(&result, newsize) == -1)
                     return NULL;
@@ -544,7 +545,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
                 continue;
             }
             /* normalization changes */
-            if (self) {
+            if (self && UCD_Check(self)) {
                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                 if (value != 0) {
                     stack[stackptr++] = value;
@@ -736,7 +737,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 
     /* An older version of the database is requested, quickchecks must be
        disabled. */
-    if (self != NULL)
+    if (self && UCD_Check(self))
         return 0;
 
     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
@@ -869,14 +870,16 @@ static char *hangul_syllables[][3] = {
     { 0,    0,     "H"  }
 };
 
+/* These ranges need to match makeunicodedata.py:cjk_ranges. */
 static int
 is_unified_ideograph(Py_UCS4 code)
 {
-    return (
-        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
-        (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
+    return
+        (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
+        (0x4E00 <= code && code <= 0x9FCB)   || /* CJK Ideograph */
         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
-        (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
+        (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
+        (0x2B740 <= code && code <= 0x2B81D);   /* CJK Ideograph Extension D */
 }
 
 static int
@@ -890,7 +893,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
     if (code >= 0x110000)
         return 0;
 
-    if (self) {
+    if (self && UCD_Check(self)) {
         const change_record *old = get_old_record(self, code);
         if (old->category_changed == 0) {
             /* unassigned */
@@ -1126,7 +1129,7 @@ unicodedata_name(PyObject* self, PyObject* args)
         }
     }
 
-    return Py_BuildValue("s", name);
+    return PyUnicode_FromString(name);
 }
 
 PyDoc_STRVAR(unicodedata_lookup__doc__,
@@ -1201,7 +1204,7 @@ static PyTypeObject UCD_Type = {
         0,                      /*tp_print*/
         0,                      /*tp_getattr*/
         0,                      /*tp_setattr*/
-        0,                      /*tp_compare*/
+        0,                      /*tp_reserved*/
         0,                      /*tp_repr*/
         0,                      /*tp_as_number*/
         0,                      /*tp_as_sequence*/
@@ -1239,23 +1242,35 @@ PyDoc_STRVAR(unicodedata_docstring,
 "This module provides access to the Unicode Character Database which\n\
 defines character properties for all Unicode characters. The data in\n\
 this database is based on the UnicodeData.txt file version\n\
-5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
+6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
 \n\
 The module uses the same names and symbols as defined by the\n\
-UnicodeData File Format 5.2.0 (see\n\
-http://www.unicode.org/reports/tr44/tr44-4.html).");
+UnicodeData File Format 6.0.0 (see\n\
+http://www.unicode.org/reports/tr44/tr44-6.html).");
+
+
+static struct PyModuleDef unicodedatamodule = {
+        PyModuleDef_HEAD_INIT,
+        "unicodedata",
+        unicodedata_docstring,
+        -1,
+        unicodedata_functions,
+        NULL,
+        NULL,
+        NULL,
+        NULL
+};
 
 PyMODINIT_FUNC
-initunicodedata(void)
+PyInit_unicodedata(void)
 {
     PyObject *m, *v;
 
     Py_TYPE(&UCD_Type) = &PyType_Type;
 
-    m = Py_InitModule3(
-        "unicodedata", unicodedata_functions, unicodedata_docstring);
+    m = PyModule_Create(&unicodedatamodule);
     if (!m)
-        return;
+        return NULL;
 
     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
     Py_INCREF(&UCD_Type);
@@ -1270,6 +1285,7 @@ initunicodedata(void)
     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
     if (v != NULL)
         PyModule_AddObject(m, "ucnhash_CAPI", v);
+    return m;
 }
 
 /*