diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2006-06-04 19:36:28 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2006-06-04 19:36:28 (GMT) |
commit | 3f767795f6784ca6bf797b055be67fce5bf2fa06 (patch) | |
tree | 851eea9180023026ef3ef2319fae871a50bb479c /Objects | |
parent | 67966bed72dc175938c4b971145c4746ac993f19 (diff) | |
download | cpython-3f767795f6784ca6bf797b055be67fce5bf2fa06.zip cpython-3f767795f6784ca6bf797b055be67fce5bf2fa06.tar.gz cpython-3f767795f6784ca6bf797b055be67fce5bf2fa06.tar.bz2 |
Patch #1359618: Speed-up charmap encoder.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 334 |
1 files changed, 285 insertions, 49 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f93cfa5..eb5bdd8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3057,6 +3057,219 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, return NULL; } +/* Charmap encoding: the lookup table */ + +struct encoding_map{ + PyObject_HEAD + unsigned char level1[32]; + int count2, count3; + unsigned char level23[1]; +}; + +static PyObject* +encoding_map_size(PyObject *obj, PyObject* args) +{ + struct encoding_map *map = (struct encoding_map*)obj; + return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + + 128*map->count3); +} + +static PyMethodDef encoding_map_methods[] = { + {"size", encoding_map_size, METH_NOARGS, + PyDoc_STR("Return the size (in bytes) of this object") }, + { 0 } +}; + +static void +encoding_map_dealloc(PyObject* o) +{ + PyObject_FREE(o); +} + +static PyTypeObject EncodingMapType = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "EncodingMap", /*tp_name*/ + sizeof(struct encoding_map), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + /* methods */ + encoding_map_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + encoding_map_methods, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + 0, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ +}; + +PyObject* +PyUnicode_BuildEncodingMap(PyObject* string) +{ + Py_UNICODE *decode; + PyObject *result; + struct encoding_map *mresult; + int i; + int need_dict = 0; + unsigned char level1[32]; + unsigned char level2[512]; + unsigned char *mlevel1, *mlevel2, *mlevel3; + int count2 = 0, count3 = 0; + + if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { + PyErr_BadArgument(); + return NULL; + } + decode = PyUnicode_AS_UNICODE(string); + memset(level1, 0xFF, sizeof level1); + memset(level2, 0xFF, sizeof level2); + + /* If there isn't a one-to-one mapping of NULL to \0, + or if there are non-BMP characters, we need to use + a mapping dictionary. */ + if (decode[0] != 0) + need_dict = 1; + for (i = 1; i < 256; i++) { + int l1, l2; + if (decode[i] == 0 + #ifdef Py_UNICODE_WIDE + || decode[i] > 0xFFFF + #endif + ) { + need_dict = 1; + break; + } + if (decode[i] == 0xFFFE) + /* unmapped character */ + continue; + l1 = decode[i] >> 11; + l2 = decode[i] >> 7; + if (level1[l1] == 0xFF) + level1[l1] = count2++; + if (level2[l2] == 0xFF) + level2[l2] = count3++; + } + + if (count2 >= 0xFF || count3 >= 0xFF) + need_dict = 1; + + if (need_dict) { + PyObject *result = PyDict_New(); + PyObject *key, *value; + if (!result) + return NULL; + for (i = 0; i < 256; i++) { + key = value = NULL; + key = PyInt_FromLong(decode[i]); + value = PyInt_FromLong(i); + if (!key || !value) + goto failed1; + if (PyDict_SetItem(result, key, value) == -1) + goto failed1; + } + return result; + failed1: + Py_XDECREF(key); + Py_XDECREF(value); + Py_DECREF(result); + return NULL; + } + + /* Create a three-level trie */ + result = PyObject_MALLOC(sizeof(struct encoding_map) + + 16*count2 + 128*count3 - 1); + if (!result) + return PyErr_NoMemory(); + PyObject_Init(result, &EncodingMapType); + mresult = (struct encoding_map*)result; + mresult->count2 = count2; + mresult->count3 = count3; + mlevel1 = mresult->level1; + mlevel2 = mresult->level23; + mlevel3 = mresult->level23 + 16*count2; + memcpy(mlevel1, level1, 32); + memset(mlevel2, 0xFF, 16*count2); + memset(mlevel3, 0, 128*count3); + count3 = 0; + for (i = 1; i < 256; i++) { + int o1, o2, o3, i2, i3; + if (decode[i] == 0xFFFE) + /* unmapped character */ + continue; + o1 = decode[i]>>11; + o2 = (decode[i]>>7) & 0xF; + i2 = 16*mlevel1[o1] + o2; + if (mlevel2[i2] == 0xFF) + mlevel2[i2] = count3++; + o3 = decode[i] & 0x7F; + i3 = 128*mlevel2[i2] + o3; + mlevel3[i3] = i; + } + return result; +} + +static int +encoding_map_lookup(Py_UNICODE c, PyObject *mapping) +{ + struct encoding_map *map = (struct encoding_map*)mapping; + int l1 = c>>11; + int l2 = (c>>7) & 0xF; + int l3 = c & 0x7F; + int i; + +#ifdef Py_UNICODE_WIDE + if (c > 0xFFFF) { + return -1; + } +#endif + if (c == 0) + return 0; + /* level 1*/ + i = map->level1[l1]; + if (i == 0xFF) { + return -1; + } + /* level 2*/ + i = map->level23[16*i+l2]; + if (i == 0xFF) { + return -1; + } + /* level 3 */ + i = map->level23[16*map->count2 + 128*i + l3]; + if (i == 0) { + return -1; + } + return i; +} + /* Lookup the character ch in the mapping. If the character can't be found, Py_None is returned (or NULL, if another error occurred). */ @@ -3102,6 +3315,22 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) } } +static int +charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) +{ + Py_ssize_t outsize = PyString_GET_SIZE(*outobj); + /* exponentially overallocate to minimize reallocations */ + if (requiredsize < 2*outsize) + requiredsize = 2*outsize; + if (_PyString_Resize(outobj, requiredsize)) { + return 0; + } + return 1; +} + +typedef enum charmapencode_result { + enc_SUCCESS, enc_FAILED, enc_EXCEPTION +}charmapencode_result; /* lookup the character, put the result in the output string and adjust various state variables. Reallocate the output string if not enough space is available. Return a new reference to the object that @@ -3109,51 +3338,58 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) (in which case no character was written) or NULL, if a reallocation error occurred. The caller must decref the result */ static -PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, +charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, PyObject **outobj, Py_ssize_t *outpos) { - PyObject *rep = charmapencode_lookup(c, mapping); + PyObject *rep; + char *outstart; + Py_ssize_t outsize = PyString_GET_SIZE(*outobj); + if (mapping->ob_type == &EncodingMapType) { + int res = encoding_map_lookup(c, mapping); + Py_ssize_t requiredsize = *outpos+1; + if (res == -1) + return enc_FAILED; + if (outsize<requiredsize) + if (!charmapencode_resize(outobj, outpos, requiredsize)) + return enc_EXCEPTION; + outstart = PyString_AS_STRING(*outobj); + outstart[(*outpos)++] = (char)res; + return enc_SUCCESS; + } + + rep = charmapencode_lookup(c, mapping); if (rep==NULL) - return NULL; - else if (rep==Py_None) - return rep; - else { - char *outstart = PyString_AS_STRING(*outobj); - Py_ssize_t outsize = PyString_GET_SIZE(*outobj); + return enc_EXCEPTION; + else if (rep==Py_None) { + Py_DECREF(rep); + return enc_FAILED; + } else { if (PyInt_Check(rep)) { Py_ssize_t requiredsize = *outpos+1; - if (outsize<requiredsize) { - /* exponentially overallocate to minimize reallocations */ - if (requiredsize < 2*outsize) - requiredsize = 2*outsize; - if (_PyString_Resize(outobj, requiredsize)) { + if (outsize<requiredsize) + if (!charmapencode_resize(outobj, outpos, requiredsize)) { Py_DECREF(rep); - return NULL; + return enc_EXCEPTION; } - outstart = PyString_AS_STRING(*outobj); - } + outstart = PyString_AS_STRING(*outobj); outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); } else { const char *repchars = PyString_AS_STRING(rep); Py_ssize_t repsize = PyString_GET_SIZE(rep); Py_ssize_t requiredsize = *outpos+repsize; - if (outsize<requiredsize) { - /* exponentially overallocate to minimize reallocations */ - if (requiredsize < 2*outsize) - requiredsize = 2*outsize; - if (_PyString_Resize(outobj, requiredsize)) { + if (outsize<requiredsize) + if (!charmapencode_resize(outobj, outpos, requiredsize)) { Py_DECREF(rep); - return NULL; + return enc_EXCEPTION; } - outstart = PyString_AS_STRING(*outobj); - } + outstart = PyString_AS_STRING(*outobj); memcpy(outstart + *outpos, repchars, repsize); *outpos += repsize; } } - return rep; + return enc_SUCCESS; } /* handle an error in PyUnicode_EncodeCharmap @@ -3175,18 +3411,27 @@ int charmap_encoding_error( Py_ssize_t collpos; char *encoding = "charmap"; char *reason = "character maps to <undefined>"; + charmapencode_result x; - PyObject *x; /* find all unencodable characters */ while (collendpos < size) { - x = charmapencode_lookup(p[collendpos], mapping); - if (x==NULL) + PyObject *rep; + if (mapping->ob_type == &EncodingMapType) { + int res = encoding_map_lookup(p[collendpos], mapping); + if (res != -1) + break; + ++collendpos; + continue; + } + + rep = charmapencode_lookup(p[collendpos], mapping); + if (rep==NULL) return -1; - else if (x!=Py_None) { - Py_DECREF(x); + else if (rep!=Py_None) { + Py_DECREF(rep); break; } - Py_DECREF(x); + Py_DECREF(rep); ++collendpos; } /* cache callback name lookup @@ -3210,15 +3455,13 @@ int charmap_encoding_error( case 2: /* replace */ for (collpos = collstartpos; collpos<collendpos; ++collpos) { x = charmapencode_output('?', mapping, res, respos); - if (x==NULL) { + if (x==enc_EXCEPTION) { return -1; } - else if (x==Py_None) { - Py_DECREF(x); + else if (x==enc_FAILED) { raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); return -1; } - Py_DECREF(x); } /* fall through */ case 3: /* ignore */ @@ -3232,14 +3475,12 @@ int charmap_encoding_error( sprintf(buffer, "&#%d;", (int)p[collpos]); for (cp = buffer; *cp; ++cp) { x = charmapencode_output(*cp, mapping, res, respos); - if (x==NULL) + if (x==enc_EXCEPTION) return -1; - else if (x==Py_None) { - Py_DECREF(x); + else if (x==enc_FAILED) { raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); return -1; } - Py_DECREF(x); } } *inpos = collendpos; @@ -3254,17 +3495,14 @@ int charmap_encoding_error( repsize = PyUnicode_GET_SIZE(repunicode); for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { x = charmapencode_output(*uni2, mapping, res, respos); - if (x==NULL) { - Py_DECREF(repunicode); + if (x==enc_EXCEPTION) { return -1; } - else if (x==Py_None) { + else if (x==enc_FAILED) { Py_DECREF(repunicode); - Py_DECREF(x); raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); return -1; } - Py_DECREF(x); } *inpos = newpos; Py_DECREF(repunicode); @@ -3304,22 +3542,20 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, while (inpos<size) { /* try to encode it */ - PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos); - if (x==NULL) /* error */ + charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); + if (x==enc_EXCEPTION) /* error */ goto onError; - if (x==Py_None) { /* unencodable character */ + if (x==enc_FAILED) { /* unencodable character */ if (charmap_encoding_error(p, size, &inpos, mapping, &exc, &known_errorHandler, &errorHandler, errors, &res, &respos)) { - Py_DECREF(x); goto onError; } } else /* done with this character => adjust input position */ ++inpos; - Py_DECREF(x); } /* Resize if we allocated to much */ |