summary refs log tree commit diff stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 295
1 files changed, 233 insertions, 62 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c3ab2d8..9e35b61 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
/* --- MBCS codecs for Windows -------------------------------------------- */
-PyObject *PyUnicode_DecodeMBCS(const char *s,
- Py_ssize_t size,
- const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+ a) it assumes an incomplete character consists of a single byte, and
+ b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+ encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+ const char *curr = s + offset;
+
+ if (IsDBCSLeadByte(*curr)) {
+ const char *prev = CharPrev(s, curr);
+ return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+ }
+ return 0;
+}
+
+/*
+ * Decode an MBCS string into a unicode object. If 'final' is set, the
+ * trailing lead byte is converted too. Returns consumed size on success, -1 otherwise.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+ const char *s, /* MBCS string */
+ int size, /* sizeof MBCS string */
+ int final)
{
- PyUnicodeObject *v;
Py_UNICODE *p;
- DWORD usize;
+ Py_ssize_t n = 0;
+ int usize = 0;
+
+ assert(size >= 0);
+
+ /* Skip trailing lead-byte unless 'final' is set */
+ if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+ --size;
/* First get the size of the result */
- assert(size < INT_MAX);
- usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
- if (size > 0 && usize==0)
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+ if (usize == 0) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
- v = _PyUnicode_New(usize);
- if (v == NULL)
- return NULL;
- if (usize == 0)
- return (PyObject *)v;
- p = PyUnicode_AS_UNICODE(v);
- if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
- Py_DECREF(v);
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (*v == NULL) {
+ /* Create unicode object */
+ *v = _PyUnicode_New(usize);
+ if (*v == NULL)
+ return -1;
+ }
+ else {
+ /* Extend unicode object */
+ n = PyUnicode_GET_SIZE(*v);
+ if (_PyUnicode_Resize(v, n + usize) < 0)
+ return -1;
+ }
+
+ /* Do the conversion */
+ if (size > 0) {
+ p = PyUnicode_AS_UNICODE(*v) + n;
+ if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
+
+ return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ Py_ssize_t *consumed)
+{
+ PyUnicodeObject *v = NULL;
+ int done;
+
+ if (consumed)
+ *consumed = 0;
+
+#ifdef NEED_RETRY
+ retry:
+ if (size > INT_MAX)
+ done = decode_mbcs(&v, s, INT_MAX, 0);
+ else
+#endif
+ done = decode_mbcs(&v, s, (int)size, !consumed);
+
+ if (done < 0) {
+ Py_XDECREF(v);
+ return NULL;
+ }
+
+ if (consumed)
+ *consumed += done;
+
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ s += done;
+ size -= done;
+ goto retry;
}
+#endif
return (PyObject *)v;
}
-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
{
- PyObject *repr;
- char *s;
- DWORD mbcssize;
+ return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
+
+/*
+ * Convert unicode into a string object (MBCS).
+ * Returns 0 on success, -1 otherwise.
+ */
+static int encode_mbcs(PyObject **repr,
+ const Py_UNICODE *p, /* unicode */
+ int size) /* size of unicode */
+{
+ int mbcssize = 0;
+ Py_ssize_t n = 0;
- /* If there are no characters, bail now! */
- if (size==0)
- return PyString_FromString("");
+ assert(size >= 0);
/* First get the size of the result */
- assert(size<INT_MAX);
- mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
- if (mbcssize==0)
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+ if (mbcssize == 0) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
- repr = PyString_FromStringAndSize(NULL, mbcssize);
- if (repr == NULL)
- return NULL;
- if (mbcssize == 0)
- return repr;
+ if (*repr == NULL) {
+ /* Create string object */
+ *repr = PyString_FromStringAndSize(NULL, mbcssize);
+ if (*repr == NULL)
+ return -1;
+ }
+ else {
+ /* Extend string object */
+ n = PyString_Size(*repr);
+ if (_PyString_Resize(repr, n + mbcssize) < 0)
+ return -1;
+ }
/* Do the conversion */
- s = PyString_AS_STRING(repr);
- assert(size < INT_MAX);
- if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
- Py_DECREF(repr);
- return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ if (size > 0) {
+ char *s = PyString_AS_STRING(*repr) + n;
+ if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+ Py_ssize_t size,
+ const char *errors)
+{
+ PyObject *repr = NULL;
+ int ret;
+
+#ifdef NEED_RETRY
+ retry:
+ if (size > INT_MAX)
+ ret = encode_mbcs(&repr, p, INT_MAX);
+ else
+#endif
+ ret = encode_mbcs(&repr, p, (int)size);
+
+ if (ret < 0) {
+ Py_XDECREF(repr);
+ return NULL;
}
+
+#ifdef NEED_RETRY
+ if (size > INT_MAX) {
+ p += INT_MAX;
+ size -= INT_MAX;
+ goto retry;
+ }
+#endif
+
return repr;
}
@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
NULL);
}
+#undef NEED_RETRY
+
#endif /* MS_WINDOWS */
/* --- Character Mapping Codec -------------------------------------------- */
@@ -4491,11 +4627,11 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
/* Make sure we have enough space for the separator and the item. */
itemlen = PyUnicode_GET_SIZE(item);
new_res_used = res_used + itemlen;
- if (new_res_used <= 0)
+ if (new_res_used < 0)
goto Overflow;
if (i < seqlen - 1) {
new_res_used += seplen;
- if (new_res_used <= 0)
+ if (new_res_used < 0)
goto Overflow;
}
if (new_res_used > res_alloc) {
@@ -4536,7 +4672,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
Overflow:
PyErr_SetString(PyExc_OverflowError,
- "join() is too long for a Python string");
+ "join() result is too long for a Python string");
Py_DECREF(item);
/* fall through */
@@ -6667,29 +6803,44 @@ PyDoc_STRVAR(startswith__doc__,
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
-With optional end, stop comparing S at that position.");
+With optional end, stop comparing S at that position.\n\
+prefix can also be a tuple of strings to try.");
static PyObject *
unicode_startswith(PyUnicodeObject *self,
PyObject *args)
{
+ PyObject *subobj;
PyUnicodeObject *substring;
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
- PyObject *result;
+ int result;
- if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
+ if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
return NULL;
- substring = (PyUnicodeObject *)PyUnicode_FromObject(
- (PyObject *)substring);
+ if (PyTuple_Check(subobj)) {
+ Py_ssize_t i;
+ for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
+ substring = (PyUnicodeObject *)PyUnicode_FromObject(
+ PyTuple_GET_ITEM(subobj, i));
+ if (substring == NULL)
+ return NULL;
+ result = tailmatch(self, substring, start, end, -1);
+ Py_DECREF(substring);
+ if (result) {
+ Py_RETURN_TRUE;
+ }
+ }
+ /* nothing matched */
+ Py_RETURN_FALSE;
+ }
+ substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
if (substring == NULL)
- return NULL;
-
- result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
-
+ return NULL;
+ result = tailmatch(self, substring, start, end, -1);
Py_DECREF(substring);
- return result;
+ return PyBool_FromLong(result);
}
@@ -6698,29 +6849,44 @@ PyDoc_STRVAR(endswith__doc__,
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
-With optional end, stop comparing S at that position.");
+With optional end, stop comparing S at that position.\n\
+suffix can also be a tuple of strings to try.");
static PyObject *
unicode_endswith(PyUnicodeObject *self,
PyObject *args)
{
+ PyObject *subobj;
PyUnicodeObject *substring;
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
- PyObject *result;
+ int result;
- if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
- _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
+ if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
+ _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
return NULL;
- substring = (PyUnicodeObject *)PyUnicode_FromObject(
- (PyObject *)substring);
+ if (PyTuple_Check(subobj)) {
+ Py_ssize_t i;
+ for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
+ substring = (PyUnicodeObject *)PyUnicode_FromObject(
+ PyTuple_GET_ITEM(subobj, i));
+ if (substring == NULL)
+ return NULL;
+ result = tailmatch(self, substring, start, end, +1);
+ Py_DECREF(substring);
+ if (result) {
+ Py_RETURN_TRUE;
+ }
+ }
+ Py_RETURN_FALSE;
+ }
+ substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
if (substring == NULL)
- return NULL;
-
- result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
+ return NULL;
+ result = tailmatch(self, substring, start, end, +1);
Py_DECREF(substring);
- return result;
+ return PyBool_FromLong(result);
}
@@ -7748,6 +7914,9 @@ void _PyUnicode_Init(void)
unicode_freelist = NULL;
unicode_freelist_size = 0;
unicode_empty = _PyUnicode_New(0);
+ if (!unicode_empty)
+ return;
+
strcpy(unicode_default_encoding, "ascii");
for (i = 0; i < 256; i++)
unicode_latin1[i] = NULL;
@@ -7758,6 +7927,8 @@ void _PyUnicode_Init(void)
bloom_linebreak = make_bloom_mask(
linebreak, sizeof(linebreak) / sizeof(linebreak[0])
);
+
+ PyType_Ready(&EncodingMapType);
}
/* Finalize the Unicode implementation */