summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c91
1 files changed, 71 insertions, 20 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e00a9b8..dd21a5f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
static PyUnicodeObject *unicode_freelist = NULL;
static int unicode_freelist_size = 0;
+/* Default encoding to use and assume when NULL is passed as the encoding
+ parameter; it is initialized by _PyUnicode_Init().
+
+ Always use the PyUnicode_SetDefaultEncoding() and
+ PyUnicode_GetDefaultEncoding() APIs to access this global.
+
+*/
+
+static char unicode_default_encoding[100];
+
/* --- Unicode Object ----------------------------------------------------- */
static
@@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
Py_INCREF(unicode_empty);
return (PyObject *)unicode_empty;
}
- return PyUnicode_DecodeUTF8(s, len, "strict");
+ return PyUnicode_Decode(s, len, NULL, "strict");
}
PyObject *PyUnicode_Decode(const char *s,
@@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
{
PyObject *buffer = NULL, *unicode;
- /* Shortcut for the default encoding UTF-8 */
- if (encoding == NULL ||
- (strcmp(encoding, "utf-8") == 0))
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
+
+ /* Shortcuts for common default encodings */
+ if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_DecodeUTF8(s, size, errors);
+ else if (strcmp(encoding, "latin-1") == 0)
+ return PyUnicode_DecodeLatin1(s, size, errors);
+ else if (strcmp(encoding, "ascii") == 0)
+ return PyUnicode_DecodeASCII(s, size, errors);
/* Decode via the codec registry */
buffer = PyBuffer_FromMemory((void *)s, size);
@@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
PyErr_BadArgument();
goto onError;
}
- /* Shortcut for the default encoding UTF-8 */
- if ((encoding == NULL ||
- (strcmp(encoding, "utf-8") == 0)) &&
- errors == NULL)
+
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
+
+ /* Shortcuts for common default encodings */
+ if (errors == NULL) {
+ if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_AsUTF8String(unicode);
+ else if (strcmp(encoding, "latin-1") == 0)
+ return PyUnicode_AsLatin1String(unicode);
+ else if (strcmp(encoding, "ascii") == 0)
+ return PyUnicode_AsASCIIString(unicode);
+ }
/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
@@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
return -1;
}
+/* Return the name of the current default encoding.
+
+   The result points directly at the file-static
+   unicode_default_encoding buffer: callers must not modify or free it,
+   and its contents change when PyUnicode_SetDefaultEncoding() stores a
+   new name. */
+const char *PyUnicode_GetDefaultEncoding(void)
+{
+    return unicode_default_encoding;
+}
+
+/* Set the default encoding used when NULL is passed as the encoding
+   parameter.
+
+   The name is validated through the codec registry before it is
+   stored.  Returns 0 on success and -1 if the lookup fails, in which
+   case the current default encoding is left unchanged. */
+int PyUnicode_SetDefaultEncoding(const char *encoding)
+{
+    PyObject *v;
+
+    /* Make sure the encoding is valid. As a side effect, this also
+       loads the encoding into the codec registry cache. */
+    v = _PyCodec_Lookup(encoding);
+    if (v == NULL)
+        goto onError;
+    Py_DECREF(v);
+
+    /* strncpy() writes no terminating NUL when the source is at least
+       as long as the limit, so copy at most size - 1 bytes and
+       terminate explicitly.  Overlong names are truncated. */
+    strncpy(unicode_default_encoding,
+            encoding,
+            sizeof(unicode_default_encoding) - 1);
+    unicode_default_encoding[sizeof(unicode_default_encoding) - 1] = '\0';
+    return 0;
+
+ onError:
+    return -1;
+}
+
/* --- UTF-8 Codec -------------------------------------------------------- */
static
@@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
}
else {
PyErr_Format(PyExc_ValueError,
- "UTF-16 decoding error; unknown error handling code: %.400s",
+ "UTF-16 decoding error; "
+ "unknown error handling code: %.400s",
errors);
return -1;
}
@@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
static char encode__doc__[] =
"S.encode([encoding[,errors]]) -> string\n\
\n\
-Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
-errors may be given to set a different error handling scheme. Default\n\
-is 'strict' meaning that encoding errors raise a ValueError. Other\n\
-possible values are 'ignore' and 'replace'.";
+Return an encoded string version of S. Default encoding is the current\n\
+default string encoding. errors may be given to set a different error\n\
+handling scheme. Default is 'strict' meaning that encoding errors raise\n\
+a ValueError. Other possible values are 'ignore' and 'replace'.";
static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args)
@@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
static
PyObject *unicode_str(PyUnicodeObject *self)
{
- return PyUnicode_AsUTF8String((PyObject *)self);
+ return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); /* NULL encoding: use the current default encoding */
}
static char strip__doc__[] =
@@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
return NULL;
}
uformat = PyUnicode_FromObject(format);
+ if (uformat == NULL)
+ return NULL;
fmt = PyUnicode_AS_UNICODE(uformat);
fmtcnt = PyUnicode_GET_SIZE(uformat);
@@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
"incomplete format key");
goto onError;
}
- /* keys are converted to strings (using UTF-8) and
+ /* keys are converted to strings using UTF-8 and
then looked up since Python uses strings to hold
variables names etc. in its namespaces and we
- wouldn't want to break common idioms. The
- alternative would be using Unicode objects for the
- lookup but u"abc" and "abc" have different hash
- values (on purpose). */
+ wouldn't want to break common idioms. */
key = PyUnicode_EncodeUTF8(keystart,
keylen,
NULL);
@@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
"%s argument has non-string str()");
goto onError;
}
- unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
+ unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
PyString_GET_SIZE(temp),
+ NULL,
"strict");
Py_DECREF(temp);
temp = unicode;
@@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
Py_FatalError("Unicode configuration error: "
"sizeof(Py_UNICODE) != 2 bytes");
+ /* Init the implementation */
unicode_empty = _PyUnicode_New(0);
+ strcpy(unicode_default_encoding, "utf-8");
}
/* Finalize the Unicode implementation */