M.-A. Lemburg <mal@lemburg.com>:

Added support for user settable default encodings. The current implementation uses a per-process global which defines the value of the encoding parameter in case it is set to NULL (meaning: use the default encoding).
author: Fred Drake <fdrake@acm.org> 2000-05-09 19:53:39 (GMT)
committer: Fred Drake <fdrake@acm.org> 2000-05-09 19:53:39 (GMT)
commit: e4315f58d212f080fe0ff132ff7bcfab5e862a75 (patch)
tree: b6b7b4d80c534033874daaefbbd505af62bb40e3 /Objects/unicodeobject.c
parent: aff601804d0b100815d1bdfdb2cf5778385a912c (diff)
download: cpython-e4315f58d212f080fe0ff132ff7bcfab5e862a75.zip
cpython-e4315f58d212f080fe0ff132ff7bcfab5e862a75.tar.gz
cpython-e4315f58d212f080fe0ff132ff7bcfab5e862a75.tar.bz2
1 files changed, 71 insertions, 20 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e00a9b8..dd21a5f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
 static PyUnicodeObject *unicode_freelist = NULL;
 static int unicode_freelist_size = 0;
 
+/* Default encoding to use and assume when NULL is passed as encoding
+   parameter; it is initialized by _PyUnicode_Init().
+
+   Always use the PyUnicode_SetDefaultEncoding() and
+   PyUnicode_GetDefaultEncoding() APIs to access this global.
+   The array is a fixed 100 bytes; a name and its terminating NUL
+   must fit into that space. */
+
+static char unicode_default_encoding[100];
+
 /* --- Unicode Object ----------------------------------------------------- */
 
 static
@@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
 	Py_INCREF(unicode_empty);
 	return (PyObject *)unicode_empty;
     }
-    return PyUnicode_DecodeUTF8(s, len, "strict");
+    return PyUnicode_Decode(s, len, NULL, "strict");
 }
 
 PyObject *PyUnicode_Decode(const char *s,
@@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
 {
     PyObject *buffer = NULL, *unicode;
     
-    /* Shortcut for the default encoding UTF-8 */
-    if (encoding == NULL || 
-        (strcmp(encoding, "utf-8") == 0))
+    if (encoding == NULL) 
+	encoding = PyUnicode_GetDefaultEncoding();
+
+    /* Shortcuts for common default encodings */
+    if (strcmp(encoding, "utf-8") == 0)
         return PyUnicode_DecodeUTF8(s, size, errors);
+    else if (strcmp(encoding, "latin-1") == 0)
+        return PyUnicode_DecodeLatin1(s, size, errors);
+    else if (strcmp(encoding, "ascii") == 0)
+        return PyUnicode_DecodeASCII(s, size, errors);
 
     /* Decode via the codec registry */
     buffer = PyBuffer_FromMemory((void *)s, size);
@@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
         PyErr_BadArgument();
         goto onError;
     }
-    /* Shortcut for the default encoding UTF-8 */
-    if ((encoding == NULL || 
-	 (strcmp(encoding, "utf-8") == 0)) &&
-	errors == NULL)
+
+    if (encoding == NULL) 
+	encoding = PyUnicode_GetDefaultEncoding();
+
+    /* Shortcuts for common default encodings */
+    if (errors == NULL) {
+	if (strcmp(encoding, "utf-8") == 0)
         return PyUnicode_AsUTF8String(unicode);
+	else if (strcmp(encoding, "latin-1") == 0)
+	    return PyUnicode_AsLatin1String(unicode);
+	else if (strcmp(encoding, "ascii") == 0)
+	    return PyUnicode_AsASCIIString(unicode);
+    }
 
     /* Encode via the codec registry */
     v = PyCodec_Encode(unicode, encoding, errors);
@@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
     return -1;
 }
 
+const char *PyUnicode_GetDefaultEncoding(void)
+{
+    return unicode_default_encoding; /* name lives in a static buffer */
+}
+
+/* Set the default encoding; returns 0, or -1 if the codec lookup fails. */
+int PyUnicode_SetDefaultEncoding(const char *encoding)
+{
+    PyObject *v;
+
+    /* Make sure the encoding is valid. As side effect, this also
+       loads the encoding into the codec registry cache. */
+    v = _PyCodec_Lookup(encoding);
+    if (v == NULL)
+	return -1;
+    Py_DECREF(v);
+    /* strncpy() does not NUL-terminate when the source fills the
+       buffer, so copy at most size-1 bytes and terminate explicitly. */
+    strncpy(unicode_default_encoding, encoding,
+	    sizeof(unicode_default_encoding) - 1);
+    unicode_default_encoding[sizeof(unicode_default_encoding) - 1] = '\0';
+    return 0;
+}
+
 /* --- UTF-8 Codec -------------------------------------------------------- */
 
 static 
@@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
     }
     else {
         PyErr_Format(PyExc_ValueError,
-                     "UTF-16 decoding error; unknown error handling code: %.400s",
+                     "UTF-16 decoding error; "
+		     "unknown error handling code: %.400s",
                      errors);
         return -1;
     }
@@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
 static char encode__doc__[] =
 "S.encode([encoding[,errors]]) -> string\n\
 \n\
-Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
-errors may be given to set a different error handling scheme. Default\n\
-is 'strict' meaning that encoding errors raise a ValueError. Other\n\
-possible values are 'ignore' and 'replace'.";
+Return an encoded string version of S. Default encoding is the current\n\
+default string encoding. errors may be given to set a different error\n\
+handling scheme. Default is 'strict' meaning that encoding errors raise\n\
+a ValueError. Other possible values are 'ignore' and 'replace'.";
 
 static PyObject *
 unicode_encode(PyUnicodeObject *self, PyObject *args)
@@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
 static
 PyObject *unicode_str(PyUnicodeObject *self)
 {
-    return PyUnicode_AsUTF8String((PyObject *)self);
+    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
 }
 
 static char strip__doc__[] =
@@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
 	return NULL;
     }
     uformat = PyUnicode_FromObject(format);
+    if (uformat == NULL)
+	return NULL;
     fmt = PyUnicode_AS_UNICODE(uformat);
     fmtcnt = PyUnicode_GET_SIZE(uformat);
 
@@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
 				    "incomplete format key");
 		    goto onError;
 		}
-		/* keys are converted to strings (using UTF-8) and
+		/* keys are converted to strings using UTF-8 and
 		   then looked up since Python uses strings to hold
 		   variables names etc. in its namespaces and we
-		   wouldn't want to break common idioms.  The
-		   alternative would be using Unicode objects for the
-		   lookup but u"abc" and "abc" have different hash
-		   values (on purpose). */
+		   wouldn't want to break common idioms. */
 		key = PyUnicode_EncodeUTF8(keystart,
 					   keylen,
 					   NULL);
@@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
 					"%s argument has non-string str()");
 			goto onError;
 		    }
-		    unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
+		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
 						   PyString_GET_SIZE(temp),
+					       NULL,
 						   "strict");
 		    Py_DECREF(temp);
 		    temp = unicode;
@@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
         Py_FatalError("Unicode configuration error: "
 		      "sizeof(Py_UNICODE) != 2 bytes");
 
+    /* Init the implementation */
     unicode_empty = _PyUnicode_New(0);
+    strcpy(unicode_default_encoding, "utf-8");
 }
 
 /* Finalize the Unicode implementation */
author	Fred Drake <fdrake@acm.org>	2000-05-09 19:53:39 (GMT)
committer	Fred Drake <fdrake@acm.org>	2000-05-09 19:53:39 (GMT)
commit	e4315f58d212f080fe0ff132ff7bcfab5e862a75 (patch)
tree	b6b7b4d80c534033874daaefbbd505af62bb40e3 /Objects/unicodeobject.c
parent	aff601804d0b100815d1bdfdb2cf5778385a912c (diff)
download	cpython-e4315f58d212f080fe0ff132ff7bcfab5e862a75.zip cpython-e4315f58d212f080fe0ff132ff7bcfab5e862a75.tar.gz cpython-e4315f58d212f080fe0ff132ff7bcfab5e862a75.tar.bz2