summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGuido van Rossum <guido@python.org>2000-03-24 22:14:19 (GMT)
committerGuido van Rossum <guido@python.org>2000-03-24 22:14:19 (GMT)
commitd8855fde885ffcd9956352edb75674f38c64acaa (patch)
treee956abb92678c85ffb8674c9a49d1fb7e8459140
parent27fc3c05e14d8b876bf0577225d509cbde45bfe0 (diff)
downloadcpython-d8855fde885ffcd9956352edb75674f38c64acaa.zip
cpython-d8855fde885ffcd9956352edb75674f38c64acaa.tar.gz
cpython-d8855fde885ffcd9956352edb75674f38c64acaa.tar.bz2
Marc-Andre Lemburg:
Attached you find the latest update of the Unicode implementation. The patch is against the current CVS version. It includes the fix I posted yesterday for the core dump problem in codecs.c (was introduced by my previous patch set -- sorry), adds more tests for the codecs and two new parser markers "es" and "es#".
-rw-r--r--Lib/codecs.py2
-rw-r--r--Lib/test/output/test_unicode1
-rw-r--r--Lib/test/test_unicode.py30
-rw-r--r--Misc/unicode.txt114
-rw-r--r--Python/getargs.c118
5 files changed, 259 insertions, 6 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 7f478d7..c09f804 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -46,7 +46,7 @@ class Codec:
handling schemes by providing the errors argument. These
string values are defined:
- 'strict' - raise an error (or a subclass)
+ 'strict' - raise a ValueError (or a subclass)
'ignore' - ignore the character and continue with the next
'replace' - replace with a suitable replacement character;
Python will use the official U+FFFD REPLACEMENT
diff --git a/Lib/test/output/test_unicode b/Lib/test/output/test_unicode
index 382a631..1ec9031 100644
--- a/Lib/test/output/test_unicode
+++ b/Lib/test/output/test_unicode
@@ -1,5 +1,4 @@
test_unicode
Testing Unicode comparisons... done.
-Testing Unicode contains method... done.
Testing Unicode formatting strings... done.
Testing unicodedata module... done.
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 69d4273..3d15f22 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -293,3 +293,33 @@ else:
assert unicodedata.combining(u'\u20e1') == 230
print 'done.'
+
+# Test builtin codecs
+print 'Testing builtin codecs...',
+
+assert unicode('hello','ascii') == u'hello'
+assert unicode('hello','utf-8') == u'hello'
+assert unicode('hello','utf8') == u'hello'
+assert unicode('hello','latin-1') == u'hello'
+
+assert u'hello'.encode('ascii') == 'hello'
+assert u'hello'.encode('utf-8') == 'hello'
+assert u'hello'.encode('utf8') == 'hello'
+assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
+assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
+assert u'hello'.encode('latin-1') == 'hello'
+
+u = u''.join(map(unichr, range(1024)))
+for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
+ 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
+ assert unicode(u.encode(encoding),encoding) == u
+
+u = u''.join(map(unichr, range(256)))
+for encoding in ('latin-1',):
+ assert unicode(u.encode(encoding),encoding) == u
+
+u = u''.join(map(unichr, range(128)))
+for encoding in ('ascii',):
+ assert unicode(u.encode(encoding),encoding) == u
+
+print 'done.'
diff --git a/Misc/unicode.txt b/Misc/unicode.txt
index 9a4832a..fc1f2c5 100644
--- a/Misc/unicode.txt
+++ b/Misc/unicode.txt
@@ -715,21 +715,126 @@ Internal Argument Parsing:
These markers are used by the PyArg_ParseTuple() APIs:
- 'U': Check for Unicode object and return a pointer to it
+ "U": Check for Unicode object and return a pointer to it
- 's': For Unicode objects: auto convert them to the <default encoding>
+ "s": For Unicode objects: auto convert them to the <default encoding>
and return a pointer to the object's <defencstr> buffer.
- 's#': Access to the Unicode object via the bf_getreadbuf buffer interface
+ "s#": Access to the Unicode object via the bf_getreadbuf buffer interface
(see Buffer Interface); note that the length relates to the buffer
length, not the Unicode string length (this may be different
depending on the Internal Format).
- 't#': Access to the Unicode object via the bf_getcharbuf buffer interface
+ "t#": Access to the Unicode object via the bf_getcharbuf buffer interface
(see Buffer Interface); note that the length relates to the buffer
length, not necessarily to the Unicode string length (this may
be different depending on the <default encoding>).
+ "es":
+ Takes two parameters: encoding (const char *) and
+ buffer (char **).
+
+ The input object is first coerced to Unicode in the usual way
+ and then encoded into a string using the given encoding.
+
+ On output, a buffer of the needed size is allocated and
+ returned through *buffer as a NULL-terminated string.
+ The encoded string may not contain embedded NULL characters.
+ The caller is responsible for free()ing the allocated *buffer
+ after usage.
+
+ "es#":
+ Takes three parameters: encoding (const char *),
+ buffer (char **) and buffer_len (int *).
+
+ The input object is first coerced to Unicode in the usual way
+ and then encoded into a string using the given encoding.
+
+ If *buffer is non-NULL, *buffer_len must be set to the size of that
+ buffer in bytes on input; the buffer must have room for the encoded
+ string plus the trailing NULL-byte. Output is then copied to *buffer.
+
+ If *buffer is NULL, a buffer of the needed size is
+ allocated and output copied into it. *buffer is then
+ updated to point to the allocated memory area. The caller
+ is responsible for free()ing *buffer after usage.
+
+ In both cases *buffer_len is updated to the number of
+ characters written (excluding the trailing NULL-byte).
+ The output buffer is assured to be NULL-terminated.
+
+Examples:
+
+Using "es#" with auto-allocation:
+
+ static PyObject *
+ test_parser(PyObject *self,
+ PyObject *args)
+ {
+ PyObject *str;
+ const char *encoding = "latin-1";
+ char *buffer = NULL;
+ int buffer_len = 0;
+
+ if (!PyArg_ParseTuple(args, "es#:test_parser",
+ encoding, &buffer, &buffer_len))
+ return NULL;
+ if (!buffer) {
+ PyErr_SetString(PyExc_SystemError,
+ "buffer is NULL");
+ return NULL;
+ }
+ str = PyString_FromStringAndSize(buffer, buffer_len);
+ free(buffer);
+ return str;
+ }
+
+Using "es" with auto-allocation returning a NULL-terminated string:
+
+ static PyObject *
+ test_parser(PyObject *self,
+ PyObject *args)
+ {
+ PyObject *str;
+ const char *encoding = "latin-1";
+ char *buffer = NULL;
+
+ if (!PyArg_ParseTuple(args, "es:test_parser",
+ encoding, &buffer))
+ return NULL;
+ if (!buffer) {
+ PyErr_SetString(PyExc_SystemError,
+ "buffer is NULL");
+ return NULL;
+ }
+ str = PyString_FromString(buffer);
+ free(buffer);
+ return str;
+ }
+
+Using "es#" with a pre-allocated buffer:
+
+ static PyObject *
+ test_parser(PyObject *self,
+ PyObject *args)
+ {
+ PyObject *str;
+ const char *encoding = "latin-1";
+ char _buffer[10];
+ char *buffer = _buffer;
+ int buffer_len = sizeof(_buffer);
+
+ if (!PyArg_ParseTuple(args, "es#:test_parser",
+ encoding, &buffer, &buffer_len))
+ return NULL;
+ if (!buffer) {
+ PyErr_SetString(PyExc_SystemError,
+ "buffer is NULL");
+ return NULL;
+ }
+ str = PyString_FromStringAndSize(buffer, buffer_len);
+ return str;
+ }
+
File/Stream Output:
-------------------
@@ -837,6 +942,7 @@ Encodings:
History of this Proposal:
-------------------------
+1.3: Added new "es" and "es#" parser markers
1.2: Removed POD about codecs.open()
1.1: Added note about comparisons and hash values. Added note about
case mapping algorithms. Changed stream codecs .read() and
diff --git a/Python/getargs.c b/Python/getargs.c
index 4617d05..a4b0fe4 100644
--- a/Python/getargs.c
+++ b/Python/getargs.c
@@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
}
else if (level != 0)
; /* Pass */
+ else if (c == 'e')
+ ; /* Pass */
else if (isalpha(c))
max++;
else if (c == '|')
@@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
break;
}
+ case 'e': /* encoded string */
+ {
+ char **buffer;
+ const char *encoding;
+ PyObject *u, *s;
+ int size;
+
+ /* Get 'e' parameter: the encoding name */
+ encoding = (const char *)va_arg(*p_va, const char *);
+ if (encoding == NULL)
+ return "(encoding is NULL)";
+
+ /* Get 's' parameter: the output buffer to use */
+ if (*format != 's')
+ return "(unkown parser marker combination)";
+ buffer = (char **)va_arg(*p_va, char **);
+ format++;
+ if (buffer == NULL)
+ return "(buffer is NULL)";
+
+ /* Convert object to Unicode */
+ u = PyUnicode_FromObject(arg);
+ if (u == NULL)
+ return "string, unicode or text buffer";
+
+ /* Encode object; use default error handling */
+ s = PyUnicode_AsEncodedString(u,
+ encoding,
+ NULL);
+ Py_DECREF(u);
+ if (s == NULL)
+ return "(encoding failed)";
+ if (!PyString_Check(s)) {
+ Py_DECREF(s);
+ return "(encoder failed to return a string)";
+ }
+ size = PyString_GET_SIZE(s);
+
+ /* Write output; output is guaranteed to be
+ 0-terminated */
+ if (*format == '#') {
+ /* Using buffer length parameter '#':
+
+ - if *buffer is NULL, a new buffer
+ of the needed size is allocated and
+ the data copied into it; *buffer is
+ updated to point to the new buffer;
+ the caller is responsible for
+ free()ing it after usage
+
+ - if *buffer is not NULL, the data
+ is copied to *buffer; *buffer_len
+ has to be set to the size of the
+ buffer on input; buffer overflow is
+ signalled with an error; buffer has
+ to provide enough room for the
+ encoded string plus the trailing
+ 0-byte
+
+ - in both cases, *buffer_len is
+ updated to the size of the buffer
+ /excluding/ the trailing 0-byte
+
+ */
+ int *buffer_len = va_arg(*p_va, int *);
+
+ format++;
+ if (buffer_len == NULL)
+ return "(buffer_len is NULL)";
+ if (*buffer == NULL) {
+ *buffer = PyMem_NEW(char, size + 1);
+ if (*buffer == NULL) {
+ Py_DECREF(s);
+ return "(memory error)";
+ }
+ } else {
+ if (size + 1 > *buffer_len) {
+ Py_DECREF(s);
+ return "(buffer overflow)";
+ }
+ }
+ memcpy(*buffer,
+ PyString_AS_STRING(s),
+ size + 1);
+ *buffer_len = size;
+ } else {
+ /* Using a 0-terminated buffer:
+
+ - the encoded string has to be
+ 0-terminated for this variant to
+ work; if it is not, an error raised
+
+ - a new buffer of the needed size
+ is allocated and the data copied
+ into it; *buffer is updated to
+ point to the new buffer; the caller
+ is responsible for free()ing it
+ after usage
+
+ */
+ if (strlen(PyString_AS_STRING(s)) != size)
+ return "(encoded string without "\
+ "NULL bytes)";
+ *buffer = PyMem_NEW(char, size + 1);
+ if (*buffer == NULL) {
+ Py_DECREF(s);
+ return "(memory error)";
+ }
+ memcpy(*buffer,
+ PyString_AS_STRING(s),
+ size + 1);
+ }
+ Py_DECREF(s);
+ break;
+ }
+
case 'S': /* string object */
{
PyObject **p = va_arg(*p_va, PyObject **);