From 24bdb0474fca186da95dc045f157074e4d57c6b6 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 28 Mar 2000 20:29:59 +0000 Subject: Marc-Andre Lemburg: The attached patch set includes a workaround to get Python with Unicode compile on BSDI 4.x (courtesy Thomas Wouters; the cause is a bug in the BSDI wchar.h header file) and Python interfaces for the MBCS codec donated by Mark Hammond. Also included are some minor corrections w/r to the docs of the new "es" and "es#" parser markers (use PyMem_Free() instead of free(); thanks to Mark Hammond for finding these). The unicodedata tests are now in a separate file (test_unicodedata.py) to avoid problems if the module cannot be found. --- Include/unicodeobject.h | 8 ++++++- Lib/encodings/mbcs.py | 1 - Lib/test/output/test_unicode | 3 ++- Lib/test/output/test_unicodedata | 2 ++ Lib/test/test_unicode.py | 45 ------------------------------------ Lib/test/test_unicodedata.py | 50 ++++++++++++++++++++++++++++++++++++++++ Misc/unicode.txt | 13 ++++++----- Modules/_codecsmodule.c | 46 ++++++++++++++++++++++++++++++++++++ Python/getargs.c | 4 ++-- 9 files changed, 116 insertions(+), 56 deletions(-) create mode 100644 Lib/test/output/test_unicodedata create mode 100644 Lib/test/test_unicodedata.py diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index e9e60d8..cfc8126 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -82,6 +82,10 @@ Unicode Integration Proposal (see file Misc/unicode.txt). 
#endif #ifdef HAVE_WCHAR_H +/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ +# ifdef _HAVE_BSDI +# include <time.h> +# endif # include "wchar.h" #endif @@ -562,7 +566,9 @@ extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap( ); #ifdef MS_WIN32 + /* --- MBCS codecs for Windows -------------------------------------------- */ + extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS( const char *string, /* MBCS encoded string */ int length, /* size of string */ @@ -579,8 +585,8 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( const char *errors /* error handling */ ); - #endif /* MS_WIN32 */ + /* --- Methods & Slots ---------------------------------------------------- These are capable of handling Unicode objects and strings on input diff --git a/Lib/encodings/mbcs.py b/Lib/encodings/mbcs.py index b7fafbd..5103980 100644 --- a/Lib/encodings/mbcs.py +++ b/Lib/encodings/mbcs.py @@ -34,4 +34,3 @@ class StreamConverter(StreamWriter,StreamReader): def getregentry(): return (Codec.encode,Codec.decode,StreamReader,StreamWriter) - diff --git a/Lib/test/output/test_unicode b/Lib/test/output/test_unicode index 1ec9031..88e8624 100644 --- a/Lib/test/output/test_unicode +++ b/Lib/test/output/test_unicode @@ -1,4 +1,5 @@ test_unicode Testing Unicode comparisons... done. +Testing Unicode contains method... done. Testing Unicode formatting strings... done. -Testing unicodedata module... done. +Testing builtin codecs... done. diff --git a/Lib/test/output/test_unicodedata b/Lib/test/output/test_unicodedata new file mode 100644 index 0000000..fc9562f --- /dev/null +++ b/Lib/test/output/test_unicodedata @@ -0,0 +1,2 @@ +test_unicodedata +Testing unicodedata module... done. diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 3d15f22..f90887a 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1,6 +1,5 @@ """ Test script for the Unicode implementation. - Written by Marc-Andre Lemburg (mal@lemburg.com). 
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. @@ -250,50 +249,6 @@ assert u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def' assert u"%(x)s, %(ä)s" % {'x':u"abc", u'ä'.encode('utf-8'):"def"} == u'abc, def' print 'done.' -# Test Unicode database APIs -try: - import unicodedata -except ImportError: - pass -else: - print 'Testing unicodedata module...', - - assert unicodedata.digit(u'A',None) is None - assert unicodedata.digit(u'9') == 9 - assert unicodedata.digit(u'\u215b',None) is None - assert unicodedata.digit(u'\u2468') == 9 - - assert unicodedata.numeric(u'A',None) is None - assert unicodedata.numeric(u'9') == 9 - assert unicodedata.numeric(u'\u215b') == 0.125 - assert unicodedata.numeric(u'\u2468') == 9.0 - - assert unicodedata.decimal(u'A',None) is None - assert unicodedata.decimal(u'9') == 9 - assert unicodedata.decimal(u'\u215b',None) is None - assert unicodedata.decimal(u'\u2468',None) is None - - assert unicodedata.category(u'\uFFFE') == 'Cn' - assert unicodedata.category(u'a') == 'Ll' - assert unicodedata.category(u'A') == 'Lu' - - assert unicodedata.bidirectional(u'\uFFFE') == '' - assert unicodedata.bidirectional(u' ') == 'WS' - assert unicodedata.bidirectional(u'A') == 'L' - - assert unicodedata.decomposition(u'\uFFFE') == '' - assert unicodedata.decomposition(u'\u00bc') == ' 0031 2044 0034' - - assert unicodedata.mirrored(u'\uFFFE') == 0 - assert unicodedata.mirrored(u'a') == 0 - assert unicodedata.mirrored(u'\u2201') == 1 - - assert unicodedata.combining(u'\uFFFE') == 0 - assert unicodedata.combining(u'a') == 0 - assert unicodedata.combining(u'\u20e1') == 230 - - print 'done.' - # Test builtin codecs print 'Testing builtin codecs...', diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py new file mode 100644 index 0000000..6ddd077 --- /dev/null +++ b/Lib/test/test_unicodedata.py @@ -0,0 +1,50 @@ +""" Test script for the unicodedata module. + +Written by Marc-Andre Lemburg (mal@lemburg.com). 
+ +(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. + +"""#" +from test_support import verbose +import sys + +# Test Unicode database APIs +import unicodedata + +print 'Testing unicodedata module...', + +assert unicodedata.digit(u'A',None) is None +assert unicodedata.digit(u'9') == 9 +assert unicodedata.digit(u'\u215b',None) is None +assert unicodedata.digit(u'\u2468') == 9 + +assert unicodedata.numeric(u'A',None) is None +assert unicodedata.numeric(u'9') == 9 +assert unicodedata.numeric(u'\u215b') == 0.125 +assert unicodedata.numeric(u'\u2468') == 9.0 + +assert unicodedata.decimal(u'A',None) is None +assert unicodedata.decimal(u'9') == 9 +assert unicodedata.decimal(u'\u215b',None) is None +assert unicodedata.decimal(u'\u2468',None) is None + +assert unicodedata.category(u'\uFFFE') == 'Cn' +assert unicodedata.category(u'a') == 'Ll' +assert unicodedata.category(u'A') == 'Lu' + +assert unicodedata.bidirectional(u'\uFFFE') == '' +assert unicodedata.bidirectional(u' ') == 'WS' +assert unicodedata.bidirectional(u'A') == 'L' + +assert unicodedata.decomposition(u'\uFFFE') == '' +assert unicodedata.decomposition(u'\u00bc') == ' 0031 2044 0034' + +assert unicodedata.mirrored(u'\uFFFE') == 0 +assert unicodedata.mirrored(u'a') == 0 +assert unicodedata.mirrored(u'\u2201') == 1 + +assert unicodedata.combining(u'\uFFFE') == 0 +assert unicodedata.combining(u'a') == 0 +assert unicodedata.combining(u'\u20e1') == 230 + +print 'done.' diff --git a/Misc/unicode.txt b/Misc/unicode.txt index fc1f2c5..ce74c05 100644 --- a/Misc/unicode.txt +++ b/Misc/unicode.txt @@ -740,8 +740,8 @@ These markers are used by the PyArg_ParseTuple() APIs: On output, a buffer of the needed size is allocated and returned through *buffer as NULL-terminated string. The encoded may not contain embedded NULL characters. - The caller is responsible for free()ing the allocated *buffer - after usage. + The caller is responsible for calling PyMem_Free() + to free the allocated *buffer after usage. 
"es#": Takes three parameters: encoding (const char *), @@ -755,8 +755,9 @@ These markers are used by the PyArg_ParseTuple() APIs: If *buffer is NULL, a buffer of the needed size is allocated and output copied into it. *buffer is then - updated to point to the allocated memory area. The caller - is responsible for free()ing *buffer after usage. + updated to point to the allocated memory area. + The caller is responsible for calling PyMem_Free() + to free the allocated *buffer after usage. In both cases *buffer_len is updated to the number of characters written (excluding the trailing NULL-byte). @@ -784,7 +785,7 @@ Using "es#" with auto-allocation: return NULL; } str = PyString_FromStringAndSize(buffer, buffer_len); - free(buffer); + PyMem_Free(buffer); return str; } @@ -807,7 +808,7 @@ Using "es" with auto-allocation returning a NULL-terminated string: return NULL; } str = PyString_FromString(buffer); - free(buffer); + PyMem_Free(buffer); return str; } diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 6c8a2d4..4f368f8 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -286,6 +286,26 @@ charmap_decode(PyObject *self, size); } +#ifdef MS_WIN32 + +static PyObject * +mbcs_decode(PyObject *self, + PyObject *args) +{ + const char *data; + int size; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode", + &data, &size, &errors)) + return NULL; + + return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors), + size); +} + +#endif /* MS_WIN32 */ + /* --- Encoder ------------------------------------------------------------ */ static PyObject * @@ -491,6 +511,28 @@ charmap_encode(PyObject *self, PyUnicode_GET_SIZE(str)); } +#ifdef MS_WIN32 + +static PyObject * +mbcs_encode(PyObject *self, + PyObject *args) +{ + PyObject *str; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "U|z:mbcs_encode", + &str, &errors)) + return NULL; + + return codec_tuple(PyUnicode_EncodeMBCS( + PyUnicode_AS_UNICODE(str), + 
PyUnicode_GET_SIZE(str), + errors), + PyUnicode_GET_SIZE(str)); +} + +#endif /* MS_WIN32 */ + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -519,6 +561,10 @@ static PyMethodDef _codecs_functions[] = { {"charmap_decode", charmap_decode, 1}, {"readbuffer_encode", readbuffer_encode, 1}, {"charbuffer_encode", charbuffer_encode, 1}, +#ifdef MS_WIN32 + {"mbcs_encode", mbcs_encode, 1}, + {"mbcs_decode", mbcs_decode, 1}, +#endif {NULL, NULL} /* sentinel */ }; diff --git a/Python/getargs.c b/Python/getargs.c index 27a69d0..91fe267 100644 --- a/Python/getargs.c +++ b/Python/getargs.c @@ -704,7 +704,7 @@ convertsimple1(arg, p_format, p_va) the data copied into it; *buffer is updated to point to the new buffer; the caller is responsible for - free()ing it after usage + PyMem_Free()ing it after usage - if *buffer is not NULL, the data is copied to *buffer; *buffer_len @@ -752,7 +752,7 @@ convertsimple1(arg, p_format, p_va) is allocated and the data copied into it; *buffer is updated to point to the new buffer; the caller - is responsible for free()ing it + is responsible for PyMem_Free()ing it after usage */ -- cgit v0.12