From 38337d1e1542eddd7f3f839535c8b440a3c90499 Mon Sep 17 00:00:00 2001 From: Larry Hastings Date: Thu, 7 May 2015 23:30:09 -0700 Subject: Issue #24000: Improved Argument Clinic's mapping of converters to legacy "format units". Updated the documentation to match. --- Doc/howto/clinic.rst | 70 +++++++++++++--------------- Misc/NEWS | 3 ++ Modules/_dbmmodule.c | 9 ++-- Modules/_gdbmmodule.c | 4 +- Modules/arraymodule.c | 4 +- Modules/unicodedata.c | 4 +- Tools/clinic/clinic.py | 123 +++++++++++++++++++++++++++++-------------------- 7 files changed, 119 insertions(+), 98 deletions(-) diff --git a/Doc/howto/clinic.rst b/Doc/howto/clinic.rst index ca8e1cb..cb208d5 100644 --- a/Doc/howto/clinic.rst +++ b/Doc/howto/clinic.rst @@ -758,6 +758,14 @@ All Argument Clinic converters accept the following arguments: In addition, some converters accept additional arguments. Here is a list of these arguments, along with their meanings: + ``accept`` + A set of Python types (and possibly pseudo-types); + this restricts the allowable Python argument to values of these types. + (This is not a general-purpose facility; as a rule it only supports + specific lists of types as shown in the legacy converter table.) + + To accept ``None``, add ``NoneType`` to this set. + ``bitwise`` Only supported for unsigned integers. The native integer value of this Python argument will be written to the parameter without any range checking, @@ -772,39 +780,27 @@ of these arguments, along with their meanings: Only supported for strings. Specifies the encoding to use when converting this string from a Python str (Unicode) value into a C ``char *`` value. - ``length`` - Only supported for strings. If true, requests that the length of the - string be passed in to the impl function, just after the string parameter, - in a parameter named ``_length``. - - ``nullable`` - Only supported for strings. If true, this parameter may also be set to - ``None``, in which case the C parameter will be set to ``NULL``. ``subclass_of`` Only supported for the ``object`` converter. Requires that the Python value be a subclass of a Python type, as expressed in C. - ``types`` - Only supported for the ``object`` (and ``self``) converter. Specifies + ``type`` + Only supported for the ``object`` and ``self`` converters. Specifies the C type that will be used to declare the variable. Default value is ``"PyObject *"``. - ``types`` - A string containing a list of Python types (and possibly pseudo-types); - this restricts the allowable Python argument to values of these types. - (This is not a general-purpose facility; as a rule it only supports - specific lists of types as shown in the legacy converter table.) - ``zeroes`` Only supported for strings. If true, embedded NUL bytes (``'\\0'``) are - permitted inside the value. + permitted inside the value. The length of the string will be passed in + to the impl function, just after the string parameter, as a parameter named + ``_length``. Please note, not every possible combination of arguments will work. -Often these arguments are implemented internally by specific ``PyArg_ParseTuple`` +Usually these arguments are implemented by specific ``PyArg_ParseTuple`` *format units*, with specific behavior. For example, currently you cannot -call ``str`` and pass in ``zeroes=True`` without also specifying an ``encoding``; -although it's perfectly reasonable to think this would work, these semantics don't +call ``unsigned_short`` without also specifying ``bitwise=True``. +Although it's perfectly reasonable to think this would work, these semantics don't map to any existing format unit. So Argument Clinic doesn't support it. (Or, at least, not yet.) @@ -816,13 +812,13 @@ on the right is the text you'd replace it with. ``'B'`` ``unsigned_char(bitwise=True)`` ``'b'`` ``unsigned_char`` ``'c'`` ``char`` -``'C'`` ``int(types='str')`` +``'C'`` ``int(accept={str})`` ``'d'`` ``double`` ``'D'`` ``Py_complex`` -``'es#'`` ``str(encoding='name_of_encoding', length=True, zeroes=True)`` ``'es'`` ``str(encoding='name_of_encoding')`` -``'et#'`` ``str(encoding='name_of_encoding', types='bytes bytearray str', length=True)`` -``'et'`` ``str(encoding='name_of_encoding', types='bytes bytearray str')`` +``'es#'`` ``str(encoding='name_of_encoding', zeroes=True)`` +``'et'`` ``str(encoding='name_of_encoding', accept={bytes, bytearray, str})`` +``'et#'`` ``str(encoding='name_of_encoding', accept={bytes, bytearray, str}, zeroes=True)`` ``'f'`` ``float`` ``'h'`` ``short`` ``'H'`` ``unsigned_short(bitwise=True)`` @@ -832,27 +828,27 @@ on the right is the text you'd replace it with. ``'K'`` ``unsigned_PY_LONG_LONG(bitwise=True)`` ``'L'`` ``PY_LONG_LONG`` ``'n'`` ``Py_ssize_t`` +``'O'`` ``object`` ``'O!'`` ``object(subclass_of='&PySomething_Type')`` ``'O&'`` ``object(converter='name_of_c_function')`` -``'O'`` ``object`` ``'p'`` ``bool`` -``'s#'`` ``str(length=True)`` ``'S'`` ``PyBytesObject`` ``'s'`` ``str`` -``'s*'`` ``Py_buffer(types='str bytes bytearray buffer')`` -``'u#'`` ``Py_UNICODE(length=True)`` -``'u'`` ``Py_UNICODE`` +``'s#'`` ``str(zeroes=True)`` +``'s*'`` ``Py_buffer(accept={buffer, str})`` ``'U'`` ``unicode`` -``'w*'`` ``Py_buffer(types='bytearray rwbuffer')`` -``'y#'`` ``str(types='bytes', length=True)`` +``'u'`` ``Py_UNICODE`` +``'u#'`` ``Py_UNICODE(zeroes=True)`` +``'w*'`` ``Py_buffer(accept={rwbuffer})`` ``'Y'`` ``PyByteArrayObject`` -``'y'`` ``str(types='bytes')`` +``'y'`` ``str(accept={bytes})`` +``'y#'`` ``str(accept={robuffer}, zeroes=True)`` ``'y*'`` ``Py_buffer`` -``'Z#'`` ``Py_UNICODE(nullable=True, length=True)`` -``'z#'`` ``str(nullable=True, length=True)`` -``'Z'`` ``Py_UNICODE(nullable=True)`` -``'z'`` ``str(nullable=True)`` -``'z*'`` ``Py_buffer(types='str bytes bytearray buffer', nullable=True)`` +``'Z'`` ``Py_UNICODE(accept={str, NoneType})`` +``'Z#'`` ``Py_UNICODE(accept={str, NoneType}, zeroes=True)`` +``'z'`` ``str(accept={str, NoneType})`` +``'z#'`` ``str(accept={str, NoneType}, zeroes=True)`` +``'z*'`` ``Py_buffer(accept={buffer, str, NoneType})`` ========= ================================================================================= As an example, here's our sample ``pickle.Pickler.dump`` using the proper diff --git a/Misc/NEWS b/Misc/NEWS index 336038b..f7f14b7 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -82,6 +82,9 @@ Documentation Tools/Demos ----------- +- Issue #24000: Improved Argument Clinic's mapping of converters to legacy + "format units". Updated the documentation to match. + - Issue #24001: Argument Clinic converters now use accept={type} instead of types={'type'} to specify the types the converter accepts. diff --git a/Modules/_dbmmodule.c b/Modules/_dbmmodule.c index 191cdd6..02899e4 100644 --- a/Modules/_dbmmodule.c +++ b/Modules/_dbmmodule.c @@ -274,7 +274,7 @@ static PySequenceMethods dbm_as_sequence = { /*[clinic input] _dbm.dbm.get - key: str(accept={str, robuffer}, length=True) + key: str(accept={str, robuffer}, zeroes=True) default: object(c_default="NULL") = b'' / @@ -284,7 +284,8 @@ Return the value for key if present, otherwise default. static PyObject * _dbm_dbm_get_impl(dbmobject *self, const char *key, Py_ssize_clean_t key_length, PyObject *default_value) -/*[clinic end generated code: output=b44f95eba8203d93 input=3c7c1afd9c508457]*/ +/*[clinic end generated code: output=b44f95eba8203d93 input=a3a279957f85eb6d]*/ +/*[clinic end generated code: output=4f5c0e523eaf1251 input=9402c0af8582dc69]*/ { datum dbm_key, val; @@ -301,7 +302,7 @@ _dbm_dbm_get_impl(dbmobject *self, const char *key, /*[clinic input] _dbm.dbm.setdefault - key: str(accept={str, robuffer}, length=True) + key: str(accept={str, robuffer}, zeroes=True) default: object(c_default="NULL") = b'' / @@ -314,7 +315,7 @@ static PyObject * _dbm_dbm_setdefault_impl(dbmobject *self, const char *key, Py_ssize_clean_t key_length, PyObject *default_value) -/*[clinic end generated code: output=52545886cf272161 input=a66fcb7f18ee2f50]*/ +/*[clinic end generated code: output=52545886cf272161 input=bf40c48edaca01d6]*/ { datum dbm_key, val; Py_ssize_t tmp_size; diff --git a/Modules/_gdbmmodule.c b/Modules/_gdbmmodule.c index cce5c7e..f070a14 100644 --- a/Modules/_gdbmmodule.c +++ b/Modules/_gdbmmodule.c @@ -383,7 +383,7 @@ _gdbm_gdbm_firstkey_impl(dbmobject *self) /*[clinic input] _gdbm.gdbm.nextkey - key: str(accept={str, robuffer}, length=True) + key: str(accept={str, robuffer}, zeroes=True) / Returns the key that follows key in the traversal. @@ -400,7 +400,7 @@ to create a list in memory that contains them all: static PyObject * _gdbm_gdbm_nextkey_impl(dbmobject *self, const char *key, Py_ssize_clean_t key_length) -/*[clinic end generated code: output=192ab892de6eb2f6 input=1eb2ff9b4b0e6ffd]*/ +/*[clinic end generated code: output=192ab892de6eb2f6 input=1f1606943614e36f]*/ { PyObject *v; datum dbm_key, nextkey; diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c index 45e27eb..8d0462d 100644 --- a/Modules/arraymodule.c +++ b/Modules/arraymodule.c @@ -1673,7 +1673,7 @@ array_array_tostring_impl(arrayobject *self) /*[clinic input] array.array.fromunicode - ustr: Py_UNICODE(length=True) + ustr: Py_UNICODE(zeroes=True) / Extends this array with data from the unicode string ustr. @@ -1686,7 +1686,7 @@ some other type. static PyObject * array_array_fromunicode_impl(arrayobject *self, Py_UNICODE *ustr, Py_ssize_clean_t ustr_length) -/*[clinic end generated code: output=ebb72fc16975e06d input=56bcedb5ef70139f]*/ +/*[clinic end generated code: output=ebb72fc16975e06d input=150f00566ffbca6e]*/ { char typecode; diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index b4336f2..a6e2dbc 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1215,7 +1215,7 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) unicodedata.UCD.lookup self: self - name: str(accept={str, robuffer}, length=True) + name: str(accept={str, robuffer}, zeroes=True) / Look up character by name. @@ -1227,7 +1227,7 @@ corresponding character. If not found, KeyError is raised. static PyObject * unicodedata_UCD_lookup_impl(PyObject *self, const char *name, Py_ssize_clean_t name_length) -/*[clinic end generated code: output=765cb8186788e6be input=2dfe682c2491447a]*/ +/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/ { Py_UCS4 code; unsigned int index; diff --git a/Tools/clinic/clinic.py b/Tools/clinic/clinic.py index 4f0615f..3ce3587 100755 --- a/Tools/clinic/clinic.py +++ b/Tools/clinic/clinic.py @@ -2644,64 +2644,85 @@ class buffer: pass class rwbuffer: pass class robuffer: pass -@add_legacy_c_converter('s#', accept={str, robuffer}, length=True) -@add_legacy_c_converter('y', accept={robuffer}) -@add_legacy_c_converter('y#', accept={robuffer}, length=True) -@add_legacy_c_converter('z', accept={str, NoneType}) -@add_legacy_c_converter('z#', accept={str, NoneType}, length=True) -# add_legacy_c_converter not supported for es, es#, et, et# -# because of their extra encoding argument +def str_converter_key(types, encoding, zeroes): + return (frozenset(types), bool(encoding), bool(zeroes)) + +str_converter_argument_map = {} + class str_converter(CConverter): type = 'const char *' default_type = (str, Null, NoneType) format_unit = 's' - def converter_init(self, *, encoding=None, accept={str}, length=False, zeroes=False): - - self.length = bool(length) + def converter_init(self, *, accept={str}, encoding=None, zeroes=False): - is_b_or_ba = accept == {bytes, bytearray} - is_b_or_ba_or_none = accept == {bytes, bytearray, NoneType} - is_str = accept == {str} - is_str_or_none = accept == {str, NoneType} - is_robuffer = accept == {robuffer} - is_str_or_robuffer = accept == {str, robuffer} - is_str_or_robuffer_or_none = accept == {str, robuffer, NoneType} - - format_unit = None + key = str_converter_key(accept, encoding, zeroes) + format_unit = str_converter_argument_map.get(key) + if not format_unit: + fail("str_converter: illegal combination of arguments", key) + self.format_unit = format_unit + self.length = bool(zeroes) if encoding: + if self.default not in (Null, None, unspecified): + fail("str_converter: Argument Clinic doesn't support default values for encoded strings") self.encoding = encoding + self.type = 'char *' + # sorry, clinic can't support preallocated buffers + # for es# and et# + self.c_default = "NULL" - if is_str and not length and not zeroes: - format_unit = 'es' - elif is_str_or_none and length and zeroes: - format_unit = 'es#' - elif is_b_or_ba and not length and not zeroes: - format_unit = 'et' - elif is_b_or_ba_or_none and length and zeroes: - format_unit = 'et#' - - else: - if zeroes: - fail("str_converter: illegal combination of arguments (zeroes is only legal with an encoding)") - - if is_str and not length: - format_unit = 's' - elif is_str_or_none and not length: - format_unit = 'z' - elif is_robuffer and not length: - format_unit = 'y' - elif is_robuffer and length: - format_unit = 'y#' - elif is_str_or_robuffer and length: - format_unit = 's#' - elif is_str_or_robuffer_or_none and length: - format_unit = 'z#' + def cleanup(self): + if self.encoding: + name = ensure_legal_c_identifier(self.name) + return "".join(["if (", name, ")\n PyMem_FREE(", name, ");\n"]) - if not format_unit: - fail("str_converter: illegal combination of arguments") - self.format_unit = format_unit +# +# This is the fourth or fifth rewrite of registering all the +# crazy string converter format units. Previous approaches hid +# bugs--generally mismatches between the semantics of the format +# unit and the arguments necessary to represent those semantics +# properly. Hopefully with this approach we'll get it 100% right. +# +# The r() function (short for "register") both registers the +# mapping from arguments to format unit *and* registers the +# legacy C converter for that format unit. +# +def r(format_unit, *, accept, encoding=False, zeroes=False): + if not encoding and format_unit != 's': + # add the legacy c converters here too. + # + # note: add_legacy_c_converter can't work for + # es, es#, et, or et# + # because of their extra encoding argument + # + # also don't add the converter for 's' because + # the metaclass for CConverter adds it for us. + kwargs = {} + if accept != {str}: + kwargs['accept'] = accept + if zeroes: + kwargs['zeroes'] = True + added_f = functools.partial(str_converter, **kwargs) + legacy_converters[format_unit] = added_f + + d = str_converter_argument_map + key = str_converter_key(accept, encoding, zeroes) + if key in d: + sys.exit("Duplicate keys specified for str_converter_argument_map!") + d[key] = format_unit + +r('es', encoding=True, accept={str}) +r('es#', encoding=True, zeroes=True, accept={str}) +r('et', encoding=True, accept={bytes, bytearray, str}) +r('et#', encoding=True, zeroes=True, accept={bytes, bytearray, str}) +r('s', accept={str}) +r('s#', zeroes=True, accept={robuffer, str}) +r('y', accept={robuffer}) +r('y#', zeroes=True, accept={robuffer}) +r('z', accept={str, NoneType}) +r('z#', zeroes=True, accept={robuffer, str, NoneType}) +del r class PyBytesObject_converter(CConverter): @@ -2719,17 +2740,17 @@ class unicode_converter(CConverter): default_type = (str, Null, NoneType) format_unit = 'U' -@add_legacy_c_converter('u#', length=True) +@add_legacy_c_converter('u#', zeroes=True) @add_legacy_c_converter('Z', accept={str, NoneType}) -@add_legacy_c_converter('Z#', accept={str, NoneType}, length=True) +@add_legacy_c_converter('Z#', accept={str, NoneType}, zeroes=True) class Py_UNICODE_converter(CConverter): type = 'Py_UNICODE *' default_type = (str, Null, NoneType) format_unit = 'u' - def converter_init(self, *, accept={str}, length=False): + def converter_init(self, *, accept={str}, zeroes=False): format_unit = 'Z' if accept=={str, NoneType} else 'u' - if length: + if zeroes: format_unit += '#' self.length = True self.format_unit = format_unit -- cgit v0.12