7 files changed, 119 insertions, 98 deletions
diff --git a/Doc/howto/clinic.rst b/Doc/howto/clinic.rst
index ca8e1cb..cb208d5 100644
--- a/Doc/howto/clinic.rst
+++ b/Doc/howto/clinic.rst
@@ -758,6 +758,14 @@ All Argument Clinic converters accept the following arguments:
 In addition, some converters accept additional arguments.  Here is a list
 of these arguments, along with their meanings:
 
+  ``accept``
+    A set of Python types (and possibly pseudo-types);
+    this restricts the allowable Python argument to values of these types.
+    (This is not a general-purpose facility; as a rule it only supports
+    specific lists of types as shown in the legacy converter table.)
+
+    To accept ``None``, add ``NoneType`` to this set.
+
   ``bitwise``
     Only supported for unsigned integers.  The native integer value of this
     Python argument will be written to the parameter without any range checking,
@@ -772,39 +780,27 @@ of these arguments, along with their meanings:
     Only supported for strings.  Specifies the encoding to use when converting
     this string from a Python str (Unicode) value into a C ``char *`` value.
 
-  ``length``
-    Only supported for strings.  If true, requests that the length of the
-    string be passed in to the impl function, just after the string parameter,
-    in a parameter named ``<parameter_name>_length``.
-
-  ``nullable``
-    Only supported for strings.  If true, this parameter may also be set to
-    ``None``, in which case the C parameter will be set to ``NULL``.
 
   ``subclass_of``
     Only supported for the ``object`` converter.  Requires that the Python
     value be a subclass of a Python type, as expressed in C.
 
-  ``types``
-    Only supported for the ``object`` (and ``self``) converter.  Specifies
+  ``type``
+    Only supported for the ``object`` and ``self`` converters.  Specifies
     the C type that will be used to declare the variable.  Default value is
     ``"PyObject *"``.
 
-  ``types``
-    A string containing a list of Python types (and possibly pseudo-types);
-    this restricts the allowable Python argument to values of these types.
-    (This is not a general-purpose facility; as a rule it only supports
-    specific lists of types as shown in the legacy converter table.)
-
   ``zeroes``
     Only supported for strings.  If true, embedded NUL bytes (``'\\0'``) are
-    permitted inside the value.
+    permitted inside the value.  The length of the string will be passed in
+    to the impl function, just after the string parameter, as a parameter named
+    ``<parameter_name>_length``.
 
 Please note, not every possible combination of arguments will work.
-Often these arguments are implemented internally by specific ``PyArg_ParseTuple``
+Usually these arguments are implemented by specific ``PyArg_ParseTuple``
 *format units*, with specific behavior.  For example, currently you cannot
-call ``str`` and pass in ``zeroes=True`` without also specifying an ``encoding``;
-although it's perfectly reasonable to think this would work, these semantics don't
+call ``unsigned_short`` without also specifying ``bitwise=True``.
+Although it's perfectly reasonable to think this would work, these semantics don't
 map to any existing format unit.  So Argument Clinic doesn't support it.  (Or, at
 least, not yet.)
 
@@ -816,13 +812,13 @@ on the right is the text you'd replace it with.
 ``'B'``     ``unsigned_char(bitwise=True)``
 ``'b'``     ``unsigned_char``
 ``'c'``     ``char``
-``'C'``     ``int(types='str')``
+``'C'``     ``int(accept={str})``
 ``'d'``     ``double``
 ``'D'``     ``Py_complex``
-``'es#'``   ``str(encoding='name_of_encoding', length=True, zeroes=True)``
 ``'es'``    ``str(encoding='name_of_encoding')``
-``'et#'``   ``str(encoding='name_of_encoding', types='bytes bytearray str', length=True)``
-``'et'``    ``str(encoding='name_of_encoding', types='bytes bytearray str')``
+``'es#'``   ``str(encoding='name_of_encoding', zeroes=True)``
+``'et'``    ``str(encoding='name_of_encoding', accept={bytes, bytearray, str})``
+``'et#'``   ``str(encoding='name_of_encoding', accept={bytes, bytearray, str}, zeroes=True)``
 ``'f'``     ``float``
 ``'h'``     ``short``
 ``'H'``     ``unsigned_short(bitwise=True)``
@@ -832,27 +828,27 @@ on the right is the text you'd replace it with.
 ``'K'``     ``unsigned_PY_LONG_LONG(bitwise=True)``
 ``'L'``     ``PY_LONG_LONG``
 ``'n'``     ``Py_ssize_t``
+``'O'``     ``object``
 ``'O!'``    ``object(subclass_of='&PySomething_Type')``
 ``'O&'``    ``object(converter='name_of_c_function')``
-``'O'``     ``object``
 ``'p'``     ``bool``
-``'s#'``    ``str(length=True)``
 ``'S'``     ``PyBytesObject``
 ``'s'``     ``str``
-``'s*'``    ``Py_buffer(types='str bytes bytearray buffer')``
-``'u#'``    ``Py_UNICODE(length=True)``
-``'u'``     ``Py_UNICODE``
+``'s#'``    ``str(zeroes=True)``
+``'s*'``    ``Py_buffer(accept={buffer, str})``
 ``'U'``     ``unicode``
-``'w*'``    ``Py_buffer(types='bytearray rwbuffer')``
-``'y#'``    ``str(types='bytes', length=True)``
+``'u'``     ``Py_UNICODE``
+``'u#'``    ``Py_UNICODE(zeroes=True)``
+``'w*'``    ``Py_buffer(accept={rwbuffer})``
 ``'Y'``     ``PyByteArrayObject``
-``'y'``     ``str(types='bytes')``
+``'y'``     ``str(accept={bytes})``
+``'y#'``    ``str(accept={robuffer}, zeroes=True)``
 ``'y*'``    ``Py_buffer``
-``'Z#'``    ``Py_UNICODE(nullable=True, length=True)``
-``'z#'``    ``str(nullable=True, length=True)``
-``'Z'``     ``Py_UNICODE(nullable=True)``
-``'z'``     ``str(nullable=True)``
-``'z*'``    ``Py_buffer(types='str bytes bytearray buffer', nullable=True)``
+``'Z'``     ``Py_UNICODE(accept={str, NoneType})``
+``'Z#'``    ``Py_UNICODE(accept={str, NoneType}, zeroes=True)``
+``'z'``     ``str(accept={str, NoneType})``
+``'z#'``    ``str(accept={str, NoneType}, zeroes=True)``
+``'z*'``    ``Py_buffer(accept={buffer, str, NoneType})``
 =========   =================================================================================
 
 As an example, here's our sample ``pickle.Pickler.dump`` using the proper
diff --git a/Misc/NEWS b/Misc/NEWS
index 336038b..f7f14b7 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -82,6 +82,9 @@ Documentation
 Tools/Demos
 -----------
 
+- Issue #24000: Improved Argument Clinic's mapping of converters to legacy
+  "format units".  Updated the documentation to match.
+
 - Issue #24001: Argument Clinic converters now use accept={type}
   instead of types={'type'} to specify the types the converter accepts.
 
diff --git a/Modules/_dbmmodule.c b/Modules/_dbmmodule.c
index 191cdd6..02899e4 100644
--- a/Modules/_dbmmodule.c
+++ b/Modules/_dbmmodule.c
@@ -274,7 +274,7 @@ static PySequenceMethods dbm_as_sequence = {
 /*[clinic input]
 _dbm.dbm.get
 
-    key: str(accept={str, robuffer}, length=True)
+    key: str(accept={str, robuffer}, zeroes=True)
     default: object(c_default="NULL") = b''
     /
 
@@ -284,7 +284,8 @@ Return the value for key if present, otherwise default.
 static PyObject *
 _dbm_dbm_get_impl(dbmobject *self, const char *key,
                   Py_ssize_clean_t key_length, PyObject *default_value)
-/*[clinic end generated code: output=b44f95eba8203d93 input=3c7c1afd9c508457]*/
+/*[clinic end generated code: output=b44f95eba8203d93 input=a3a279957f85eb6d]*/
+/*[clinic end generated code: output=4f5c0e523eaf1251 input=9402c0af8582dc69]*/
 {
     datum dbm_key, val;
 
@@ -301,7 +302,7 @@ _dbm_dbm_get_impl(dbmobject *self, const char *key,
 
 /*[clinic input]
 _dbm.dbm.setdefault
-    key: str(accept={str, robuffer}, length=True)
+    key: str(accept={str, robuffer}, zeroes=True)
     default: object(c_default="NULL") = b''
     /
 
@@ -314,7 +315,7 @@ static PyObject *
 _dbm_dbm_setdefault_impl(dbmobject *self, const char *key,
                          Py_ssize_clean_t key_length,
                          PyObject *default_value)
-/*[clinic end generated code: output=52545886cf272161 input=a66fcb7f18ee2f50]*/
+/*[clinic end generated code: output=52545886cf272161 input=bf40c48edaca01d6]*/
 {
     datum dbm_key, val;
     Py_ssize_t tmp_size;
diff --git a/Modules/_gdbmmodule.c b/Modules/_gdbmmodule.c
index cce5c7e..f070a14 100644
--- a/Modules/_gdbmmodule.c
+++ b/Modules/_gdbmmodule.c
@@ -383,7 +383,7 @@ _gdbm_gdbm_firstkey_impl(dbmobject *self)
 /*[clinic input]
 _gdbm.gdbm.nextkey
 
-    key: str(accept={str, robuffer}, length=True)
+    key: str(accept={str, robuffer}, zeroes=True)
     /
 
 Returns the key that follows key in the traversal.
@@ -400,7 +400,7 @@ to create a list in memory that contains them all:
 static PyObject *
 _gdbm_gdbm_nextkey_impl(dbmobject *self, const char *key,
                         Py_ssize_clean_t key_length)
-/*[clinic end generated code: output=192ab892de6eb2f6 input=1eb2ff9b4b0e6ffd]*/
+/*[clinic end generated code: output=192ab892de6eb2f6 input=1f1606943614e36f]*/
 {
     PyObject *v;
     datum dbm_key, nextkey;
diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c
index 45e27eb..8d0462d 100644
--- a/Modules/arraymodule.c
+++ b/Modules/arraymodule.c
@@ -1673,7 +1673,7 @@ array_array_tostring_impl(arrayobject *self)
 /*[clinic input]
 array.array.fromunicode
 
-    ustr: Py_UNICODE(length=True)
+    ustr: Py_UNICODE(zeroes=True)
     /
 
 Extends this array with data from the unicode string ustr.
@@ -1686,7 +1686,7 @@ some other type.
 static PyObject *
 array_array_fromunicode_impl(arrayobject *self, Py_UNICODE *ustr,
                              Py_ssize_clean_t ustr_length)
-/*[clinic end generated code: output=ebb72fc16975e06d input=56bcedb5ef70139f]*/
+/*[clinic end generated code: output=ebb72fc16975e06d input=150f00566ffbca6e]*/
 {
     char typecode;
 
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index b4336f2..a6e2dbc 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1215,7 +1215,7 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
 unicodedata.UCD.lookup
 
     self: self
-    name: str(accept={str, robuffer}, length=True)
+    name: str(accept={str, robuffer}, zeroes=True)
     /
 
 Look up character by name.
@@ -1227,7 +1227,7 @@ corresponding character.  If not found, KeyError is raised.
 static PyObject *
 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
                             Py_ssize_clean_t name_length)
-/*[clinic end generated code: output=765cb8186788e6be input=2dfe682c2491447a]*/
+/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
 {
     Py_UCS4 code;
     unsigned int index;
diff --git a/Tools/clinic/clinic.py b/Tools/clinic/clinic.py
index 4f0615f..3ce3587 100755
--- a/Tools/clinic/clinic.py
+++ b/Tools/clinic/clinic.py
@@ -2644,64 +2644,85 @@ class buffer: pass
 class rwbuffer: pass
 class robuffer: pass
 
-@add_legacy_c_converter('s#', accept={str, robuffer}, length=True)
-@add_legacy_c_converter('y', accept={robuffer})
-@add_legacy_c_converter('y#', accept={robuffer}, length=True)
-@add_legacy_c_converter('z', accept={str, NoneType})
-@add_legacy_c_converter('z#', accept={str, NoneType}, length=True)
-# add_legacy_c_converter not supported for es, es#, et, et#
-# because of their extra encoding argument
+def str_converter_key(types, encoding, zeroes):
+    return (frozenset(types), bool(encoding), bool(zeroes))
+
+str_converter_argument_map = {}
+
 class str_converter(CConverter):
     type = 'const char *'
     default_type = (str, Null, NoneType)
     format_unit = 's'
 
-    def converter_init(self, *, encoding=None, accept={str}, length=False, zeroes=False):
-
-        self.length = bool(length)
+    def converter_init(self, *, accept={str}, encoding=None, zeroes=False):
 
-        is_b_or_ba = accept == {bytes, bytearray}
-        is_b_or_ba_or_none = accept == {bytes, bytearray, NoneType}
-        is_str = accept == {str}
-        is_str_or_none = accept == {str, NoneType}
-        is_robuffer = accept == {robuffer}
-        is_str_or_robuffer = accept == {str, robuffer}
-        is_str_or_robuffer_or_none = accept == {str, robuffer, NoneType}
-
-        format_unit = None
+        key = str_converter_key(accept, encoding, zeroes)
+        format_unit = str_converter_argument_map.get(key)
+        if not format_unit:
+            fail("str_converter: illegal combination of arguments", key)
 
+        self.format_unit = format_unit
+        self.length = bool(zeroes)
         if encoding:
+            if self.default not in (Null, None, unspecified):
+                fail("str_converter: Argument Clinic doesn't support default values for encoded strings")
             self.encoding = encoding
+            self.type = 'char *'
+            # sorry, clinic can't support preallocated buffers
+            # for es# and et#
+            self.c_default = "NULL"
 
-            if   is_str             and not length and not zeroes:
-                format_unit = 'es'
-            elif is_str_or_none     and     length and     zeroes:
-                format_unit = 'es#'
-            elif is_b_or_ba         and not length and not zeroes:
-                format_unit = 'et'
-            elif is_b_or_ba_or_none and     length and     zeroes:
-                format_unit = 'et#'
-
-        else:
-            if zeroes:
-                fail("str_converter: illegal combination of arguments (zeroes is only legal with an encoding)")
-
-            if is_str               and not length:
-                format_unit = 's'
-            elif is_str_or_none     and not length:
-                format_unit = 'z'
-            elif is_robuffer        and not length:
-                format_unit = 'y'
-            elif is_robuffer        and     length:
-                format_unit = 'y#'
-            elif is_str_or_robuffer and     length:
-                format_unit = 's#'
-            elif is_str_or_robuffer_or_none and     length:
-                format_unit = 'z#'
+    def cleanup(self):
+        if self.encoding:
+            name = ensure_legal_c_identifier(self.name)
+            return "".join(["if (", name, ")\n   PyMem_FREE(", name, ");\n"])
 
-        if not format_unit:
-            fail("str_converter: illegal combination of arguments")
-        self.format_unit = format_unit
+#
+# This is the fourth or fifth rewrite of registering all the
+# crazy string converter format units.  Previous approaches hid
+# bugs--generally mismatches between the semantics of the format
+# unit and the arguments necessary to represent those semantics
+# properly.  Hopefully with this approach we'll get it 100% right.
+#
+# The r() function (short for "register") both registers the
+# mapping from arguments to format unit *and* registers the
+# legacy C converter for that format unit.
+#
+def r(format_unit, *, accept, encoding=False, zeroes=False):
+    if not encoding and format_unit != 's':
+        # add the legacy c converters here too.
+        #
+        # note: add_legacy_c_converter can't work for
+        #   es, es#, et, or et#
+        #   because of their extra encoding argument
+        #
+        # also don't add the converter for 's' because
+        # the metaclass for CConverter adds it for us.
+        kwargs = {}
+        if accept != {str}:
+            kwargs['accept'] = accept
+        if zeroes:
+            kwargs['zeroes'] = True
+        added_f = functools.partial(str_converter, **kwargs)
+        legacy_converters[format_unit] = added_f
+
+    d = str_converter_argument_map
+    key = str_converter_key(accept, encoding, zeroes)
+    if key in d:
+        sys.exit("Duplicate keys specified for str_converter_argument_map!")
+    d[key] = format_unit
+
+r('es',  encoding=True,              accept={str})
+r('es#', encoding=True, zeroes=True, accept={str})
+r('et',  encoding=True,              accept={bytes, bytearray, str})
+r('et#', encoding=True, zeroes=True, accept={bytes, bytearray, str})
+r('s',                               accept={str})
+r('s#',                 zeroes=True, accept={robuffer, str})
+r('y',                               accept={robuffer})
+r('y#',                 zeroes=True, accept={robuffer})
+r('z',                               accept={str, NoneType})
+r('z#',                 zeroes=True, accept={robuffer, str, NoneType})
+del r
 
 
 class PyBytesObject_converter(CConverter):
@@ -2719,17 +2740,17 @@ class unicode_converter(CConverter):
     default_type = (str, Null, NoneType)
     format_unit = 'U'
 
-@add_legacy_c_converter('u#', length=True)
+@add_legacy_c_converter('u#', zeroes=True)
 @add_legacy_c_converter('Z', accept={str, NoneType})
-@add_legacy_c_converter('Z#', accept={str, NoneType}, length=True)
+@add_legacy_c_converter('Z#', accept={str, NoneType}, zeroes=True)
 class Py_UNICODE_converter(CConverter):
     type = 'Py_UNICODE *'
     default_type = (str, Null, NoneType)
     format_unit = 'u'
 
-    def converter_init(self, *, accept={str}, length=False):
+    def converter_init(self, *, accept={str}, zeroes=False):
         format_unit = 'Z' if accept=={str, NoneType} else 'u'
-        if length:
+        if zeroes:
             format_unit += '#'
             self.length = True
         self.format_unit = format_unit