From 2566b74b26bcce24199427acea392aed644f4b17 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 15 Jul 2023 19:33:32 +0900 Subject: gh-81283: compiler: remove indent from docstring (#106411) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Doc/whatsnew/3.13.rst | 7 ++ Include/internal/pycore_compile.h | 2 + Lib/inspect.py | 45 +++++----- Lib/test/test_doctest.py | 4 +- Lib/test/test_inspect.py | 35 +++++++- .../2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst | 3 + Modules/_testinternalcapi.c | 20 ++++- Modules/clinic/_testinternalcapi.c.h | 61 ++++++++++++- Python/compile.c | 99 +++++++++++++++++++++- 9 files changed, 246 insertions(+), 30 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 06fcaf4..161d5fb 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -79,6 +79,13 @@ Other Language Changes * Allow the *count* argument of :meth:`str.replace` to be a keyword. (Contributed by Hugo van Kemenade in :gh:`106487`.) +* Compiler now strip indents from docstrings. + This will reduce the size of :term:`bytecode cache ` (e.g. ``.pyc`` file). + For example, cache file size for ``sqlalchemy.orm.session`` in SQLAlchemy 2.0 + is reduced by about 5%. + This change will affect tools using docstrings, like :mod:`doctest`. + (Contributed by Inada Naoki in :gh:`81283`.) + New Modules =========== diff --git a/Include/internal/pycore_compile.h b/Include/internal/pycore_compile.h index e204d4d..beb37cc 100644 --- a/Include/internal/pycore_compile.h +++ b/Include/internal/pycore_compile.h @@ -91,6 +91,8 @@ int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj); /* Access compiler internals for unit testing */ +PyAPI_FUNC(PyObject*) _PyCompile_CleanDoc(PyObject *doc); + PyAPI_FUNC(PyObject*) _PyCompile_CodeGen( PyObject *ast, PyObject *filename, diff --git a/Lib/inspect.py b/Lib/inspect.py index a550202..15f94a1 100644 --- a/Lib/inspect.py +++ b/Lib/inspect.py @@ -881,29 +881,28 @@ def cleandoc(doc): Any whitespace that can be uniformly removed from the second line onwards is removed.""" - try: - lines = doc.expandtabs().split('\n') - except UnicodeError: - return None - else: - # Find minimum indentation of any non-blank lines after first line. - margin = sys.maxsize - for line in lines[1:]: - content = len(line.lstrip()) - if content: - indent = len(line) - content - margin = min(margin, indent) - # Remove indentation. - if lines: - lines[0] = lines[0].lstrip() - if margin < sys.maxsize: - for i in range(1, len(lines)): lines[i] = lines[i][margin:] - # Remove any trailing or leading blank lines. - while lines and not lines[-1]: - lines.pop() - while lines and not lines[0]: - lines.pop(0) - return '\n'.join(lines) + lines = doc.expandtabs().split('\n') + + # Find minimum indentation of any non-blank lines after first line. + margin = sys.maxsize + for line in lines[1:]: + content = len(line.lstrip(' ')) + if content: + indent = len(line) - content + margin = min(margin, indent) + # Remove indentation. + if lines: + lines[0] = lines[0].lstrip(' ') + if margin < sys.maxsize: + for i in range(1, len(lines)): + lines[i] = lines[i][margin:] + # Remove any trailing or leading blank lines. + while lines and not lines[-1]: + lines.pop() + while lines and not lines[0]: + lines.pop(0) + return '\n'.join(lines) + def getfile(object): """Work out which source or compiled file an object was defined in.""" diff --git a/Lib/test/test_doctest.py b/Lib/test/test_doctest.py index 542fcdb..bea52c6 100644 --- a/Lib/test/test_doctest.py +++ b/Lib/test/test_doctest.py @@ -1287,14 +1287,14 @@ The NORMALIZE_WHITESPACE flag causes all sequences of whitespace to be treated as equal: >>> def f(x): - ... '>>> print(1, 2, 3)\n 1 2\n 3' + ... '\n>>> print(1, 2, 3)\n 1 2\n 3' >>> # Without the flag: >>> test = doctest.DocTestFinder().find(f)[0] >>> doctest.DocTestRunner(verbose=False).run(test) ... # doctest: +ELLIPSIS ********************************************************************** - File ..., line 2, in f + File ..., line 3, in f Failed example: print(1, 2, 3) Expected: diff --git a/Lib/test/test_inspect.py b/Lib/test/test_inspect.py index d89953a..64afeec 100644 --- a/Lib/test/test_inspect.py +++ b/Lib/test/test_inspect.py @@ -596,9 +596,40 @@ class TestRetrievingSourceCode(GetSourceBase): self.assertEqual(finddoc(int.from_bytes), int.from_bytes.__doc__) self.assertEqual(finddoc(int.real), int.real.__doc__) + cleandoc_testdata = [ + # first line should have different margin + (' An\n indented\n docstring.', 'An\nindented\n docstring.'), + # trailing whitespace are not removed. + (' An \n \n indented \n docstring. ', + 'An \n \nindented \n docstring. '), + # NUL is not termination. + ('doc\0string\n\n second\0line\n third\0line\0', + 'doc\0string\n\nsecond\0line\nthird\0line\0'), + # first line is lstrip()-ped. other lines are kept when no margin.[w: + (' ', ''), + # compiler.cleandoc() doesn't strip leading/trailing newlines + # to keep maximum backward compatibility. + # inspect.cleandoc() removes them. + ('\n\n\n first paragraph\n\n second paragraph\n\n', + '\n\n\nfirst paragraph\n\n second paragraph\n\n'), + (' \n \n \n ', '\n \n \n '), + ] + def test_cleandoc(self): - self.assertEqual(inspect.cleandoc('An\n indented\n docstring.'), - 'An\nindented\ndocstring.') + func = inspect.cleandoc + for i, (input, expected) in enumerate(self.cleandoc_testdata): + # only inspect.cleandoc() strip \n + expected = expected.strip('\n') + with self.subTest(i=i): + self.assertEqual(func(input), expected) + + @cpython_only + def test_c_cleandoc(self): + import _testinternalcapi + func = _testinternalcapi.compiler_cleandoc + for i, (input, expected) in enumerate(self.cleandoc_testdata): + with self.subTest(i=i): + self.assertEqual(func(input), expected) def test_getcomments(self): self.assertEqual(inspect.getcomments(mod), '# line 1\n') diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst b/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst new file mode 100644 index 0000000..f673c66 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst @@ -0,0 +1,3 @@ +Compiler now strips indents from docstrings. It reduces ``pyc`` file size 5% +when the module is heavily documented. This change affects to ``__doc__`` so +tools like doctest will be affected. diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 7745dd5..271ad6c 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -15,7 +15,7 @@ #include "pycore_atomic_funcs.h" // _Py_atomic_int_get() #include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_bytesobject.h" // _PyBytes_Find() -#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble +#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble, _PyCompile_CleanDoc #include "pycore_ceval.h" // _PyEval_AddPendingCall #include "pycore_fileutils.h" // _Py_normpath #include "pycore_frame.h" // _PyInterpreterFrame @@ -706,6 +706,23 @@ set_eval_frame_record(PyObject *self, PyObject *list) /*[clinic input] +_testinternalcapi.compiler_cleandoc -> object + + doc: unicode + +C implementation of inspect.cleandoc(). +[clinic start generated code]*/ + +static PyObject * +_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc) +/*[clinic end generated code: output=2dd203a80feff5bc input=2de03fab931d9cdc]*/ +{ + return _PyCompile_CleanDoc(doc); +} + + +/*[clinic input] + _testinternalcapi.compiler_codegen -> object ast: object @@ -1448,6 +1465,7 @@ static PyMethodDef module_functions[] = { {"DecodeLocaleEx", decode_locale_ex, METH_VARARGS}, {"set_eval_frame_default", set_eval_frame_default, METH_NOARGS, NULL}, {"set_eval_frame_record", set_eval_frame_record, METH_O, NULL}, + _TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF _TESTINTERNALCAPI_COMPILER_CODEGEN_METHODDEF _TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF _TESTINTERNALCAPI_ASSEMBLE_CODE_OBJECT_METHODDEF diff --git a/Modules/clinic/_testinternalcapi.c.h b/Modules/clinic/_testinternalcapi.c.h index f512412..9419dcd 100644 --- a/Modules/clinic/_testinternalcapi.c.h +++ b/Modules/clinic/_testinternalcapi.c.h @@ -8,6 +8,65 @@ preserve #endif +PyDoc_STRVAR(_testinternalcapi_compiler_cleandoc__doc__, +"compiler_cleandoc($module, /, doc)\n" +"--\n" +"\n" +"C implementation of inspect.cleandoc()."); + +#define _TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF \ + {"compiler_cleandoc", _PyCFunction_CAST(_testinternalcapi_compiler_cleandoc), METH_FASTCALL|METH_KEYWORDS, _testinternalcapi_compiler_cleandoc__doc__}, + +static PyObject * +_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc); + +static PyObject * +_testinternalcapi_compiler_cleandoc(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(doc), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"doc", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "compiler_cleandoc", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject *doc; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf); + if (!args) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("compiler_cleandoc", "argument 'doc'", "str", args[0]); + goto exit; + } + doc = args[0]; + return_value = _testinternalcapi_compiler_cleandoc_impl(module, doc); + +exit: + return return_value; +} + PyDoc_STRVAR(_testinternalcapi_compiler_codegen__doc__, "compiler_codegen($module, /, ast, filename, optimize, compile_mode=0)\n" "--\n" @@ -206,4 +265,4 @@ _testinternalcapi_assemble_code_object(PyObject *module, PyObject *const *args, exit: return return_value; } -/*[clinic end generated code: output=2965f1578b986218 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=811d50772c8f285a input=a9049054013a1b77]*/ diff --git a/Python/compile.c b/Python/compile.c index 9e86e06..b80f7c0 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -1704,10 +1704,16 @@ compiler_body(struct compiler *c, location loc, asdl_stmt_seq *stmts) if (c->c_optimize < 2) { docstring = _PyAST_GetDocString(stmts); if (docstring) { + PyObject *cleandoc = _PyCompile_CleanDoc(docstring); + if (cleandoc == NULL) { + return ERROR; + } i = 1; st = (stmt_ty)asdl_seq_GET(stmts, 0); assert(st->kind == Expr_kind); - VISIT(c, expr, st->v.Expr.value); + location loc = LOC(st->v.Expr.value); + ADDOP_LOAD_CONST(c, loc, cleandoc); + Py_DECREF(cleandoc); RETURN_IF_ERROR(compiler_nameop(c, NO_LOCATION, &_Py_ID(__doc__), Store)); } } @@ -2252,11 +2258,19 @@ compiler_function_body(struct compiler *c, stmt_ty s, int is_async, Py_ssize_t f /* if not -OO mode, add docstring */ if (c->c_optimize < 2) { docstring = _PyAST_GetDocString(body); + if (docstring) { + docstring = _PyCompile_CleanDoc(docstring); + if (docstring == NULL) { + compiler_exit_scope(c); + return ERROR; + } + } } if (compiler_add_const(c->c_const_cache, c->u, docstring ? docstring : Py_None) < 0) { compiler_exit_scope(c); return ERROR; } + Py_XDECREF(docstring); c->u->u_metadata.u_argcount = asdl_seq_LEN(args->args); c->u->u_metadata.u_posonlyargcount = asdl_seq_LEN(args->posonlyargs); @@ -7967,6 +7981,89 @@ error: return NULL; } +// C implementation of inspect.cleandoc() +// +// Difference from inspect.cleandoc(): +// - Do not remove leading and trailing blank lines to keep lineno. +PyObject * +_PyCompile_CleanDoc(PyObject *doc) +{ + doc = PyObject_CallMethod(doc, "expandtabs", NULL); + if (doc == NULL) { + return NULL; + } + + Py_ssize_t doc_size; + const char *doc_utf8 = PyUnicode_AsUTF8AndSize(doc, &doc_size); + if (doc_utf8 == NULL) { + Py_DECREF(doc); + return NULL; + } + const char *p = doc_utf8; + const char *pend = p + doc_size; + + // First pass: find minimum indentation of any non-blank lines + // after first line. + while (p < pend && *p++ != '\n') { + } + + Py_ssize_t margin = PY_SSIZE_T_MAX; + while (p < pend) { + const char *s = p; + while (*p == ' ') p++; + if (p < pend && *p != '\n') { + margin = Py_MIN(margin, p - s); + } + while (p < pend && *p++ != '\n') { + } + } + if (margin == PY_SSIZE_T_MAX) { + margin = 0; + } + + // Second pass: write cleandoc into buff. + + // copy first line without leading spaces. + p = doc_utf8; + while (*p == ' ') { + p++; + } + if (p == doc_utf8 && margin == 0 ) { + // doc is already clean. + return doc; + } + + char *buff = PyMem_Malloc(doc_size); + char *w = buff; + + while (p < pend) { + int ch = *w++ = *p++; + if (ch == '\n') { + break; + } + } + + // copy subsequent lines without margin. + while (p < pend) { + for (Py_ssize_t i = 0; i < margin; i++, p++) { + if (*p != ' ') { + assert(*p == '\n' || *p == '\0'); + break; + } + } + while (p < pend) { + int ch = *w++ = *p++; + if (ch == '\n') { + break; + } + } + } + + Py_DECREF(doc); + return PyUnicode_FromStringAndSize(buff, w - buff); +} + + PyObject * _PyCompile_CodeGen(PyObject *ast, PyObject *filename, PyCompilerFlags *pflags, int optimize, int compile_mode) -- cgit v0.12