summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorInada Naoki <songofacandy@gmail.com>2023-07-15 10:33:32 (GMT)
committerGitHub <noreply@github.com>2023-07-15 10:33:32 (GMT)
commit2566b74b26bcce24199427acea392aed644f4b17 (patch)
tree91d2ae486db81b01c8451f26382f2eedddaf2ba1
parentbbf62979851283b601b2dac0073ab331ebeb3be9 (diff)
downloadcpython-2566b74b26bcce24199427acea392aed644f4b17.zip
cpython-2566b74b26bcce24199427acea392aed644f4b17.tar.gz
cpython-2566b74b26bcce24199427acea392aed644f4b17.tar.bz2
gh-81283: compiler: remove indent from docstring (#106411)
Co-authored-by: Éric <merwok@netwok.org>
-rw-r--r--Doc/whatsnew/3.13.rst7
-rw-r--r--Include/internal/pycore_compile.h2
-rw-r--r--Lib/inspect.py45
-rw-r--r--Lib/test/test_doctest.py4
-rw-r--r--Lib/test/test_inspect.py35
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst3
-rw-r--r--Modules/_testinternalcapi.c20
-rw-r--r--Modules/clinic/_testinternalcapi.c.h61
-rw-r--r--Python/compile.c99
9 files changed, 246 insertions, 30 deletions
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index 06fcaf4..161d5fb 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -79,6 +79,13 @@ Other Language Changes
* Allow the *count* argument of :meth:`str.replace` to be a keyword.
(Contributed by Hugo van Kemenade in :gh:`106487`.)
+* The compiler now strips indents from docstrings.
+ This will reduce the size of the :term:`bytecode cache <bytecode>` (e.g. ``.pyc`` files).
+ For example, cache file size for ``sqlalchemy.orm.session`` in SQLAlchemy 2.0
+ is reduced by about 5%.
+ This change will affect tools using docstrings, like :mod:`doctest`.
+ (Contributed by Inada Naoki in :gh:`81283`.)
+
New Modules
===========
diff --git a/Include/internal/pycore_compile.h b/Include/internal/pycore_compile.h
index e204d4d..beb37cc 100644
--- a/Include/internal/pycore_compile.h
+++ b/Include/internal/pycore_compile.h
@@ -91,6 +91,8 @@ int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj);
/* Access compiler internals for unit testing */
+PyAPI_FUNC(PyObject*) _PyCompile_CleanDoc(PyObject *doc);
+
PyAPI_FUNC(PyObject*) _PyCompile_CodeGen(
PyObject *ast,
PyObject *filename,
diff --git a/Lib/inspect.py b/Lib/inspect.py
index a550202..15f94a1 100644
--- a/Lib/inspect.py
+++ b/Lib/inspect.py
@@ -881,29 +881,28 @@ def cleandoc(doc):
Any whitespace that can be uniformly removed from the second line
onwards is removed."""
- try:
- lines = doc.expandtabs().split('\n')
- except UnicodeError:
- return None
- else:
- # Find minimum indentation of any non-blank lines after first line.
- margin = sys.maxsize
- for line in lines[1:]:
- content = len(line.lstrip())
- if content:
- indent = len(line) - content
- margin = min(margin, indent)
- # Remove indentation.
- if lines:
- lines[0] = lines[0].lstrip()
- if margin < sys.maxsize:
- for i in range(1, len(lines)): lines[i] = lines[i][margin:]
- # Remove any trailing or leading blank lines.
- while lines and not lines[-1]:
- lines.pop()
- while lines and not lines[0]:
- lines.pop(0)
- return '\n'.join(lines)
+ lines = doc.expandtabs().split('\n')
+
+ # Find minimum indentation of any non-blank lines after first line.
+ margin = sys.maxsize
+ for line in lines[1:]:
+ content = len(line.lstrip(' '))
+ if content:
+ indent = len(line) - content
+ margin = min(margin, indent)
+ # Remove indentation.
+ if lines:
+ lines[0] = lines[0].lstrip(' ')
+ if margin < sys.maxsize:
+ for i in range(1, len(lines)):
+ lines[i] = lines[i][margin:]
+ # Remove any trailing or leading blank lines.
+ while lines and not lines[-1]:
+ lines.pop()
+ while lines and not lines[0]:
+ lines.pop(0)
+ return '\n'.join(lines)
+
def getfile(object):
"""Work out which source or compiled file an object was defined in."""
diff --git a/Lib/test/test_doctest.py b/Lib/test/test_doctest.py
index 542fcdb..bea52c6 100644
--- a/Lib/test/test_doctest.py
+++ b/Lib/test/test_doctest.py
@@ -1287,14 +1287,14 @@ The NORMALIZE_WHITESPACE flag causes all sequences of whitespace to be
treated as equal:
>>> def f(x):
- ... '>>> print(1, 2, 3)\n 1 2\n 3'
+ ... '\n>>> print(1, 2, 3)\n 1 2\n 3'
>>> # Without the flag:
>>> test = doctest.DocTestFinder().find(f)[0]
>>> doctest.DocTestRunner(verbose=False).run(test)
... # doctest: +ELLIPSIS
**********************************************************************
- File ..., line 2, in f
+ File ..., line 3, in f
Failed example:
print(1, 2, 3)
Expected:
diff --git a/Lib/test/test_inspect.py b/Lib/test/test_inspect.py
index d89953a..64afeec 100644
--- a/Lib/test/test_inspect.py
+++ b/Lib/test/test_inspect.py
@@ -596,9 +596,40 @@ class TestRetrievingSourceCode(GetSourceBase):
self.assertEqual(finddoc(int.from_bytes), int.from_bytes.__doc__)
self.assertEqual(finddoc(int.real), int.real.__doc__)
+ cleandoc_testdata = [
+ # first line should have different margin
+ (' An\n indented\n docstring.', 'An\nindented\n docstring.'),
+ # trailing whitespace is not removed.
+ (' An \n \n indented \n docstring. ',
+ 'An \n \nindented \n docstring. '),
+ # NUL is not termination.
+ ('doc\0string\n\n second\0line\n third\0line\0',
+ 'doc\0string\n\nsecond\0line\nthird\0line\0'),
+ # first line is lstrip()-ped. other lines are kept when no margin.
+ (' ', ''),
+ # compiler.cleandoc() doesn't strip leading/trailing newlines
+ # to keep maximum backward compatibility.
+ # inspect.cleandoc() removes them.
+ ('\n\n\n first paragraph\n\n second paragraph\n\n',
+ '\n\n\nfirst paragraph\n\n second paragraph\n\n'),
+ (' \n \n \n ', '\n \n \n '),
+ ]
+
def test_cleandoc(self):
- self.assertEqual(inspect.cleandoc('An\n indented\n docstring.'),
- 'An\nindented\ndocstring.')
+ func = inspect.cleandoc
+ for i, (input, expected) in enumerate(self.cleandoc_testdata):
+ # only inspect.cleandoc() strips leading/trailing \n
+ expected = expected.strip('\n')
+ with self.subTest(i=i):
+ self.assertEqual(func(input), expected)
+
+ @cpython_only
+ def test_c_cleandoc(self):
+ import _testinternalcapi
+ func = _testinternalcapi.compiler_cleandoc
+ for i, (input, expected) in enumerate(self.cleandoc_testdata):
+ with self.subTest(i=i):
+ self.assertEqual(func(input), expected)
def test_getcomments(self):
self.assertEqual(inspect.getcomments(mod), '# line 1\n')
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst b/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst
new file mode 100644
index 0000000..f673c66
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-07-04-20-42-54.gh-issue-81283.hfh_MD.rst
@@ -0,0 +1,3 @@
+The compiler now strips indents from docstrings. This reduces ``pyc`` file size
+by about 5% when the module is heavily documented. This change affects
+``__doc__``, so tools like doctest will be affected.
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 7745dd5..271ad6c 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -15,7 +15,7 @@
#include "pycore_atomic_funcs.h" // _Py_atomic_int_get()
#include "pycore_bitutils.h" // _Py_bswap32()
#include "pycore_bytesobject.h" // _PyBytes_Find()
-#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble
+#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble, _PyCompile_CleanDoc
#include "pycore_ceval.h" // _PyEval_AddPendingCall
#include "pycore_fileutils.h" // _Py_normpath
#include "pycore_frame.h" // _PyInterpreterFrame
@@ -706,6 +706,23 @@ set_eval_frame_record(PyObject *self, PyObject *list)
/*[clinic input]
+_testinternalcapi.compiler_cleandoc -> object
+
+ doc: unicode
+
+C implementation of inspect.cleandoc().
+[clinic start generated code]*/
+
+static PyObject *
+_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc)
+/*[clinic end generated code: output=2dd203a80feff5bc input=2de03fab931d9cdc]*/
+{
+ return _PyCompile_CleanDoc(doc);
+}
+
+
+/*[clinic input]
+
_testinternalcapi.compiler_codegen -> object
ast: object
@@ -1448,6 +1465,7 @@ static PyMethodDef module_functions[] = {
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
{"set_eval_frame_default", set_eval_frame_default, METH_NOARGS, NULL},
{"set_eval_frame_record", set_eval_frame_record, METH_O, NULL},
+ _TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF
_TESTINTERNALCAPI_COMPILER_CODEGEN_METHODDEF
_TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
_TESTINTERNALCAPI_ASSEMBLE_CODE_OBJECT_METHODDEF
diff --git a/Modules/clinic/_testinternalcapi.c.h b/Modules/clinic/_testinternalcapi.c.h
index f512412..9419dcd 100644
--- a/Modules/clinic/_testinternalcapi.c.h
+++ b/Modules/clinic/_testinternalcapi.c.h
@@ -8,6 +8,65 @@ preserve
#endif
+PyDoc_STRVAR(_testinternalcapi_compiler_cleandoc__doc__,
+"compiler_cleandoc($module, /, doc)\n"
+"--\n"
+"\n"
+"C implementation of inspect.cleandoc().");
+
+#define _TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF \
+ {"compiler_cleandoc", _PyCFunction_CAST(_testinternalcapi_compiler_cleandoc), METH_FASTCALL|METH_KEYWORDS, _testinternalcapi_compiler_cleandoc__doc__},
+
+static PyObject *
+_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc);
+
+static PyObject *
+_testinternalcapi_compiler_cleandoc(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+ PyObject *return_value = NULL;
+ #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+ #define NUM_KEYWORDS 1
+ static struct {
+ PyGC_Head _this_is_not_used;
+ PyObject_VAR_HEAD
+ PyObject *ob_item[NUM_KEYWORDS];
+ } _kwtuple = {
+ .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+ .ob_item = { &_Py_ID(doc), },
+ };
+ #undef NUM_KEYWORDS
+ #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+ #else // !Py_BUILD_CORE
+ # define KWTUPLE NULL
+ #endif // !Py_BUILD_CORE
+
+ static const char * const _keywords[] = {"doc", NULL};
+ static _PyArg_Parser _parser = {
+ .keywords = _keywords,
+ .fname = "compiler_cleandoc",
+ .kwtuple = KWTUPLE,
+ };
+ #undef KWTUPLE
+ PyObject *argsbuf[1];
+ PyObject *doc;
+
+ args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
+ if (!args) {
+ goto exit;
+ }
+ if (!PyUnicode_Check(args[0])) {
+ _PyArg_BadArgument("compiler_cleandoc", "argument 'doc'", "str", args[0]);
+ goto exit;
+ }
+ doc = args[0];
+ return_value = _testinternalcapi_compiler_cleandoc_impl(module, doc);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(_testinternalcapi_compiler_codegen__doc__,
"compiler_codegen($module, /, ast, filename, optimize, compile_mode=0)\n"
"--\n"
@@ -206,4 +265,4 @@ _testinternalcapi_assemble_code_object(PyObject *module, PyObject *const *args,
exit:
return return_value;
}
-/*[clinic end generated code: output=2965f1578b986218 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=811d50772c8f285a input=a9049054013a1b77]*/
diff --git a/Python/compile.c b/Python/compile.c
index 9e86e06..b80f7c0 100644
--- a/Python/compile.c
+++ b/Python/compile.c
@@ -1704,10 +1704,16 @@ compiler_body(struct compiler *c, location loc, asdl_stmt_seq *stmts)
if (c->c_optimize < 2) {
docstring = _PyAST_GetDocString(stmts);
if (docstring) {
+ PyObject *cleandoc = _PyCompile_CleanDoc(docstring);
+ if (cleandoc == NULL) {
+ return ERROR;
+ }
i = 1;
st = (stmt_ty)asdl_seq_GET(stmts, 0);
assert(st->kind == Expr_kind);
- VISIT(c, expr, st->v.Expr.value);
+ location loc = LOC(st->v.Expr.value);
+ ADDOP_LOAD_CONST(c, loc, cleandoc);
+ Py_DECREF(cleandoc);
RETURN_IF_ERROR(compiler_nameop(c, NO_LOCATION, &_Py_ID(__doc__), Store));
}
}
@@ -2252,11 +2258,19 @@ compiler_function_body(struct compiler *c, stmt_ty s, int is_async, Py_ssize_t f
/* if not -OO mode, add docstring */
if (c->c_optimize < 2) {
docstring = _PyAST_GetDocString(body);
+ if (docstring) {
+ docstring = _PyCompile_CleanDoc(docstring);
+ if (docstring == NULL) {
+ compiler_exit_scope(c);
+ return ERROR;
+ }
+ }
}
if (compiler_add_const(c->c_const_cache, c->u, docstring ? docstring : Py_None) < 0) {
compiler_exit_scope(c);
return ERROR;
}
+ Py_XDECREF(docstring);
c->u->u_metadata.u_argcount = asdl_seq_LEN(args->args);
c->u->u_metadata.u_posonlyargcount = asdl_seq_LEN(args->posonlyargs);
@@ -7967,6 +7981,89 @@ error:
return NULL;
}
+// C implementation of inspect.cleandoc()
+//
+// Difference from inspect.cleandoc():
+// - Do not remove leading and trailing blank lines to keep lineno.
+//
+// Returns a new reference to the cleaned string, or NULL with an exception
+// set on failure.
+PyObject *
+_PyCompile_CleanDoc(PyObject *doc)
+{
+    // From here on `doc` is the new reference returned by expandtabs(),
+    // which this function owns and must release on every exit path.
+    doc = PyObject_CallMethod(doc, "expandtabs", NULL);
+    if (doc == NULL) {
+        return NULL;
+    }
+
+    // The UTF-8 buffer belongs to `doc`, so `doc` must stay alive while the
+    // buffer is being read. The scans below that test `*p == ' '` without a
+    // bounds check rely on the buffer being NUL-terminated.
+    Py_ssize_t doc_size;
+    const char *doc_utf8 = PyUnicode_AsUTF8AndSize(doc, &doc_size);
+    if (doc_utf8 == NULL) {
+        Py_DECREF(doc);
+        return NULL;
+    }
+    const char *p = doc_utf8;
+    const char *pend = p + doc_size;
+
+    // First pass: find minimum indentation of any non-blank lines
+    // after first line.
+    while (p < pend && *p++ != '\n') {
+    }
+
+    Py_ssize_t margin = PY_SSIZE_T_MAX;
+    while (p < pend) {
+        const char *s = p;
+        while (*p == ' ') p++;
+        if (p < pend && *p != '\n') {
+            margin = Py_MIN(margin, p - s);
+        }
+        while (p < pend && *p++ != '\n') {
+        }
+    }
+    if (margin == PY_SSIZE_T_MAX) {
+        margin = 0;
+    }
+
+    // Second pass: write cleandoc into buff.
+
+    // copy first line without leading spaces.
+    p = doc_utf8;
+    while (*p == ' ') {
+        p++;
+    }
+    if (p == doc_utf8 && margin == 0) {
+        // doc is already clean.
+        return doc;
+    }
+
+    // The output is never longer than the input, so doc_size bytes suffice.
+    char *buff = PyMem_Malloc(doc_size);
+    if (buff == NULL) {
+        Py_DECREF(doc);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    char *w = buff;
+
+    while (p < pend) {
+        int ch = *w++ = *p++;
+        if (ch == '\n') {
+            break;
+        }
+    }
+
+    // copy subsequent lines without margin.
+    while (p < pend) {
+        for (Py_ssize_t i = 0; i < margin; i++, p++) {
+            if (*p != ' ') {
+                // Only a blank line (or the end of input) can be shorter
+                // than the margin computed in the first pass.
+                assert(*p == '\n' || *p == '\0');
+                break;
+            }
+        }
+        while (p < pend) {
+            int ch = *w++ = *p++;
+            if (ch == '\n') {
+                break;
+            }
+        }
+    }
+
+    Py_DECREF(doc);
+    // PyUnicode_FromStringAndSize() copies the bytes, so the scratch buffer
+    // must be freed here whether or not the conversion succeeded.
+    PyObject *res = PyUnicode_FromStringAndSize(buff, w - buff);
+    PyMem_Free(buff);
+    return res;
+}
+
+
PyObject *
_PyCompile_CodeGen(PyObject *ast, PyObject *filename, PyCompilerFlags *pflags,
int optimize, int compile_mode)