summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPablo Galindo Salgado <Pablogsal@gmail.com>2022-08-30 17:11:18 (GMT)
committerGitHub <noreply@github.com>2022-08-30 17:11:18 (GMT)
commit6d791a97364b68d5f9c3514a0470aac487fc538d (patch)
tree745205d7e8698ea7398eb353311f55dc973507bf
parent0f733fffe8f4caaac3ce1b5306af86b42fb0c7fa (diff)
downloadcpython-6d791a97364b68d5f9c3514a0470aac487fc538d.zip
cpython-6d791a97364b68d5f9c3514a0470aac487fc538d.tar.gz
cpython-6d791a97364b68d5f9c3514a0470aac487fc538d.tar.bz2
gh-96143: Allow Linux perf profiler to see Python calls (GH-96123)
:warning: :warning: Note for reviewers, hackers and fellow systems/low-level/compiler engineers :warning: :warning: If you have a lot of experience with this kind of shenanigans and want to improve the **first** version, **please make a PR against my branch** or **reach out by email** or **suggest code changes directly on GitHub**. If you have any **refinements or optimizations** please, wait until the first version is merged before starting hacking or proposing those so we can keep this PR productive.
-rw-r--r--Doc/c-api/init_config.rst14
-rw-r--r--Doc/howto/index.rst1
-rw-r--r--Doc/howto/perf_profiling.rst200
-rw-r--r--Doc/using/cmdline.rst13
-rw-r--r--Include/cpython/initconfig.h1
-rw-r--r--Include/internal/pycore_ceval.h21
-rw-r--r--Lib/test/test_embed.py5
-rw-r--r--Lib/test/test_perf_profiler.py348
-rw-r--r--Makefile.pre.in7
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst7
-rw-r--r--Modules/posixmodule.c5
-rw-r--r--Objects/asm_trampoline.S28
-rw-r--r--Objects/perf_trampoline.c501
-rw-r--r--PCbuild/_freeze_module.vcxproj1
-rw-r--r--PCbuild/_freeze_module.vcxproj.filters3
-rw-r--r--PCbuild/pythoncore.vcxproj1
-rw-r--r--PCbuild/pythoncore.vcxproj.filters3
-rw-r--r--Python/clinic/sysmodule.c.h75
-rw-r--r--Python/initconfig.c39
-rw-r--r--Python/pylifecycle.c11
-rw-r--r--Python/sysmodule.c77
-rwxr-xr-xconfigure30
-rw-r--r--configure.ac20
-rw-r--r--pyconfig.h.in3
24 files changed, 1412 insertions, 2 deletions
diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst
index 2074ec4..c4a342e 100644
--- a/Doc/c-api/init_config.rst
+++ b/Doc/c-api/init_config.rst
@@ -1155,6 +1155,20 @@ PyConfig
Default: ``-1`` in Python mode, ``0`` in isolated mode.
+ .. c:member:: int perf_profiling
+
+ Enable compatibility mode with the perf profiler?
+
+ If non-zero, initialize the perf trampoline. See :ref:`perf_profiling`
+ for more information.
+
+ Set by :option:`-X perf <-X>` command line option and by the
+ :envvar:`PYTHONPERFSUPPORT` environment variable.
+
+ Default: ``-1``.
+
+ .. versionadded:: 3.12
+
.. c:member:: int use_environment
Use :ref:`environment variables <using-on-envvars>`?
diff --git a/Doc/howto/index.rst b/Doc/howto/index.rst
index 8a378e6..f521276 100644
--- a/Doc/howto/index.rst
+++ b/Doc/howto/index.rst
@@ -30,6 +30,7 @@ Currently, the HOWTOs are:
ipaddress.rst
clinic.rst
instrumentation.rst
+ perf_profiling.rst
annotations.rst
isolating-extensions.rst
diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst
new file mode 100644
index 0000000..2e1bb48
--- /dev/null
+++ b/Doc/howto/perf_profiling.rst
@@ -0,0 +1,200 @@
+.. highlight:: shell-session
+
+.. _perf_profiling:
+
+==============================================
+Python support for the Linux ``perf`` profiler
+==============================================
+
+:author: Pablo Galindo
+
+The Linux ``perf`` profiler is a very powerful tool that allows you to profile and
+obtain information about the performance of your application. ``perf`` also has
+a very vibrant ecosystem of tools that aid with the analysis of the data that it
+produces.
+
+The main problem with using the ``perf`` profiler with Python applications is that
+``perf`` only gets information about native symbols, that is, the names of
+functions and procedures written in C. This means that the names and file names
+of the Python functions in your code will not appear in the output of ``perf``.
+
+Since Python 3.12, the interpreter can run in a special mode that allows Python
+functions to appear in the output of the ``perf`` profiler. When this mode is
+enabled, the interpreter will interpose a small piece of code compiled on the
+fly before the execution of every Python function and it will teach ``perf`` the
+relationship between this piece of code and the associated Python function using
+`perf map files`_.
+
+.. warning::
+
+ Support for the ``perf`` profiler is currently only available for Linux on
+ selected architectures. Check the output of the configure build step or
+ check the output of ``python -m sysconfig | grep HAVE_PERF_TRAMPOLINE``
+ to see if your system is supported.
+
+For example, consider the following script:
+
+.. code-block:: python
+
+ def foo(n):
+ result = 0
+ for _ in range(n):
+ result += 1
+ return result
+
+ def bar(n):
+ foo(n)
+
+ def baz(n):
+ bar(n)
+
+ if __name__ == "__main__":
+ baz(1000000)
+
+We can run ``perf`` to sample CPU stack traces at 9999 Hertz::
+
+ $ perf record -F 9999 -g -o perf.data python my_script.py
+
+Then we can use perf report to analyze the data:
+
+.. code-block:: shell-session
+
+ $ perf report --stdio -n -g
+
+ # Children Self Samples Command Shared Object Symbol
+ # ........ ........ ............ .......... .................. ..........................................
+ #
+ 91.08% 0.00% 0 python.exe python.exe [.] _start
+ |
+ ---_start
+ |
+ --90.71%--__libc_start_main
+ Py_BytesMain
+ |
+ |--56.88%--pymain_run_python.constprop.0
+ | |
+ | |--56.13%--_PyRun_AnyFileObject
+ | | _PyRun_SimpleFileObject
+ | | |
+ | | |--55.02%--run_mod
+ | | | |
+ | | | --54.65%--PyEval_EvalCode
+ | | | _PyEval_EvalFrameDefault
+ | | | PyObject_Vectorcall
+ | | | _PyEval_Vector
+ | | | _PyEval_EvalFrameDefault
+ | | | PyObject_Vectorcall
+ | | | _PyEval_Vector
+ | | | _PyEval_EvalFrameDefault
+ | | | PyObject_Vectorcall
+ | | | _PyEval_Vector
+ | | | |
+ | | | |--51.67%--_PyEval_EvalFrameDefault
+ | | | | |
+ | | | | |--11.52%--_PyLong_Add
+ | | | | | |
+ | | | | | |--2.97%--_PyObject_Malloc
+ ...
+
+As you can see, the Python functions are not shown in the output: only ``_PyEval_EvalFrameDefault``
+(the function that evaluates the Python bytecode) shows up. Unfortunately that's not very useful because all Python
+functions use the same C function to evaluate bytecode, so we cannot know which Python function corresponds to which
+bytecode-evaluating function.
+
+Instead, if we run the same experiment with perf support activated we get:
+
+.. code-block:: shell-session
+
+ $ perf report --stdio -n -g
+
+ # Children Self Samples Command Shared Object Symbol
+ # ........ ........ ............ .......... .................. .....................................................................
+ #
+ 90.58% 0.36% 1 python.exe python.exe [.] _start
+ |
+ ---_start
+ |
+ --89.86%--__libc_start_main
+ Py_BytesMain
+ |
+ |--55.43%--pymain_run_python.constprop.0
+ | |
+ | |--54.71%--_PyRun_AnyFileObject
+ | | _PyRun_SimpleFileObject
+ | | |
+ | | |--53.62%--run_mod
+ | | | |
+ | | | --53.26%--PyEval_EvalCode
+ | | | py::<module>:/src/script.py
+ | | | _PyEval_EvalFrameDefault
+ | | | PyObject_Vectorcall
+ | | | _PyEval_Vector
+ | | | py::baz:/src/script.py
+ | | | _PyEval_EvalFrameDefault
+ | | | PyObject_Vectorcall
+ | | | _PyEval_Vector
+ | | | py::bar:/src/script.py
+ | | | _PyEval_EvalFrameDefault
+ | | | PyObject_Vectorcall
+ | | | _PyEval_Vector
+ | | | py::foo:/src/script.py
+ | | | |
+ | | | |--51.81%--_PyEval_EvalFrameDefault
+ | | | | |
+ | | | | |--13.77%--_PyLong_Add
+ | | | | | |
+ | | | | | |--3.26%--_PyObject_Malloc
+
+
+
+Enabling perf profiling mode
+----------------------------
+
+There are two main ways to activate the perf profiling mode. If you want it to be
+active from the start of the Python interpreter, you can use the ``-Xperf`` option::
+
+ $ python -Xperf my_script.py
+
+There is also support for dynamically activating and deactivating the perf
+profiling mode by using the APIs in the :mod:`sys` module:
+
+.. code-block:: python
+
+ import sys
+ sys.activate_stack_trampoline("perf")
+
+ # Run some code with Perf profiling active
+
+ sys.deactivate_stack_trampoline()
+
+ # Perf profiling is not active anymore
+
+These APIs can be handy if you want to activate/deactivate profiling mode in
+response to a signal or other communication mechanism with your process.
+
+
+
+Now we can analyze the data with ``perf report``::
+
+ $ perf report -g -i perf.data
+
+
+How to obtain the best results
+-------------------------------
+
+For the best results, Python should be compiled with
+``CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"`` as this allows
+profilers to unwind using only the frame pointer and not rely on DWARF debug
+information. This is because the code that is interposed to allow ``perf``
+support is dynamically generated, so it doesn't have any DWARF debugging
+information available.
+
+You can check if your system has been compiled with this flag by running::
+
+ $ python -m sysconfig | grep 'no-omit-frame-pointer'
+
+If you don't see any output it means that your interpreter has not been compiled with
+frame pointers and therefore it may not be able to show Python functions in the output
+of ``perf``.
+
+.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt
diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index 6678d47..5ecc882 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -535,6 +535,12 @@ Miscellaneous options
development (running from the source tree) then the default is "off".
Note that the "importlib_bootstrap" and "importlib_bootstrap_external"
frozen modules are always used, even if this flag is set to "off".
+ * ``-X perf`` to activate compatibility mode with the ``perf`` profiler.
+ When this option is activated, the Linux ``perf`` profiler will be able to
+ report Python calls. This option is only available on some platforms and
+ will do nothing if it is not supported on the current system. The default value
+ is "off". See also :envvar:`PYTHONPERFSUPPORT` and :ref:`perf_profiling`
+ for more information.
It also allows passing arbitrary values and retrieving them through the
:data:`sys._xoptions` dictionary.
@@ -1025,6 +1031,13 @@ conflict.
.. versionadded:: 3.11
+.. envvar:: PYTHONPERFSUPPORT
+
+ If this variable is set to a nonzero value, it activates compatibility mode
+ with the ``perf`` profiler so Python calls can be detected by it. See the
+ :ref:`perf_profiling` section for more information.
+
+ .. versionadded:: 3.12
Debug-mode variables
diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h
index 3b6d593..c6057a4 100644
--- a/Include/cpython/initconfig.h
+++ b/Include/cpython/initconfig.h
@@ -142,6 +142,7 @@ typedef struct PyConfig {
unsigned long hash_seed;
int faulthandler;
int tracemalloc;
+ int perf_profiling;
int import_time;
int code_debug_ranges;
int show_ref_count;
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 2fcdaad..4914948c 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -65,6 +65,27 @@ extern PyObject* _PyEval_BuiltinsFromGlobals(
PyThreadState *tstate,
PyObject *globals);
+// Trampoline API
+
+typedef struct {
+ // Callback to initialize the trampoline state
+ void* (*init_state)(void);
+ // Callback to register every trampoline being created
+ void (*write_state)(void* state, const void *code_addr,
+ unsigned int code_size, PyCodeObject* code);
+ // Callback to free the trampoline state
+ int (*free_state)(void* state);
+} _PyPerf_Callbacks;
+
+extern int _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *);
+extern void _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *);
+extern int _PyPerfTrampoline_Init(int activate);
+extern int _PyPerfTrampoline_Fini(void);
+extern int _PyIsPerfTrampolineActive(void);
+extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+extern _PyPerf_Callbacks _Py_perfmap_callbacks;
+#endif
static inline PyObject*
_PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag)
diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index c546bb0..70d7367 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -436,6 +436,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
'hash_seed': 0,
'faulthandler': 0,
'tracemalloc': 0,
+ 'perf_profiling': 0,
'import_time': 0,
'code_debug_ranges': 1,
'show_ref_count': 0,
@@ -520,6 +521,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
use_hash_seed=0,
faulthandler=0,
tracemalloc=0,
+ perf_profiling=0,
pathconfig_warnings=0,
)
if MS_WINDOWS:
@@ -828,6 +830,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
'use_hash_seed': 1,
'hash_seed': 123,
'tracemalloc': 2,
+ 'perf_profiling': 0,
'import_time': 1,
'code_debug_ranges': 0,
'show_ref_count': 1,
@@ -890,6 +893,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
'use_hash_seed': 1,
'hash_seed': 42,
'tracemalloc': 2,
+ 'perf_profiling': 0,
'import_time': 1,
'code_debug_ranges': 0,
'malloc_stats': 1,
@@ -921,6 +925,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
'use_hash_seed': 1,
'hash_seed': 42,
'tracemalloc': 2,
+ 'perf_profiling': 0,
'import_time': 1,
'code_debug_ranges': 0,
'malloc_stats': 1,
diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py
new file mode 100644
index 0000000..c2aad85
--- /dev/null
+++ b/Lib/test/test_perf_profiler.py
@@ -0,0 +1,348 @@
+import unittest
+import subprocess
+import sys
+import sysconfig
+import os
+import pathlib
+from test import support
+from test.support.script_helper import (
+ make_script,
+ assert_python_failure,
+ assert_python_ok,
+)
+from test.support.os_helper import temp_dir
+
+
+if not support.has_subprocess_support:
+ raise unittest.SkipTest("test module requires subprocess")
+
+
+def supports_trampoline_profiling():
+ perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE")
+ if not perf_trampoline:
+ return False
+ return int(perf_trampoline) == 1
+
+
+if not supports_trampoline_profiling():
+ raise unittest.SkipTest("perf trampoline profiling not supported")
+
+
+class TestPerfTrampoline(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
+
+ def tearDown(self) -> None:
+ super().tearDown()
+ files_to_delete = (
+ set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
+ )
+ for file in files_to_delete:
+ file.unlink()
+
+ def test_trampoline_works(self):
+ code = """if 1:
+ def foo():
+ pass
+
+ def bar():
+ foo()
+
+ def baz():
+ bar()
+
+ baz()
+ """
+ with temp_dir() as script_dir:
+ script = make_script(script_dir, "perftest", code)
+ with subprocess.Popen(
+ [sys.executable, "-Xperf", script],
+ universal_newlines=True,
+ stderr=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ ) as process:
+ stdout, stderr = process.communicate()
+
+ self.assertEqual(stderr, "")
+ self.assertEqual(stdout, "")
+
+ perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+ self.assertTrue(perf_file.exists())
+ perf_file_contents = perf_file.read_text()
+ self.assertIn(f"py::foo:{script}", perf_file_contents)
+ self.assertIn(f"py::bar:{script}", perf_file_contents)
+ self.assertIn(f"py::baz:{script}", perf_file_contents)
+
+ def test_trampoline_works_with_forks(self):
+ code = """if 1:
+ import os, sys
+
+ def foo_fork():
+ pass
+
+ def bar_fork():
+ foo_fork()
+
+ def baz_fork():
+ bar_fork()
+
+ def foo():
+ pid = os.fork()
+ if pid == 0:
+ print(os.getpid())
+ baz_fork()
+ else:
+ _, status = os.waitpid(-1, 0)
+ sys.exit(status)
+
+ def bar():
+ foo()
+
+ def baz():
+ bar()
+
+ baz()
+ """
+ with temp_dir() as script_dir:
+ script = make_script(script_dir, "perftest", code)
+ with subprocess.Popen(
+ [sys.executable, "-Xperf", script],
+ universal_newlines=True,
+ stderr=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ ) as process:
+ stdout, stderr = process.communicate()
+
+ self.assertEqual(process.returncode, 0)
+ self.assertEqual(stderr, "")
+ child_pid = int(stdout.strip())
+ perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+ perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map")
+ self.assertTrue(perf_file.exists())
+ self.assertTrue(perf_child_file.exists())
+
+ perf_file_contents = perf_file.read_text()
+ self.assertIn(f"py::foo:{script}", perf_file_contents)
+ self.assertIn(f"py::bar:{script}", perf_file_contents)
+ self.assertIn(f"py::baz:{script}", perf_file_contents)
+
+ child_perf_file_contents = perf_child_file.read_text()
+ self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents)
+ self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents)
+ self.assertIn(f"py::baz_fork:{script}", child_perf_file_contents)
+
+ def test_sys_api(self):
+ code = """if 1:
+ import sys
+ def foo():
+ pass
+
+ def spam():
+ pass
+
+ def bar():
+ sys.deactivate_stack_trampoline()
+ foo()
+ sys.activate_stack_trampoline("perf")
+ spam()
+
+ def baz():
+ bar()
+
+ sys.activate_stack_trampoline("perf")
+ baz()
+ """
+ with temp_dir() as script_dir:
+ script = make_script(script_dir, "perftest", code)
+ with subprocess.Popen(
+ [sys.executable, script],
+ universal_newlines=True,
+ stderr=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ ) as process:
+ stdout, stderr = process.communicate()
+
+ self.assertEqual(stderr, "")
+ self.assertEqual(stdout, "")
+
+ perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+ self.assertTrue(perf_file.exists())
+ perf_file_contents = perf_file.read_text()
+ self.assertNotIn(f"py::foo:{script}", perf_file_contents)
+ self.assertIn(f"py::spam:{script}", perf_file_contents)
+ self.assertIn(f"py::bar:{script}", perf_file_contents)
+ self.assertIn(f"py::baz:{script}", perf_file_contents)
+
+ def test_sys_api_with_existing_trampoline(self):
+ code = """if 1:
+ import sys
+ sys.activate_stack_trampoline("perf")
+ sys.activate_stack_trampoline("perf")
+ """
+ assert_python_ok("-c", code)
+
+ def test_sys_api_with_invalid_trampoline(self):
+ code = """if 1:
+ import sys
+ sys.activate_stack_trampoline("invalid")
+ """
+ rc, out, err = assert_python_failure("-c", code)
+ self.assertIn("invalid backend: invalid", err.decode())
+
+ def test_sys_api_get_status(self):
+ code = """if 1:
+ import sys
+ sys.activate_stack_trampoline("perf")
+ assert sys.is_stack_trampoline_active() is True
+ sys.deactivate_stack_trampoline()
+ assert sys.is_stack_trampoline_active() is False
+ """
+ assert_python_ok("-c", code)
+
+
+def is_unwinding_reliable():
+ cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
+ if not cflags:
+ return False
+ return "no-omit-frame-pointer" in cflags
+
+
+def perf_command_works():
+ try:
+ cmd = ["perf", "--help"]
+ stdout = subprocess.check_output(cmd, universal_newlines=True)
+ except (subprocess.SubprocessError, OSError):
+ return False
+
+ # perf version does not return a version number on Fedora. Use presence
+ # of "perf.data" in help as indicator that it's perf from Linux tools.
+ if "perf.data" not in stdout:
+ return False
+
+ # Check that we can run a simple perf run
+ with temp_dir() as script_dir:
+ try:
+ output_file = script_dir + "/perf_output.perf"
+ cmd = (
+ "perf",
+ "record",
+ "-g",
+ "--call-graph=fp",
+ "-o",
+ output_file,
+ "--",
+ sys.executable,
+ "-c",
+ 'print("hello")',
+ )
+ stdout = subprocess.check_output(
+ cmd, cwd=script_dir, universal_newlines=True, stderr=subprocess.STDOUT
+ )
+ except (subprocess.SubprocessError, OSError):
+ return False
+
+ if "hello" not in stdout:
+ return False
+
+ return True
+
+
+def run_perf(cwd, *args, **env_vars):
+ if env_vars:
+ env = os.environ.copy()
+ env.update(env_vars)
+ else:
+ env = None
+ output_file = cwd + "/perf_output.perf"
+ base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
+ proc = subprocess.run(
+ base_cmd + args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ env=env,
+ )
+ if proc.returncode:
+ print(proc.stderr)
+ raise ValueError(f"Perf failed with return code {proc.returncode}")
+
+ base_cmd = ("perf", "script")
+ proc = subprocess.run(
+ ("perf", "script", "-i", output_file),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ env=env,
+ check=True,
+ )
+ return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode(
+ "utf-8", "replace"
+ )
+
+
+@unittest.skipUnless(perf_command_works(), "perf command doesn't work")
+@unittest.skipUnless(is_unwinding_reliable(), "Unwinding is unreliable")
+@support.skip_if_sanitizer(address=True, memory=True, ub=True)
+class TestPerfProfiler(unittest.TestCase):
+ def setUp(self):
+ super().setUp()
+ self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
+
+ def tearDown(self) -> None:
+ super().tearDown()
+ files_to_delete = (
+ set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
+ )
+ for file in files_to_delete:
+ file.unlink()
+
+ def test_python_calls_appear_in_the_stack_if_perf_activated(self):
+ with temp_dir() as script_dir:
+ code = """if 1:
+ def foo(n):
+ x = 0
+ for i in range(n):
+ x += i
+
+ def bar(n):
+ foo(n)
+
+ def baz(n):
+ bar(n)
+
+ baz(10000000)
+ """
+ script = make_script(script_dir, "perftest", code)
+ stdout, stderr = run_perf(script_dir, sys.executable, "-Xperf", script)
+ self.assertEqual(stderr, "")
+
+ self.assertIn(f"py::foo:{script}", stdout)
+ self.assertIn(f"py::bar:{script}", stdout)
+ self.assertIn(f"py::baz:{script}", stdout)
+
+ def test_python_calls_do_not_appear_in_the_stack_if_perf_activated(self):
+ with temp_dir() as script_dir:
+ code = """if 1:
+ def foo(n):
+ x = 0
+ for i in range(n):
+ x += i
+
+ def bar(n):
+ foo(n)
+
+ def baz(n):
+ bar(n)
+
+ baz(10000000)
+ """
+ script = make_script(script_dir, "perftest", code)
+ stdout, stderr = run_perf(script_dir, sys.executable, script)
+ self.assertEqual(stderr, "")
+
+ self.assertNotIn(f"py::foo:{script}", stdout)
+ self.assertNotIn(f"py::bar:{script}", stdout)
+ self.assertNotIn(f"py::baz:{script}", stdout)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 94ddfa4..107a707 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -478,7 +478,9 @@ OBJECT_OBJS= \
Objects/unicodeobject.o \
Objects/unicodectype.o \
Objects/unionobject.o \
- Objects/weakrefobject.o
+ Objects/weakrefobject.o \
+ Objects/perf_trampoline.o \
+ @PERF_TRAMPOLINE_OBJ@
DEEPFREEZE_OBJS = Python/deepfreeze/deepfreeze.o
@@ -2358,6 +2360,9 @@ config.status: $(srcdir)/configure
.PRECIOUS: config.status $(BUILDPYTHON) Makefile Makefile.pre
+Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.S
+ $(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
+
# Some make's put the object file in the current directory
.c.o:
$(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
new file mode 100644
index 0000000..30f44fd
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst
@@ -0,0 +1,7 @@
+Add a new ``-X perf`` Python command line option as well as the
+:func:`sys.activate_stack_trampoline` and :func:`sys.deactivate_stack_trampoline`
+functions in the :mod:`sys` module, which switch the interpreter into and out of a
+mode in which the Linux ``perf`` profiler can detect Python calls. The new
+:func:`sys.is_stack_trampoline_active` function allows querying the state of the
+perf trampoline. Design by Pablo Galindo. Patch by Pablo Galindo and Christian Heimes
+with contributions from Gregory P. Smith [Google] and Mark Shannon.
diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c
index d45fa23..3810bc8 100644
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
@@ -606,6 +606,11 @@ PyOS_AfterFork_Child(void)
}
assert(_PyThreadState_GET() == tstate);
+ status = _PyPerfTrampoline_AfterFork_Child();
+ if (_PyStatus_EXCEPTION(status)) {
+ goto fatal_error;
+ }
+
run_at_forkers(tstate->interp->after_forkers_child, 0);
return;
diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S
new file mode 100644
index 0000000..4607077
--- /dev/null
+++ b/Objects/asm_trampoline.S
@@ -0,0 +1,28 @@
+ .text
+ .globl _Py_trampoline_func_start
+# The following assembly is equivalent to:
+# PyObject *
+# trampoline(PyThreadState *ts, _PyInterpreterFrame *f,
+# int throwflag, py_evaluator evaluator)
+# {
+# return evaluator(ts, f, throwflag);
+# }
+_Py_trampoline_func_start:
+#ifdef __x86_64__
+ sub $8, %rsp
+ call *%rcx
+ add $8, %rsp
+ ret
+#endif // __x86_64__
+#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
+ // ARM64 little endian, 64bit ABI
+ // generate with aarch64-linux-gnu-gcc 12.1
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ blr x3
+ ldp x29, x30, [sp], 16
+ ret
+#endif
+ .globl _Py_trampoline_func_end
+_Py_trampoline_func_end:
+ .section .note.GNU-stack,"",@progbits
diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c
new file mode 100644
index 0000000..02206b2
--- /dev/null
+++ b/Objects/perf_trampoline.c
@@ -0,0 +1,501 @@
+/*
+
+Perf trampoline instrumentation
+===============================
+
+This file contains instrumentation that allows associating
+calls to the CPython eval loop back to the names of the Python
+functions and filenames being executed.
+
+Many native performance profilers like the Linux perf tools are
+only able to 'see' the C stack when sampling from the profiled
+process. This means that if we have the following Python code:
+
+ import time
+ def foo(n):
+ # Some CPU intensive code
+
+ def bar(n):
+ foo(n)
+
+ def baz(n):
+ bar(n)
+
+ baz(10000000)
+
+A performance profiler that is only able to see native frames will
+produce the following backtrace when sampling from foo():
+
+ _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ ...
+
+ Py_RunMain
+
+Because the profiler is only able to see the native frames and the native
+function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
+then the profiler and any reporter generated by it will not be able to
+associate the names of the Python functions and the filenames associated with
+those calls, rendering the results useless in the Python world.
+
+To fix this problem, we introduce the concept of a trampoline frame. A
+trampoline frame is a piece of code that is unique per Python code object that
+is executed before entering the CPython eval loop. This piece of code just
+calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
+forwards all the arguments received. In this way, when a profiler samples
+frames from the previous example it will see:
+
+ _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
+ [Jit compiled code 3]
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
+ [Jit compiled code 2]
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
+ [Jit compiled code 1]
+ _PyEval_EvalFrame
+ _PyEval_Vector
+ _PyFunction_Vectorcall
+ PyObject_Vectorcall
+ call_function
+
+ ...
+
+ Py_RunMain
+
+When we generate every unique copy of the trampoline (what here we called "[Jit
+compiled code N]") we write the relationship between the compiled code and the
+Python function that is associated with it. Every profiler requires this
+information in a different format. For example, the Linux "perf" profiler
+requires a file in "/tmp/perf-PID.map" (name and location not configurable)
+with the following format:
+
+ <compiled code address> <compiled code size> <name of the compiled code>
+
+If this file is available when "perf" generates reports, it will automatically
+associate every trampoline with the Python function that it is associated with
+allowing it to generate reports that include Python information. These reports
+then can also be filtered in a way that *only* Python information appears.
+
+Notice that for this to work, there must be a unique copy of the trampoline
+per Python code object even if the code in the trampoline is the same. To
+achieve this we have an assembly template in Objects/asm_trampoline.S that is
+compiled into the Python executable/shared library. This template generates a
+symbol that marks the start of the assembly code and another that marks the end
+of the assembly code for the trampoline. Then, every time we need a unique
+trampoline for a Python code object, we copy the assembly code into an mmap-ed
+area that has executable permissions and we return the start of that area as
+our trampoline function.
+
+Asking for an mmap-ed memory area for every trampoline is very wasteful, so we
+allocate big arenas of memory in a single mmap call, we populate the entire
+arena with copies of the trampoline (this allows us to not have to invalidate
+the icache for the instructions in the page) and then we return the next
+available chunk every time someone asks for a new trampoline. We keep a linked
+list of arenas in case the current memory arena is exhausted and another one is
+needed.
+
+For the best results, Python should be compiled with
+CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
+profilers to unwind using only the frame pointer, without relying on DWARF
+debug information (note that as trampolines are dynamically generated there
+won't be any DWARF information available for them).
+*/
+
+#include "Python.h"
+#include "pycore_ceval.h"
+#include "pycore_frame.h"
+#include "pycore_interp.h"
+
+typedef enum {
+ PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state
+ PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized
+ PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed
+} perf_status_t;
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* The function pointer is passed as last argument. The other three arguments
+ * are passed in the same order as the function requires. This results in
+ * shorter, more efficient ASM code for trampoline.
+ */
+typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
+ int throwflag);
+typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
+ py_evaluator);
+
+extern void *_Py_trampoline_func_start; // Start of the template of the
+ // assembly trampoline
+extern void *
+ _Py_trampoline_func_end; // End of the template of the assembly trampoline
+
+struct code_arena_st {
+ char *start_addr; // Start of the memory arena
+ char *current_addr; // Address of the current trampoline within the arena
+ size_t size; // Size of the memory arena
+ size_t size_left; // Remaining size of the memory arena
+ size_t code_size; // Size of the code of every trampoline in the arena
+ struct code_arena_st
+ *prev; // Pointer to the arena or NULL if this is the first arena.
+};
+
+typedef struct code_arena_st code_arena_t;
+
+struct trampoline_api_st {
+ void* (*init_state)(void);
+ void (*write_state)(void* state, const void *code_addr,
+ unsigned int code_size, PyCodeObject* code);
+ int (*free_state)(void* state);
+ void *state;
+};
+
+typedef struct trampoline_api_st trampoline_api_t;
+
+static perf_status_t perf_status = PERF_STATUS_NO_INIT;
+static Py_ssize_t extra_code_index = -1;
+static code_arena_t *code_arena;
+static trampoline_api_t trampoline_api;
+
+static FILE *perf_map_file;
+
+// Return the stdio stream for this process's perf map file
+// ("/tmp/perf-<pid>.map"), creating the file on first use and caching the
+// stream in `perf_map_file`.
+//
+// On failure to create the file or to wrap the descriptor in a FILE*, sets
+// `perf_status` to PERF_STATUS_FAILED, raises OSError and returns NULL.
+// The return type is `void *` because this function is registered as the
+// first entry of `_Py_perfmap_callbacks`.
+static void *
+perf_map_get_file(void)
+{
+ // Reuse the already-open stream if there is one.
+ if (perf_map_file) {
+ return perf_map_file;
+ }
+ char filename[100];
+ pid_t pid = getpid();
+ // Location and file name of perf map is hard-coded in perf tool.
+ // Use exclusive create flag with nofollow to prevent symlink attacks.
+ // NOTE(review): with O_EXCL the open() below fails if a map file for this
+ // PID already exists (e.g. a stale one) -- presumably intentional; confirm.
+ int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
+ snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map",
+ (intmax_t)pid);
+ int fd = open(filename, flags, 0600);
+ if (fd == -1) {
+ perf_status = PERF_STATUS_FAILED;
+ PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+ return NULL;
+ }
+ perf_map_file = fdopen(fd, "w");
+ if (!perf_map_file) {
+ perf_status = PERF_STATUS_FAILED;
+ // Raise before close() so that errno still describes the fdopen()
+ // failure.
+ PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
+ close(fd);
+ return NULL;
+ }
+ return perf_map_file;
+}
+
+// Close the perf map stream passed as `state` (if any), forget the cached
+// `perf_map_file` and put the trampoline back in the PERF_STATUS_NO_INIT
+// state. Returns the result of fclose(), or 0 when `state` is NULL.
+static int
+perf_map_close(void *state)
+{
+    FILE *map_file = (FILE *)state;
+    int rc = (map_file != NULL) ? fclose(map_file) : 0;
+
+    perf_map_file = NULL;
+    perf_status = PERF_STATUS_NO_INIT;
+    return rc;
+}
+
+// Append one line describing a trampoline to the perf map stream `state`.
+//
+// The line has the form "<address> <size in hex> py::<qualname>:<filename>",
+// where qualname and filename come from the code object `co`. If either
+// cannot be obtained as UTF-8 the failure is reported through
+// _PyErr_WriteUnraisableMsg() and nothing is written.
+static void
+perf_map_write_entry(void *state, const void *code_addr,
+ unsigned int code_size, PyCodeObject *co)
+{
+ assert(state != NULL);
+ FILE *method_file = (FILE *)state;
+ const char *entry = PyUnicode_AsUTF8(co->co_qualname);
+ if (entry == NULL) {
+ _PyErr_WriteUnraisableMsg("Failed to get qualname from code object",
+ NULL);
+ return;
+ }
+ const char *filename = PyUnicode_AsUTF8(co->co_filename);
+ if (filename == NULL) {
+ _PyErr_WriteUnraisableMsg("Failed to get filename from code object",
+ NULL);
+ return;
+ }
+ fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry,
+ filename);
+ // Flush after every entry so it reaches the file instead of sitting in
+ // the stdio buffer.
+ fflush(method_file);
+}
+
+// Callback table for the Linux "perf" map-file backend: open the map file,
+// write one entry per trampoline, close the map file.
+_PyPerf_Callbacks _Py_perfmap_callbacks = {
+ &perf_map_get_file,
+ &perf_map_write_entry,
+ &perf_map_close
+};
+
+/*
+ * Allocate a new arena of trampolines and make it the current one.
+ *
+ * A 64 KiB anonymous mapping is filled with back-to-back copies of the
+ * assembly template delimited by _Py_trampoline_func_start and
+ * _Py_trampoline_func_end, then switched from read+write to read+execute.
+ * The new arena is linked in front of the previous `code_arena`.
+ *
+ * Returns 0 on success and -1 on failure. Failures are reported through
+ * _PyErr_WriteUnraisableMsg(); a failed mmap() additionally sets
+ * `perf_status` to PERF_STATUS_FAILED.
+ */
+static int
+new_code_arena(void)
+{
+    // non-trivial programs typically need 64 to 256 kiB.
+    size_t mem_size = 4096 * 16;
+    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
+    char *memory =
+        mmap(NULL,  // address
+             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+             -1,    // fd (not used here)
+             0);    // offset (not used here)
+    // mmap() signals failure with MAP_FAILED, not NULL: a NULL test would
+    // let a failed mapping reach the memcpy() below.
+    if (memory == MAP_FAILED) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to create new mmap for perf trampoline", NULL);
+        perf_status = PERF_STATUS_FAILED;
+        return -1;
+    }
+    // Measure the template through char pointers; arithmetic on void
+    // pointers is a compiler extension.
+    const char *start = (const char *)&_Py_trampoline_func_start;
+    const char *end = (const char *)&_Py_trampoline_func_end;
+    size_t code_size = (size_t)(end - start);
+
+    // Pre-populate the whole arena while it is still writable.
+    size_t n_copies = mem_size / code_size;
+    for (size_t i = 0; i < n_copies; i++) {
+        memcpy(memory + i * code_size, start, code_size * sizeof(char));
+    }
+    // Some systems may prevent us from creating executable code on the fly.
+    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
+    if (res == -1) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        munmap(memory, mem_size);
+        _PyErr_WriteUnraisableMsg(
+            "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
+            NULL);
+        return -1;
+    }
+
+    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
+    if (new_arena == NULL) {
+        PyErr_NoMemory();
+        munmap(memory, mem_size);
+        _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
+                                  NULL);
+        return -1;
+    }
+
+    new_arena->start_addr = memory;
+    new_arena->current_addr = memory;
+    new_arena->size = mem_size;
+    new_arena->size_left = mem_size;
+    new_arena->code_size = code_size;
+    // Push onto the singly linked list of arenas.
+    new_arena->prev = code_arena;
+    code_arena = new_arena;
+    return 0;
+}
+
+// Unmap every arena in the `code_arena` list and free its bookkeeping
+// struct. The static list head is cleared before the walk starts.
+static void
+free_code_arenas(void)
+{
+    code_arena_t *arena = code_arena;
+    code_arena = NULL;  // invalid static pointer
+    while (arena != NULL) {
+        code_arena_t *older = arena->prev;
+        munmap(arena->start_addr, arena->size);
+        PyMem_RawFree(arena);
+        arena = older;
+    }
+}
+
+// Hand out the next trampoline slot of `code_arena` and advance the arena's
+// cursor and remaining-size counter by one trampoline. The caller must make
+// sure the arena still has room.
+static inline py_trampoline
+code_arena_new_code(code_arena_t *code_arena)
+{
+    char *slot = code_arena->current_addr;
+    code_arena->current_addr = slot + code_arena->code_size;
+    code_arena->size_left -= code_arena->code_size;
+    return (py_trampoline)slot;
+}
+
+// Return a fresh trampoline, allocating a new arena first when there is no
+// current arena or the current one is (nearly) exhausted. Returns NULL if a
+// new arena was needed and could not be created.
+static inline py_trampoline
+compile_trampoline(void)
+{
+    int need_arena = (code_arena == NULL) ||
+                     (code_arena->size_left <= code_arena->code_size);
+    if (need_arena && new_code_arena() < 0) {
+        return NULL;
+    }
+    assert(code_arena->size_left <= code_arena->size);
+    return code_arena_new_code(code_arena);
+}
+
+// Frame evaluation function installed in interp->eval_frame while the
+// trampoline is active.
+//
+// Looks up the trampoline stored in the code object's "extra" slot
+// (`extra_code_index`). The first time a code object is seen, a trampoline
+// is taken from the arena, reported through trampoline_api.write_state()
+// and stored in that slot. The frame is then evaluated by calling
+// _PyEval_EvalFrameDefault *through* the trampoline.
+//
+// Falls back to calling _PyEval_EvalFrameDefault directly when the
+// trampoline is not in the PERF_STATUS_OK state or no trampoline could be
+// obtained.
+static PyObject *
+py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
+ int throw)
+{
+ if (perf_status == PERF_STATUS_FAILED ||
+ perf_status == PERF_STATUS_NO_INIT) {
+ goto default_eval;
+ }
+ PyCodeObject *co = frame->f_code;
+ py_trampoline f = NULL;
+ assert(extra_code_index != -1);
+ int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
+ if (ret != 0 || f == NULL) {
+ // This is the first time we see this code object so we need
+ // to compile a trampoline for it.
+ py_trampoline new_trampoline = compile_trampoline();
+ if (new_trampoline == NULL) {
+ goto default_eval;
+ }
+ trampoline_api.write_state(trampoline_api.state, new_trampoline,
+ code_arena->code_size, co);
+ // NOTE(review): the result of _PyCode_SetExtra() is not checked; if
+ // storing fails, a new trampoline would presumably be created on the
+ // next call for this code object -- confirm.
+ _PyCode_SetExtra((PyObject *)co, extra_code_index,
+ (void *)new_trampoline);
+ f = new_trampoline;
+ }
+ assert(f != NULL);
+ // The evaluator is passed as the last argument; the trampoline forwards
+ // the first three arguments to it.
+ return f(ts, frame, throw, _PyEval_EvalFrameDefault);
+default_eval:
+ // Something failed, fall back to the default evaluator.
+ return _PyEval_EvalFrameDefault(ts, frame, throw);
+}
+#endif // PY_HAVE_PERF_TRAMPOLINE
+
+// Return 1 if the current interpreter's eval_frame hook is the perf
+// trampoline evaluator and 0 otherwise. Always 0 on builds without
+// PY_HAVE_PERF_TRAMPOLINE.
+int
+_PyIsPerfTrampolineActive(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    return tstate->interp->eval_frame == py_trampoline_evaluator;
+#else
+    return 0;
+#endif
+}
+
+// Copy the currently registered init/write/free callbacks into `callbacks`.
+// Does nothing if `callbacks` is NULL. On builds without
+// PY_HAVE_PERF_TRAMPOLINE the structure is left untouched.
+void
+_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
+{
+ if (callbacks == NULL) {
+ return;
+ }
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+ callbacks->init_state = trampoline_api.init_state;
+ callbacks->write_state = trampoline_api.write_state;
+ callbacks->free_state = trampoline_api.free_state;
+#endif
+ return;
+}
+
+// Register the backend callbacks used by the trampoline.
+//
+// Returns -1 if `callbacks` is NULL and 0 otherwise. On builds without
+// PY_HAVE_PERF_TRAMPOLINE nothing is stored.
+int
+_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
+{
+ if (callbacks == NULL) {
+ return -1;
+ }
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+ // A previous backend still holds state: tear it down first. Note that
+ // _PyPerfTrampoline_Fini() also uninstalls the evaluator and frees the
+ // code arenas.
+ if (trampoline_api.state) {
+ _PyPerfTrampoline_Fini();
+ }
+ trampoline_api.init_state = callbacks->init_state;
+ trampoline_api.write_state = callbacks->write_state;
+ trampoline_api.free_state = callbacks->free_state;
+ trampoline_api.state = NULL;
+ // NOTE(review): the status becomes PERF_STATUS_OK here even though the
+ // backend state is only created later by _PyPerfTrampoline_Init().
+ perf_status = PERF_STATUS_OK;
+#endif
+ return 0;
+}
+
+/*
+ * Activate (activate != 0) or deactivate (activate == 0) the trampoline
+ * evaluator for the current interpreter.
+ *
+ * Fails with RuntimeError if a different custom frame evaluator is already
+ * installed. Deactivation only resets interp->eval_frame; arenas and
+ * backend state are kept until _PyPerfTrampoline_Fini().
+ *
+ * Activation allocates a code arena, creates the backend state through
+ * trampoline_api.init_state() if there is none yet, and requests a code
+ * "extra" index. The evaluator is installed only after all of these steps
+ * succeeded, so a failed activation does not leave the interpreter
+ * reporting an active trampoline.
+ *
+ * Returns 0 on success and -1 on failure.
+ * NOTE(review): some failure paths (arena allocation, extra index) may
+ * return -1 without a Python exception set -- confirm what callers expect.
+ *
+ * On builds without PY_HAVE_PERF_TRAMPOLINE this is a no-op returning 0.
+ */
+int
+_PyPerfTrampoline_Init(int activate)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->interp->eval_frame &&
+        tstate->interp->eval_frame != py_trampoline_evaluator) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Trampoline cannot be initialized as a custom eval "
+                        "frame is already present");
+        return -1;
+    }
+    if (!activate) {
+        tstate->interp->eval_frame = NULL;
+        return 0;
+    }
+    if (new_code_arena() < 0) {
+        return -1;
+    }
+    if (trampoline_api.state == NULL) {
+        void *state = trampoline_api.init_state();
+        if (state == NULL) {
+            return -1;
+        }
+        trampoline_api.state = state;
+    }
+    extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
+    if (extra_code_index == -1) {
+        return -1;
+    }
+    perf_status = PERF_STATUS_OK;
+    // Install the evaluator last, once everything it relies on is ready.
+    tstate->interp->eval_frame = py_trampoline_evaluator;
+#endif
+    return 0;
+}
+
+// Tear down the trampoline: uninstall the evaluator if it is the active
+// eval_frame hook, unmap all code arenas, release the backend state through
+// trampoline_api.free_state() and forget the code "extra" index.
+// Always returns 0. A no-op on builds without PY_HAVE_PERF_TRAMPOLINE.
+int
+_PyPerfTrampoline_Fini(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+ PyThreadState *tstate = _PyThreadState_GET();
+ if (tstate->interp->eval_frame == py_trampoline_evaluator) {
+ tstate->interp->eval_frame = NULL;
+ }
+ free_code_arenas();
+ if (trampoline_api.state != NULL) {
+ // The return value of free_state() is ignored.
+ trampoline_api.free_state(trampoline_api.state);
+ trampoline_api.state = NULL;
+ }
+ extra_code_index = -1;
+#endif
+ return 0;
+}
+
+// Reset the trampoline in a forked child: everything inherited from the
+// parent is torn down and, if the trampoline was active, it is initialized
+// again (the perf map file name depends on the PID, so the child gets its
+// own file). Always returns PyStatus_Ok().
+PyStatus
+_PyPerfTrampoline_AfterFork_Child(void)
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+ // Restart trampoline in file in child.
+ // Sample the state first: _PyPerfTrampoline_Fini() uninstalls the
+ // evaluator.
+ int was_active = _PyIsPerfTrampolineActive();
+ _PyPerfTrampoline_Fini();
+ if (was_active) {
+ // NOTE(review): the result of _PyPerfTrampoline_Init() is ignored, so
+ // a failed re-initialization is silent and may leave a Python
+ // exception set -- confirm whether this should be cleared or reported.
+ _PyPerfTrampoline_Init(1);
+ }
+#endif
+ return PyStatus_Ok();
+}
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index 4c00729..0e446fe 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -129,6 +129,7 @@
<ClCompile Include="..\Objects\cellobject.c" />
<ClCompile Include="..\Objects\classobject.c" />
<ClCompile Include="..\Objects\codeobject.c" />
+ <ClCompile Include="..\Objects\perf_trampoline.c" />
<ClCompile Include="..\Objects\complexobject.c" />
<ClCompile Include="..\Objects\descrobject.c" />
<ClCompile Include="..\Objects\dictobject.c" />
diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index 5c98499..96ab2f2 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -85,6 +85,9 @@
<ClCompile Include="..\Objects\codeobject.c">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="..\Objects\perf_trampoline.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
<ClCompile Include="..\Python\compile.c">
<Filter>Source Files</Filter>
</ClCompile>
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 45e5013..ff17304 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -429,6 +429,7 @@
<ClCompile Include="..\Objects\cellobject.c" />
<ClCompile Include="..\Objects\classobject.c" />
<ClCompile Include="..\Objects\codeobject.c" />
+ <ClCompile Include="..\Objects\perf_trampoline.c" />
<ClCompile Include="..\Objects\complexobject.c" />
<ClCompile Include="..\Objects\descrobject.c" />
<ClCompile Include="..\Objects\dictobject.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 581ea6e..7d7fe72 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -923,6 +923,9 @@
<ClCompile Include="..\Objects\codeobject.c">
<Filter>Objects</Filter>
</ClCompile>
+ <ClCompile Include="..\Objects\perf_trampoline.c">
+ <Filter>Objects</Filter>
+ </ClCompile>
<ClCompile Include="..\Objects\complexobject.c">
<Filter>Objects</Filter>
</ClCompile>
diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h
index 0f96366..ddf01a7 100644
--- a/Python/clinic/sysmodule.c.h
+++ b/Python/clinic/sysmodule.c.h
@@ -1151,6 +1151,79 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
#endif /* defined(ANDROID_API_LEVEL) */
+PyDoc_STRVAR(sys_activate_stack_trampoline__doc__,
+"activate_stack_trampoline($module, backend, /)\n"
+"--\n"
+"\n"
+"Activate the perf profiler trampoline.");
+
+#define SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF \
+ {"activate_stack_trampoline", (PyCFunction)sys_activate_stack_trampoline, METH_O, sys_activate_stack_trampoline__doc__},
+
+static PyObject *
+sys_activate_stack_trampoline_impl(PyObject *module, const char *backend);
+
+static PyObject *
+sys_activate_stack_trampoline(PyObject *module, PyObject *arg)
+{
+ PyObject *return_value = NULL;
+ const char *backend;
+
+ if (!PyUnicode_Check(arg)) {
+ _PyArg_BadArgument("activate_stack_trampoline", "argument", "str", arg);
+ goto exit;
+ }
+ Py_ssize_t backend_length;
+ backend = PyUnicode_AsUTF8AndSize(arg, &backend_length);
+ if (backend == NULL) {
+ goto exit;
+ }
+ if (strlen(backend) != (size_t)backend_length) {
+ PyErr_SetString(PyExc_ValueError, "embedded null character");
+ goto exit;
+ }
+ return_value = sys_activate_stack_trampoline_impl(module, backend);
+
+exit:
+ return return_value;
+}
+
+PyDoc_STRVAR(sys_deactivate_stack_trampoline__doc__,
+"deactivate_stack_trampoline($module, /)\n"
+"--\n"
+"\n"
+"Dectivate the perf profiler trampoline.");
+
+#define SYS_DEACTIVATE_STACK_TRAMPOLINE_METHODDEF \
+ {"deactivate_stack_trampoline", (PyCFunction)sys_deactivate_stack_trampoline, METH_NOARGS, sys_deactivate_stack_trampoline__doc__},
+
+static PyObject *
+sys_deactivate_stack_trampoline_impl(PyObject *module);
+
+static PyObject *
+sys_deactivate_stack_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+ return sys_deactivate_stack_trampoline_impl(module);
+}
+
+PyDoc_STRVAR(sys_is_stack_trampoline_active__doc__,
+"is_stack_trampoline_active($module, /)\n"
+"--\n"
+"\n"
+"Returns *True* if the perf profiler trampoline is active.");
+
+#define SYS_IS_STACK_TRAMPOLINE_ACTIVE_METHODDEF \
+ {"is_stack_trampoline_active", (PyCFunction)sys_is_stack_trampoline_active, METH_NOARGS, sys_is_stack_trampoline_active__doc__},
+
+static PyObject *
+sys_is_stack_trampoline_active_impl(PyObject *module);
+
+static PyObject *
+sys_is_stack_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored))
+{
+ return sys_is_stack_trampoline_active_impl(module);
+}
+
#ifndef SYS_GETWINDOWSVERSION_METHODDEF
#define SYS_GETWINDOWSVERSION_METHODDEF
#endif /* !defined(SYS_GETWINDOWSVERSION_METHODDEF) */
@@ -1194,4 +1267,4 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored))
#ifndef SYS_GETANDROIDAPILEVEL_METHODDEF
#define SYS_GETANDROIDAPILEVEL_METHODDEF
#endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */
-/*[clinic end generated code: output=322fb0409e376ad4 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=43b44240211afe95 input=a9049054013a1b77]*/
diff --git a/Python/initconfig.c b/Python/initconfig.c
index 70f0363..33a8f27 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -118,6 +118,11 @@ The following implementation-specific options are available:\n\
files are desired as well as suppressing the extra visual location indicators \n\
when the interpreter displays tracebacks.\n\
\n\
+-X perf: activate support for the Linux \"perf\" profiler by activating the \"perf\"\n\
+ trampoline. When this option is activated, the Linux \"perf\" profiler will be \n\
+ able to report Python calls. This option is only available on some platforms and will \n\
+ do nothing if is not supported on the current system. The default value is \"off\".\n\
+\n\
-X frozen_modules=[on|off]: whether or not frozen modules should be used.\n\
The default is \"on\" (or \"off\" if you are running a local build).";
@@ -745,6 +750,7 @@ _PyConfig_InitCompatConfig(PyConfig *config)
config->use_hash_seed = -1;
config->faulthandler = -1;
config->tracemalloc = -1;
+ config->perf_profiling = -1;
config->module_search_paths_set = 0;
config->parse_argv = 0;
config->site_import = -1;
@@ -829,6 +835,7 @@ PyConfig_InitIsolatedConfig(PyConfig *config)
config->use_hash_seed = 0;
config->faulthandler = 0;
config->tracemalloc = 0;
+ config->perf_profiling = 0;
config->safe_path = 1;
config->pathconfig_warnings = 0;
#ifdef MS_WINDOWS
@@ -940,6 +947,7 @@ _PyConfig_Copy(PyConfig *config, const PyConfig *config2)
COPY_ATTR(_install_importlib);
COPY_ATTR(faulthandler);
COPY_ATTR(tracemalloc);
+ COPY_ATTR(perf_profiling);
COPY_ATTR(import_time);
COPY_ATTR(code_debug_ranges);
COPY_ATTR(show_ref_count);
@@ -1050,6 +1058,7 @@ _PyConfig_AsDict(const PyConfig *config)
SET_ITEM_UINT(hash_seed);
SET_ITEM_INT(faulthandler);
SET_ITEM_INT(tracemalloc);
+ SET_ITEM_INT(perf_profiling);
SET_ITEM_INT(import_time);
SET_ITEM_INT(code_debug_ranges);
SET_ITEM_INT(show_ref_count);
@@ -1331,6 +1340,7 @@ _PyConfig_FromDict(PyConfig *config, PyObject *dict)
CHECK_VALUE("hash_seed", config->hash_seed <= MAX_HASH_SEED);
GET_UINT(faulthandler);
GET_UINT(tracemalloc);
+ GET_UINT(perf_profiling);
GET_UINT(import_time);
GET_UINT(code_debug_ranges);
GET_UINT(show_ref_count);
@@ -1687,6 +1697,26 @@ config_read_env_vars(PyConfig *config)
return _PyStatus_OK();
}
+/* Enable config->perf_profiling when requested by the environment or the
+ * command line: either PYTHONPERFSUPPORT parses as a non-zero integer, or
+ * the "-X perf" option is present. The field is left untouched otherwise.
+ * Always returns _PyStatus_OK().
+ */
+static PyStatus
+config_init_perf_profiling(PyConfig *config)
+{
+    const char *env = config_get_env(config, "PYTHONPERFSUPPORT");
+    if (env != NULL) {
+        int value = 0;
+        // An unparsable value is treated like 0.
+        if (_Py_str_to_int(env, &value) == 0 && value != 0) {
+            config->perf_profiling = 1;
+        }
+    }
+    if (config_get_xoption(config, L"perf") != NULL) {
+        config->perf_profiling = 1;
+    }
+    return _PyStatus_OK();
+}
static PyStatus
config_init_tracemalloc(PyConfig *config)
@@ -1788,6 +1818,12 @@ config_read_complex_options(PyConfig *config)
return status;
}
}
+ if (config->perf_profiling < 0) {
+ status = config_init_perf_profiling(config);
+ if (_PyStatus_EXCEPTION(status)) {
+ return status;
+ }
+ }
if (config->pycache_prefix == NULL) {
status = config_init_pycache_prefix(config);
@@ -2104,6 +2140,9 @@ config_read(PyConfig *config, int compute_path_config)
if (config->tracemalloc < 0) {
config->tracemalloc = 0;
}
+ if (config->perf_profiling < 0) {
+ config->perf_profiling = 0;
+ }
if (config->use_hash_seed < 0) {
config->use_hash_seed = 0;
config->hash_seed = 0;
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index bb646f1..8ce6d71 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -1149,6 +1149,16 @@ init_interp_main(PyThreadState *tstate)
if (_PyTraceMalloc_Init(config->tracemalloc) < 0) {
return _PyStatus_ERR("can't initialize tracemalloc");
}
+
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+ if (config->perf_profiling) {
+ if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ||
+ _PyPerfTrampoline_Init(config->perf_profiling) < 0) {
+ return _PyStatus_ERR("can't initialize the perf trampoline");
+ }
+ }
+#endif
}
status = init_sys_streams(tstate);
@@ -1723,6 +1733,7 @@ finalize_interp_clear(PyThreadState *tstate)
_PyArg_Fini();
_Py_ClearFileSystemEncoding();
_Py_Deepfreeze_Fini();
+ _PyPerfTrampoline_Fini();
}
finalize_interp_types(tstate->interp);
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index c286438..75e6455 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2053,6 +2053,80 @@ sys_getandroidapilevel_impl(PyObject *module)
}
#endif /* ANDROID_API_LEVEL */
+/*[clinic input]
+sys.activate_stack_trampoline
+
+ backend: str
+ /
+
+Activate the perf profiler trampoline.
+[clinic start generated code]*/
+
+static PyObject *
+sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
+/*[clinic end generated code: output=5783cdeb51874b43 input=b09020e3a17c78c5]*/
+{
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+ // "perf" is the only backend name accepted here; any other name raises
+ // ValueError.
+ if (strcmp(backend, "perf") == 0) {
+ _PyPerf_Callbacks cur_cb;
+ _PyPerfTrampoline_GetCallbacks(&cur_cb);
+ // Register the perf map callbacks only if they are not the ones
+ // currently installed.
+ if (cur_cb.init_state != _Py_perfmap_callbacks.init_state) {
+ if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ) {
+ PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline");
+ return NULL;
+ }
+ }
+ }
+ else {
+ PyErr_Format(PyExc_ValueError, "invalid backend: %s", backend);
+ return NULL;
+ }
+ if (_PyPerfTrampoline_Init(1) < 0) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
+#else
+ // Built without trampoline support: always an error.
+ PyErr_SetString(PyExc_ValueError, "perf trampoline not available");
+ return NULL;
+#endif
+}
+
+
+/*[clinic input]
+sys.deactivate_stack_trampoline
+
+Dectivate the perf profiler trampoline.
+[clinic start generated code]*/
+
+static PyObject *
+sys_deactivate_stack_trampoline_impl(PyObject *module)
+/*[clinic end generated code: output=b50da25465df0ef1 input=491f4fc1ed615736]*/
+{
+ // Passing 0 asks _PyPerfTrampoline_Init() to deactivate the trampoline;
+ // a negative result is propagated to the caller as an error.
+ if (_PyPerfTrampoline_Init(0) < 0) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
+}
+
+/*[clinic input]
+sys.is_stack_trampoline_active
+
+Returns *True* if the perf profiler trampoline is active.
+[clinic start generated code]*/
+
+static PyObject *
+sys_is_stack_trampoline_active_impl(PyObject *module)
+/*[clinic end generated code: output=ab2746de0ad9d293 input=061fa5776ac9dd59]*/
+{
+    // True only when the build has trampoline support and the trampoline
+    // evaluator is currently installed.
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+    return PyBool_FromLong(_PyIsPerfTrampolineActive());
+#else
+    Py_RETURN_FALSE;
+#endif
+}
+
+
static PyMethodDef sys_methods[] = {
/* Might as well keep this in alphabetic order */
SYS_ADDAUDITHOOK_METHODDEF
@@ -2108,6 +2182,9 @@ static PyMethodDef sys_methods[] = {
METH_VARARGS | METH_KEYWORDS, set_asyncgen_hooks_doc},
SYS_GET_ASYNCGEN_HOOKS_METHODDEF
SYS_GETANDROIDAPILEVEL_METHODDEF
+ SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF
+ SYS_DEACTIVATE_STACK_TRAMPOLINE_METHODDEF
+ SYS_IS_STACK_TRAMPOLINE_ACTIVE_METHODDEF
SYS_UNRAISABLEHOOK_METHODDEF
#ifdef Py_STATS
SYS__STATS_ON_METHODDEF
diff --git a/configure b/configure
index fc7f7fa..9522977 100755
--- a/configure
+++ b/configure
@@ -861,6 +861,7 @@ LIBEXPAT_CFLAGS
TZPATH
LIBUUID_LIBS
LIBUUID_CFLAGS
+PERF_TRAMPOLINE_OBJ
SHLIBS
CFLAGSFORSHARED
LINKFORSHARED
@@ -11498,6 +11499,35 @@ esac
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $SHLIBS" >&5
$as_echo "$SHLIBS" >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking perf trampoline" >&5
+$as_echo_n "checking perf trampoline... " >&6; }
+case $PLATFORM_TRIPLET in #(
+ x86_64-linux-gnu) :
+ perf_trampoline=yes ;; #(
+ aarch64-linux-gnu) :
+ perf_trampoline=yes ;; #(
+ *) :
+ perf_trampoline=no
+ ;;
+esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $perf_trampoline" >&5
+$as_echo "$perf_trampoline" >&6; }
+
+if test "x$perf_trampoline" = xyes; then :
+
+
+$as_echo "#define PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h
+
+ PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+
+ if test "x$Py_DEBUG" = xtrue; then :
+
+ as_fn_append BASECFLAGS " -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"
+
+fi
+
+fi
+
# checks for libraries
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for sendfile in -lsendfile" >&5
diff --git a/configure.ac b/configure.ac
index 2b927cd..3a009bb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3452,6 +3452,26 @@ case "$ac_sys_system" in
esac
AC_MSG_RESULT($SHLIBS)
+dnl perf trampoline is Linux specific and requires an arch-specific
+dnl trampoline in assembly.
+AC_MSG_CHECKING([perf trampoline])
+AS_CASE([$PLATFORM_TRIPLET],
+ [x86_64-linux-gnu], [perf_trampoline=yes],
+ [aarch64-linux-gnu], [perf_trampoline=yes],
+ [perf_trampoline=no]
+)
+AC_MSG_RESULT([$perf_trampoline])
+
+AS_VAR_IF([perf_trampoline], [yes], [
+ AC_DEFINE([PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.])
+ PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o
+
+ dnl perf needs frame pointers for unwinding, include compiler option in debug builds
+ AS_VAR_IF([Py_DEBUG], [true], [
+ AS_VAR_APPEND([BASECFLAGS], [" -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"])
+ ])
+])
+AC_SUBST([PERF_TRAMPOLINE_OBJ])
# checks for libraries
AC_CHECK_LIB(sendfile, sendfile)
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 10e7ad1..1ce0985 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -1568,6 +1568,9 @@
/* Define if you want to coerce the C locale to a UTF-8 based locale */
#undef PY_COERCE_C_LOCALE
+/* Define to 1 if you have the perf trampoline. */
+#undef PY_HAVE_PERF_TRAMPOLINE
+
/* Define to 1 to build the sqlite module with loadable extensions support. */
#undef PY_SQLITE_ENABLE_LOAD_EXTENSION