Python/perf_trampoline.c - platform/external/python/cpython3 - Git at Google

 /*

 Perf trampoline instrumentation
 ===============================

 This file contains instrumentation to allow to associate
 calls to the CPython eval loop back to the names of the Python
 functions and filename being executed.

 Many native performance profilers like the Linux perf tools are
 only able to 'see' the C stack when sampling from the profiled
 process. This means that if we have the following python code:

     import time
     def foo(n):
         # Some CPU intensive code

     def bar(n):
         foo(n)

     def baz(n):
         bar(n)

     baz(10000000)

 A performance profiler that is only able to see native frames will
 produce the following backtrace when sampling from foo():

     _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
     _PyEval_Vector
     _PyFunction_Vectorcall
     PyObject_Vectorcall
     call_function

     _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
     _PyEval_EvalFrame
     _PyEval_Vector
     _PyFunction_Vectorcall
     PyObject_Vectorcall
     call_function

     _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
     _PyEval_EvalFrame
     _PyEval_Vector
     _PyFunction_Vectorcall
     PyObject_Vectorcall
     call_function

     ...

     Py_RunMain

 Because the profiler is only able to see the native frames and the native
 function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
 then the profiler and any reporter generated by it will not be able to
 associate the names of the Python functions and the filenames associated with
 those calls, rendering the results useless in the Python world.

 To fix this problem, we introduce the concept of a trampoline frame. A
 trampoline frame is a piece of code that is unique per Python code object that
 is executed before entering the CPython eval loop. This piece of code just
 calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
 forwards all the arguments received. In this way, when a profiler samples
 frames from the previous example it will see:

     _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
     [Jit compiled code 3]
     _PyEval_Vector
     _PyFunction_Vectorcall
     PyObject_Vectorcall
     call_function

     _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
     [Jit compiled code 2]
     _PyEval_EvalFrame
     _PyEval_Vector
     _PyFunction_Vectorcall
     PyObject_Vectorcall
     call_function

     _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
     [Jit compiled code 1]
     _PyEval_EvalFrame
     _PyEval_Vector
     _PyFunction_Vectorcall
     PyObject_Vectorcall
     call_function

     ...

     Py_RunMain

 When we generate every unique copy of the trampoline (what here we called "[Jit
 compiled code N]") we write the relationship between the compiled code and the
 Python function that is associated with it. Every profiler requires this
 information in a different format. For example, the Linux "perf" profiler
 requires a file in "/tmp/perf-PID.map" (name and location not configurable)
 with the following format:

     <compiled code address> <compiled code size> <name of the compiled code>

 If this file is available when "perf" generates reports, it will automatically
 associate every trampoline with the Python function that it is associated with
 allowing it to generate reports that include Python information. These reports
 then can also be filtered in a way that *only* Python information appears.

 Notice that for this to work, there must be a unique copy of the trampoline
 per Python code object even if the code in the trampoline is the same. To
 achieve this we have an assembly template in Objects/asm_trampoline.S that is
 compiled into the Python executable/shared library. This template generates a
 symbol that marks the start of the assembly code and another that marks the end
 of the assembly code for the trampoline.  Then, every time we need a unique
 trampoline for a Python code object, we copy the assembly code into a mmaped
 area that has executable permissions and we return the start of that area as
 our trampoline function.

 Asking for a mmap-ed memory area for every trampoline is very wasteful so we
 allocate big arenas of memory in a single mmap call, we populate the entire
 arena with copies of the trampoline (this allows us to not have to invalidate
 the icache for the instructions in the page every time a trampoline is handed
 out) and then we return the next available chunk every time someone asks for
 a new trampoline. We keep a linked list of arenas in case the current memory
 arena is exhausted and another one is needed.

 For the best results, Python should be compiled with
 CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
 profilers to unwind using only the frame pointer and not rely on DWARF debug
 information (note that as trampolines are dynamically generated there won't be
 any DWARF information available for them).
 */

 #include "Python.h"
 #include "pycore_ceval.h"         // _PyPerf_Callbacks
 #include "pycore_frame.h"
 #include "pycore_interp.h"


 #ifdef PY_HAVE_PERF_TRAMPOLINE

 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>             // mmap()
 #include <sys/types.h>
 #include <unistd.h>               // sysconf()
 #include <sys/time.h>           // gettimeofday()


 #if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
 #define PY_HAVE_INVALIDATE_ICACHE

 #if defined(__clang__) || defined(__GNUC__)
 extern void __clear_cache(void *, void*);
 #endif

 /*
  * Invalidate the instruction cache for the address range [begin, end).
  *
  * Only compiled on arm/aarch64 targets.  With Clang or GCC this forwards to
  * __clear_cache(); with any other compiler it does nothing.
  */
 static void invalidate_icache(char* begin, char*end) {
 #if defined(__clang__) || defined(__GNUC__)
     // Plain call: a `return <expr>;` is not allowed in a void function in ISO C.
     __clear_cache(begin, end);
 #else
     (void)begin;
     (void)end;
 #endif
 }
 #endif

 /* The function pointer is passed as last argument. The other three arguments
  * are passed in the same order as the function requires. This results in
  * shorter, more efficient ASM code for trampoline.
  *
  * py_evaluator:  signature of a frame evaluation function; in this file
  *                _PyEval_EvalFrameDefault is the value passed with this type.
  * py_trampoline: signature of a generated trampoline; it takes the same
  *                three arguments followed by the evaluator to forward to.
  */
 typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
                                   int throwflag);
 typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int,
                                    py_evaluator);

 // Symbols delimiting the trampoline template.  Only their ADDRESSES are used
 // in this file: new_code_arena() copies the bytes between them into each arena.
 extern void *_Py_trampoline_func_start;  // Start of the template of the
                                          // assembly trampoline
 extern void *
     _Py_trampoline_func_end;  // End of the template of the assembly trampoline

 // One mmap-ed arena pre-filled with copies of the trampoline template.
 // Arenas form a singly linked list whose head is `perf_code_arena`.
 struct code_arena_st {
     char *start_addr;    // Start of the memory arena
     char *current_addr;  // Address of the next trampoline to hand out
     size_t size;         // Size of the memory arena
     size_t size_left;    // Remaining size of the memory arena
     size_t code_size;    // Size of the code of every trampoline in the arena
     struct code_arena_st
         *prev;  // Arena that was the list head before this one was
                 // created, or NULL if this is the first arena.
 };

 typedef struct code_arena_st code_arena_t;
 typedef struct trampoline_api_st trampoline_api_t;

 // Which backend the trampoline is currently writing to.
 // In this file only UNSET (set by _PyPerfTrampoline_Fini) and TYPE_MAP (set by
 // perf_map_init_state) are assigned; TYPE_JITDUMP is presumably set by another
 // backend outside this file.
 enum perf_trampoline_type {
     PERF_TRAMPOLINE_UNSET = 0,
     PERF_TRAMPOLINE_TYPE_MAP = 1,
     PERF_TRAMPOLINE_TYPE_JITDUMP = 2,
 };

 // Short aliases for the perf trampoline fields stored in _PyRuntime.ceval.perf.
 // They expand to lvalues, so they can be both read and assigned.
 #define perf_status _PyRuntime.ceval.perf.status
 #define extra_code_index _PyRuntime.ceval.perf.extra_code_index
 #define perf_code_arena _PyRuntime.ceval.perf.code_arena
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api
 #define perf_map_file _PyRuntime.ceval.perf.map_file
 #define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
 #define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type

 /*
  * write_state callback of the perf-map backend.
  *
  * Builds the symbol name "py::<qualname>:<filename>" for `co` and records it
  * for the trampoline at `code_addr` (of `code_size` bytes) through
  * PyUnstable_WritePerfMapEntry().  `state` is unused.  The function is
  * best-effort: it returns silently if the name cannot be built.
  */
 static void
 perf_map_write_entry(void *state, const void *code_addr,
                          unsigned int code_size, PyCodeObject *co)
 {
     (void)state;
     const char *entry = NULL;
     if (co->co_qualname != NULL) {
         entry = PyUnicode_AsUTF8(co->co_qualname);
     }
     if (entry == NULL) {
         // PyUnicode_AsUTF8() returns NULL on failure and passing NULL to "%s"
         // is undefined behaviour, so fall back to an empty name.
         // NOTE(review): the exception it raised is left pending, exactly as
         // before -- confirm whether it should be cleared here.
         entry = "";
     }
     const char *filename = NULL;
     if (co->co_filename != NULL) {
         filename = PyUnicode_AsUTF8(co->co_filename);
     }
     if (filename == NULL) {
         filename = "";
     }
     // First pass only measures; snprintf() reports failure with a negative
     // value, which must not be converted into a buffer size.
     int name_len = snprintf(NULL, 0, "py::%s:%s", entry, filename);
     if (name_len < 0) {
         return;
     }
     size_t perf_map_entry_size = (size_t)name_len + 1;
     char *perf_map_entry = PyMem_RawMalloc(perf_map_entry_size);
     if (perf_map_entry == NULL) {
         return;
     }
     snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
     PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
     PyMem_RawFree(perf_map_entry);
 }

 /*
  * init_state callback of the perf-map backend.
  *
  * Initialises the perf map machinery, marks the active backend as
  * PERF_TRAMPOLINE_TYPE_MAP and requests no padding between trampolines.
  * This backend keeps no private state, hence the NULL return value.
  */
 static void*
 perf_map_init_state(void)
 {
     PyUnstable_PerfMapState_Init();
     perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
     trampoline_api.code_padding = 0;
     return NULL;
 }

 /*
  * free_state callback of the perf-map backend: shuts the perf map machinery
  * down.  The `state` argument is ignored; always returns 0.
  */
 static int
 perf_map_free_state(void *state)
 {
     (void)state;
     PyUnstable_PerfMapState_Fini();
     return 0;
 }

 _PyPerf_Callbacks _Py_perfmap_callbacks = {
     &perf_map_init_state,
     &perf_map_write_entry,
     &perf_map_free_state,
 };


 /*
  * Round `value` up to the next multiple of `multiple`.
  *
  * A value that is already a multiple is returned unchanged, and so is any
  * value when `multiple` is 0 (which also avoids a modulo by zero).
  */
 static size_t round_up(int64_t value, int64_t multiple) {
     if (multiple == 0) {
         return value;
     }

     const int64_t excess = value % multiple;
     return (excess == 0) ? value : value + (multiple - excess);
 }

 // TRAMPOLINE MANAGEMENT API

 /*
  * Create a new arena and make it the head of the arena list.
  *
  * The arena is a single anonymous mapping that is filled with as many copies
  * of the trampoline template as fit, then switched to read+execute.
  *
  * Returns 0 on success and -1 on failure.  Every failure is reported through
  * PyErr_FormatUnraisable(); only the mmap() failure additionally sets
  * perf_status to PERF_STATUS_FAILED.
  */
 static int
 new_code_arena(void)
 {
     // non-trivial programs typically need 64 to 256 kiB.
     size_t mem_size = 4096 * 16;
     assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
     char *memory =
         mmap(NULL,  // address
              mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
              -1,  // fd (not used here)
              0);  // offset (not used here)
     if (memory == MAP_FAILED) {
         PyErr_SetFromErrno(PyExc_OSError);
         PyErr_FormatUnraisable("Failed to create new mmap for perf trampoline");
         perf_status = PERF_STATUS_FAILED;
         return -1;
     }
     // The two symbols only mark addresses.  Measure the template through byte
     // pointers: subtracting `void *` values is a GNU extension, not ISO C.
     const char *start = (const char *)&_Py_trampoline_func_start;
     const char *end = (const char *)&_Py_trampoline_func_end;
     size_t code_size = (size_t)(end - start);
     size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
     // TODO: Check the effect of alignment of the code chunks. Initial investigation
     // showed that this has no effect on performance in x86-64 or aarch64 and the current
     // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
     //
     // We should check the values in the future and see if there is a
     // measurable performance improvement by rounding trampolines up to 32-bit
     // or 64-bit alignment.

     size_t n_copies = mem_size / chunk_size;
     for (size_t i = 0; i < n_copies; i++) {
         memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
     }
     // Some systems may prevent us from creating executable code on the fly.
     int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
     if (res == -1) {
         // Capture errno in the exception before munmap() can overwrite it.
         PyErr_SetFromErrno(PyExc_OSError);
         munmap(memory, mem_size);
         PyErr_FormatUnraisable("Failed to set mmap for perf trampoline to "
                                "PROT_READ | PROT_EXEC");
         return -1;
     }

 #ifdef PY_HAVE_INVALIDATE_ICACHE
     // Before the JIT can run a block of code that has been emitted it must invalidate
     // the instruction cache on some platforms like arm and aarch64.
     invalidate_icache(memory, memory + mem_size);
 #endif

     code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
     if (new_arena == NULL) {
         PyErr_NoMemory();
         munmap(memory, mem_size);
         PyErr_FormatUnraisable("Failed to allocate new code arena struct for perf trampoline");
         return -1;
     }

     new_arena->start_addr = memory;
     new_arena->current_addr = memory;
     new_arena->size = mem_size;
     new_arena->size_left = mem_size;
     new_arena->code_size = code_size;
     // Push onto the list: the previous head becomes `prev`.
     new_arena->prev = perf_code_arena;
     perf_code_arena = new_arena;
     return 0;
 }

 /*
  * Unmap every arena and free its bookkeeping struct.
  *
  * The global list head is cleared first so that it never points at an
  * arena that has already been released.
  */
 static void
 free_code_arenas(void)
 {
     code_arena_t *arena = perf_code_arena;
     perf_code_arena = NULL;  // invalid static pointer
     while (arena != NULL) {
         code_arena_t *older = arena->prev;
         munmap(arena->start_addr, arena->size);
         PyMem_RawFree(arena);
         arena = older;
     }
 }

 /*
  * Hand out the next trampoline slot of `code_arena`.
  *
  * Returns the slot at `current_addr` and advances the arena by one slot
  * (code size plus padding, rounded up to 16 bytes).  No capacity check is
  * done here; the caller must make sure a slot is available.
  */
 static inline py_trampoline
 code_arena_new_code(code_arena_t *code_arena)
 {
     const size_t slot_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
     char *slot = code_arena->current_addr;
     code_arena->current_addr = slot + slot_size;
     code_arena->size_left -= slot_size;
     return (py_trampoline)slot;
 }

 /*
  * Return a fresh trampoline, creating a new arena first when there is no
  * arena yet or the current one cannot hold more than one further slot.
  *
  * Returns NULL if a new arena was needed and could not be created.
  */
 static inline py_trampoline
 compile_trampoline(void)
 {
     // Test the list head BEFORE reading through it: the slot size lives in
     // the arena itself, so computing it first would dereference NULL when no
     // arena exists.
     int need_arena = (perf_code_arena == NULL);
     if (!need_arena) {
         size_t total_code_size = round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16);
         need_arena = (perf_code_arena->size_left <= total_code_size);
     }
     if (need_arena && new_code_arena() < 0) {
         return NULL;
     }
     assert(perf_code_arena->size_left <= perf_code_arena->size);
     return code_arena_new_code(perf_code_arena);
 }

 /*
  * Frame evaluation function installed while the perf trampoline is active.
  *
  * Looks up (or lazily creates and registers) the trampoline attached to the
  * frame's code object and evaluates the frame through it.  Whenever the
  * trampoline machinery is not usable the frame is evaluated directly with
  * _PyEval_EvalFrameDefault().
  */
 static PyObject *
 py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                         int throw)
 {
     if (perf_status == PERF_STATUS_FAILED ||
         perf_status == PERF_STATUS_NO_INIT) {
         // Not initialised or broken: use the default evaluator.
         return _PyEval_EvalFrameDefault(ts, frame, throw);
     }
     PyCodeObject *code = _PyFrame_GetCode(frame);
     py_trampoline cached = NULL;
     assert(extra_code_index != -1);
     int lookup = _PyCode_GetExtra((PyObject *)code, extra_code_index, (void **)&cached);
     if (lookup == 0 && cached != NULL) {
         return cached(ts, frame, throw, _PyEval_EvalFrameDefault);
     }

     // This is the first time we see this code object so we need
     // to compile a trampoline for it.
     py_trampoline fresh = compile_trampoline();
     if (fresh == NULL) {
         // Something failed, fall back to the default evaluator.
         return _PyEval_EvalFrameDefault(ts, frame, throw);
     }
     trampoline_api.write_state(trampoline_api.state, fresh,
                                perf_code_arena->code_size, code);
     _PyCode_SetExtra((PyObject *)code, extra_code_index, (void *)fresh);
     return fresh(ts, frame, throw, _PyEval_EvalFrameDefault);
 }
 #endif  // PY_HAVE_PERF_TRAMPOLINE

 /*
  * Make sure `co` has a trampoline attached, creating and registering one if
  * it has none yet.
  *
  * Returns the result of _PyCode_SetExtra() when a trampoline was created and
  * 0 in every other case (already present, creation failed, or trampoline
  * support not compiled in).
  */
 int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     py_trampoline existing = NULL;
     assert(extra_code_index != -1);
     int lookup = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&existing);
     if (lookup == 0 && existing != NULL) {
         return 0;
     }
     py_trampoline fresh = compile_trampoline();
     if (fresh == NULL) {
         return 0;
     }
     trampoline_api.write_state(trampoline_api.state, fresh,
                                perf_code_arena->code_size, co);
     return _PyCode_SetExtra((PyObject *)co, extra_code_index, (void *)fresh);
 #else
     return 0;
 #endif // PY_HAVE_PERF_TRAMPOLINE
 }

 /*
  * Return non-zero when the current interpreter's frame evaluator is the perf
  * trampoline evaluator; always 0 when trampoline support is not compiled in.
  */
 int
 _PyIsPerfTrampolineActive(void)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     return tstate->interp->eval_frame == py_trampoline_evaluator;
 #else
     return 0;
 #endif
 }

 /*
  * Copy the currently installed init/write/free callbacks into `callbacks`.
  * Does nothing when `callbacks` is NULL or trampoline support is not
  * compiled in.
  */
 void
 _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     if (callbacks != NULL) {
         callbacks->init_state = trampoline_api.init_state;
         callbacks->write_state = trampoline_api.write_state;
         callbacks->free_state = trampoline_api.free_state;
     }
 #else
     (void)callbacks;
 #endif
 }

 /*
  * Install a new set of init/write/free callbacks.
  *
  * If the current backend holds state, the trampoline is finalised first.
  * The stored state pointer is reset to NULL afterwards.
  * Returns -1 when `callbacks` is NULL, 0 otherwise.
  */
 int
 _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
 {
     if (callbacks == NULL) {
         return -1;
     }
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     if (trampoline_api.state != NULL) {
         _PyPerfTrampoline_Fini();
     }
     trampoline_api.state = NULL;
     trampoline_api.init_state = callbacks->init_state;
     trampoline_api.write_state = callbacks->write_state;
     trampoline_api.free_state = callbacks->free_state;
 #endif
     return 0;
 }

 /*
  * Activate (activate != 0) or deactivate (activate == 0) the perf trampoline
  * for the current interpreter.
  *
  * Fails with RuntimeError and -1 if a different custom frame evaluator is
  * already installed.  Activation installs the trampoline evaluator, creates
  * an arena, requests a code-extra index and initialises the backend state if
  * there is none yet; it returns -1 if the arena or the index cannot be
  * obtained.  Returns 0 otherwise, including when trampoline support is not
  * compiled in.
  */
 int
 _PyPerfTrampoline_Init(int activate)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     PyThreadState *tstate = _PyThreadState_GET();
     if (tstate->interp->eval_frame != NULL &&
         tstate->interp->eval_frame != py_trampoline_evaluator) {
         PyErr_SetString(PyExc_RuntimeError,
                         "Trampoline cannot be initialized as a custom eval "
                         "frame is already present");
         return -1;
     }

     if (!activate) {
         // Deactivation: restore the default evaluator.
         tstate->interp->eval_frame = NULL;
         perf_status = PERF_STATUS_NO_INIT;
         return 0;
     }

     // The evaluator is installed before the resources below are acquired.
     tstate->interp->eval_frame = py_trampoline_evaluator;
     if (new_code_arena() < 0) {
         return -1;
     }
     extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
     if (extra_code_index == -1) {
         return -1;
     }
     if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
         trampoline_api.state = trampoline_api.init_state();
     }
     perf_status = PERF_STATUS_OK;
     return 0;
 #else
     return 0;
 #endif
 }

 /*
  * Finalise the perf trampoline.
  *
  * Does nothing unless the trampoline is in the PERF_STATUS_OK state.
  * Otherwise it uninstalls the trampoline evaluator (if it is the active
  * one), lets the backend release its state, and resets the backend type,
  * the code-extra index and the status.  Arenas are NOT released here; see
  * _PyPerfTrampoline_FreeArenas().  Always returns 0.
  */
 int
 _PyPerfTrampoline_Fini(void)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     if (perf_status != PERF_STATUS_OK) {
         return 0;
     }
     PyThreadState *tstate = _PyThreadState_GET();
     if (tstate->interp->eval_frame == py_trampoline_evaluator) {
         tstate->interp->eval_frame = NULL;
     }
     // The status is known to be OK from here on, so no second check is
     // needed.  free_state is treated as optional, matching the NULL guard
     // on init_state in _PyPerfTrampoline_Init().
     if (trampoline_api.free_state != NULL) {
         trampoline_api.free_state(trampoline_api.state);
     }
     perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
     extra_code_index = -1;
     perf_status = PERF_STATUS_NO_INIT;
 #endif
     return 0;
 }

 /*
  * Release every trampoline arena.  No-op when trampoline support is not
  * compiled in.
  */
 void _PyPerfTrampoline_FreeArenas(void) {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     free_code_arenas();
 #endif
 }

 /*
  * Store the "persist after fork" flag and return the stored value.
  * Returns 0 without storing anything when trampoline support is not
  * compiled in.
  */
 int
 PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable){
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     persist_after_fork = enable;
     return persist_after_fork;
 #else
     (void)enable;
     return 0;
 #endif
 }

 /*
  * Fix up the perf trampoline in the child process after fork().
  *
  * Without persist-after-fork the trampoline is finalised and, if it was
  * active, initialised again.  With persist-after-fork (perf-map backend
  * only) the trampoline is finalised and the parent's "/tmp/perf-<ppid>.map"
  * is copied via PyUnstable_CopyPerfMapFile().
  *
  * Returns an error status if persisting is requested for another backend or
  * the copy fails, and an OK status otherwise.
  */
 PyStatus
 _PyPerfTrampoline_AfterFork_Child(void)
 {
 #ifdef PY_HAVE_PERF_TRAMPOLINE
     if (!persist_after_fork) {
         // Restart trampoline in file in child.
         // Sample the active flag first: finalising uninstalls the evaluator.
         const int reactivate = _PyIsPerfTrampolineActive();
         _PyPerfTrampoline_Fini();
         if (reactivate) {
             _PyPerfTrampoline_Init(1);
         }
         return PyStatus_Ok();
     }

     // The type must be checked before finalising, which resets it to UNSET.
     if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
         return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
     }
     _PyPerfTrampoline_Fini();

     char parent_map_path[256];
     pid_t parent_pid = getppid();
     snprintf(parent_map_path, sizeof(parent_map_path), "/tmp/perf-%d.map", parent_pid);
     if (PyUnstable_CopyPerfMapFile(parent_map_path) != 0) {
         return PyStatus_Error("Failed to copy perf map file.");
     }
 #endif
     return PyStatus_Ok();
 }
	/*

	Perf trampoline instrumentation
	===============================

	This file contains instrumentation to allow to associate
	calls to the CPython eval loop back to the names of the Python
	functions and filename being executed.

	Many native performance profilers like the Linux perf tools are
	only able to 'see' the C stack when sampling from the profiled
	process. This means that if we have the following python code:

	import time
	def foo(n):
	# Some CPU intensive code

	def bar(n):
	foo(n)

	def baz(n):
	bar(n)

	baz(10000000)

	A performance profiler that is only able to see native frames will
	produce the following backtrace when sampling from foo():

	_PyEval_EvalFrameDefault -----> Evaluation frame of foo()
	_PyEval_Vector
	_PyFunction_Vectorcall
	PyObject_Vectorcall
	call_function

	_PyEval_EvalFrameDefault ------> Evaluation frame of bar()
	_PyEval_EvalFrame
	_PyEval_Vector
	_PyFunction_Vectorcall
	PyObject_Vectorcall
	call_function

	_PyEval_EvalFrameDefault -------> Evaluation frame of baz()
	_PyEval_EvalFrame
	_PyEval_Vector
	_PyFunction_Vectorcall
	PyObject_Vectorcall
	call_function

	...

	Py_RunMain

	Because the profiler is only able to see the native frames and the native
	function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
	then the profiler and any reporter generated by it will not be able to
	associate the names of the Python functions and the filenames associated with
	those calls, rendering the results useless in the Python world.

	To fix this problem, we introduce the concept of a trampoline frame. A
	trampoline frame is a piece of code that is unique per Python code object that
	is executed before entering the CPython eval loop. This piece of code just
	calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
	forwards all the arguments received. In this way, when a profiler samples
	frames from the previous example it will see:

	_PyEval_EvalFrameDefault -----> Evaluation frame of foo()
	[Jit compiled code 3]
	_PyEval_Vector
	_PyFunction_Vectorcall
	PyObject_Vectorcall
	call_function

	_PyEval_EvalFrameDefault ------> Evaluation frame of bar()
	[Jit compiled code 2]
	_PyEval_EvalFrame
	_PyEval_Vector
	_PyFunction_Vectorcall
	PyObject_Vectorcall
	call_function

	_PyEval_EvalFrameDefault -------> Evaluation frame of baz()
	[Jit compiled code 1]
	_PyEval_EvalFrame
	_PyEval_Vector
	_PyFunction_Vectorcall
	PyObject_Vectorcall
	call_function

	...

	Py_RunMain

	When we generate every unique copy of the trampoline (what here we called "[Jit
	compiled code N]") we write the relationship between the compiled code and the
	Python function that is associated with it. Every profiler requires this
	information in a different format. For example, the Linux "perf" profiler
	requires a file in "/tmp/perf-PID.map" (name and location not configurable)
	with the following format:

	<compiled code address> <compiled code size> <name of the compiled code>

	If this file is available when "perf" generates reports, it will automatically
	associate every trampoline with the Python function that it is associated with
	allowing it to generate reports that include Python information. These reports
	then can also be filtered in a way that only Python information appears.

	Notice that for this to work, there must be a unique copy of the trampoline
	per Python code object even if the code in the trampoline is the same. To
	achieve this we have an assembly template in Objects/asm_trampoline.S that is
	compiled into the Python executable/shared library. This template generates a
	symbol that marks the start of the assembly code and another that marks the end
	of the assembly code for the trampoline. Then, every time we need a unique
	trampoline for a Python code object, we copy the assembly code into a mmaped
	area that has executable permissions and we return the start of that area as
	our trampoline function.

	Asking for a mmap-ed memory area for every trampoline is very wasteful so we
	allocate big arenas of memory in a single mmap call, we populate the entire
	arena with copies of the trampoline (this allows us to not have to invalidate
	the icache for the instructions in the page every time a trampoline is handed
	out) and then we return the next available chunk every time someone asks for
	a new trampoline. We keep a linked list of arenas in case the current memory
	arena is exhausted and another one is needed.

	For the best results, Python should be compiled with
	CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
	profilers to unwind using only the frame pointer and not rely on DWARF debug
	information (note that as trampolines are dynamically generated there won't be
	any DWARF information available for them).
	*/

	#include "Python.h"
	#include "pycore_ceval.h" // _PyPerf_Callbacks
	#include "pycore_frame.h"
	#include "pycore_interp.h"


	#ifdef PY_HAVE_PERF_TRAMPOLINE

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h> // mmap()
	#include <sys/types.h>
	#include <unistd.h> // sysconf()
	#include <sys/time.h> // gettimeofday()


	#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
	#define PY_HAVE_INVALIDATE_ICACHE

	#if defined(__clang__) || defined(__GNUC__)
	extern void __clear_cache(void *, void*);
	#endif

	/*
	 * Invalidate the instruction cache for the address range [begin, end).
	 *
	 * Only compiled on arm/aarch64 targets.  With Clang or GCC this forwards to
	 * __clear_cache(); with any other compiler it does nothing.
	 */
	static void invalidate_icache(char* begin, char*end) {
	#if defined(__clang__) || defined(__GNUC__)
	    // Plain call: a `return <expr>;` is not allowed in a void function in ISO C.
	    __clear_cache(begin, end);
	#else
	    (void)begin;
	    (void)end;
	#endif
	}
	#endif

	/* The function pointer is passed as last argument. The other three arguments
	* are passed in the same order as the function requires. This results in
	* shorter, more efficient ASM code for trampoline.
	*/
	typedef PyObject (py_evaluator)(PyThreadState , _PyInterpreterFrame ,
	int throwflag);
	typedef PyObject (py_trampoline)(PyThreadState , _PyInterpreterFrame , int,
	py_evaluator);

	// Symbols delimiting the trampoline template.  Only their ADDRESSES are used
	// in this file: new_code_arena() copies the bytes between them into each arena.
	extern void *_Py_trampoline_func_start; // Start of the template of the
	// assembly trampoline
	extern void *
	_Py_trampoline_func_end; // End of the template of the assembly trampoline

	// One mmap-ed arena pre-filled with copies of the trampoline template.
	// Arenas form a singly linked list whose head is `perf_code_arena`.
	struct code_arena_st {
	char *start_addr; // Start of the memory arena
	char *current_addr; // Address of the next trampoline to hand out
	size_t size; // Size of the memory arena
	size_t size_left; // Remaining size of the memory arena
	size_t code_size; // Size of the code of every trampoline in the arena
	struct code_arena_st
	*prev; // Arena that was the list head before this one, or NULL if first.
	};

	typedef struct code_arena_st code_arena_t;
	typedef struct trampoline_api_st trampoline_api_t;

	// Which backend the trampoline is currently writing to.
	// In this file only UNSET and TYPE_MAP are assigned; TYPE_JITDUMP is
	// presumably set by another backend outside this file.
	enum perf_trampoline_type {
	PERF_TRAMPOLINE_UNSET = 0,
	PERF_TRAMPOLINE_TYPE_MAP = 1,
	PERF_TRAMPOLINE_TYPE_JITDUMP = 2,
	};

	// Short aliases for the perf trampoline fields stored in _PyRuntime.ceval.perf.
	// They expand to lvalues, so they can be both read and assigned.
	#define perf_status _PyRuntime.ceval.perf.status
	#define extra_code_index _PyRuntime.ceval.perf.extra_code_index
	#define perf_code_arena _PyRuntime.ceval.perf.code_arena
	#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
	#define perf_map_file _PyRuntime.ceval.perf.map_file
	#define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
	#define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type

	/*
	 * write_state callback of the perf-map backend.
	 *
	 * Builds the symbol name "py::<qualname>:<filename>" for `co` and records it
	 * for the trampoline at `code_addr` (of `code_size` bytes) through
	 * PyUnstable_WritePerfMapEntry().  `state` is unused.  The function is
	 * best-effort: it returns silently if the name cannot be built.
	 */
	static void
	perf_map_write_entry(void *state, const void *code_addr,
	                     unsigned int code_size, PyCodeObject *co)
	{
	    (void)state;
	    const char *entry = NULL;
	    if (co->co_qualname != NULL) {
	        entry = PyUnicode_AsUTF8(co->co_qualname);
	    }
	    if (entry == NULL) {
	        // PyUnicode_AsUTF8() returns NULL on failure and passing NULL to "%s"
	        // is undefined behaviour, so fall back to an empty name.
	        // NOTE(review): the exception it raised is left pending, exactly as
	        // before -- confirm whether it should be cleared here.
	        entry = "";
	    }
	    const char *filename = NULL;
	    if (co->co_filename != NULL) {
	        filename = PyUnicode_AsUTF8(co->co_filename);
	    }
	    if (filename == NULL) {
	        filename = "";
	    }
	    // First pass only measures; snprintf() reports failure with a negative
	    // value, which must not be converted into a buffer size.
	    int name_len = snprintf(NULL, 0, "py::%s:%s", entry, filename);
	    if (name_len < 0) {
	        return;
	    }
	    size_t perf_map_entry_size = (size_t)name_len + 1;
	    char *perf_map_entry = PyMem_RawMalloc(perf_map_entry_size);
	    if (perf_map_entry == NULL) {
	        return;
	    }
	    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
	    PyUnstable_WritePerfMapEntry(code_addr, code_size, perf_map_entry);
	    PyMem_RawFree(perf_map_entry);
	}

	/*
	 * init_state callback of the perf-map backend.
	 *
	 * Initialises the perf map machinery, marks the active backend as
	 * PERF_TRAMPOLINE_TYPE_MAP and requests no padding between trampolines.
	 * This backend keeps no private state, hence the NULL return value.
	 */
	static void*
	perf_map_init_state(void)
	{
	    PyUnstable_PerfMapState_Init();
	    perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
	    trampoline_api.code_padding = 0;
	    return NULL;
	}

	/*
	 * free_state callback of the perf-map backend: shuts the perf map machinery
	 * down.  The `state` argument is ignored; always returns 0.
	 */
	static int
	perf_map_free_state(void *state)
	{
	    (void)state;
	    PyUnstable_PerfMapState_Fini();
	    return 0;
	}

	_PyPerf_Callbacks _Py_perfmap_callbacks = {
	&perf_map_init_state,
	&perf_map_write_entry,
	&perf_map_free_state,
	};


	/*
	 * Round `value` up to the next multiple of `multiple`.
	 *
	 * A value that is already a multiple is returned unchanged, and so is any
	 * value when `multiple` is 0 (which also avoids a modulo by zero).
	 */
	static size_t round_up(int64_t value, int64_t multiple) {
	    if (multiple == 0) {
	        return value;
	    }

	    const int64_t excess = value % multiple;
	    return (excess == 0) ? value : value + (multiple - excess);
	}

	// TRAMPOLINE MANAGEMENT API

	/*
	 * Create a new arena and make it the head of the arena list.
	 *
	 * The arena is a single anonymous mapping that is filled with as many copies
	 * of the trampoline template as fit, then switched to read+execute.
	 *
	 * Returns 0 on success and -1 on failure.  Every failure is reported through
	 * PyErr_FormatUnraisable(); only the mmap() failure additionally sets
	 * perf_status to PERF_STATUS_FAILED.
	 */
	static int
	new_code_arena(void)
	{
	    // non-trivial programs typically need 64 to 256 kiB.
	    size_t mem_size = 4096 * 16;
	    assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
	    char *memory =
	        mmap(NULL,  // address
	             mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
	             -1,  // fd (not used here)
	             0);  // offset (not used here)
	    if (memory == MAP_FAILED) {
	        PyErr_SetFromErrno(PyExc_OSError);
	        PyErr_FormatUnraisable("Failed to create new mmap for perf trampoline");
	        perf_status = PERF_STATUS_FAILED;
	        return -1;
	    }
	    // The two symbols only mark addresses.  Measure the template through byte
	    // pointers: subtracting `void *` values is a GNU extension, not ISO C.
	    const char *start = (const char *)&_Py_trampoline_func_start;
	    const char *end = (const char *)&_Py_trampoline_func_end;
	    size_t code_size = (size_t)(end - start);
	    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
	    // TODO: Check the effect of alignment of the code chunks. Initial investigation
	    // showed that this has no effect on performance in x86-64 or aarch64 and the current
	    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
	    //
	    // We should check the values in the future and see if there is a
	    // measurable performance improvement by rounding trampolines up to 32-bit
	    // or 64-bit alignment.

	    size_t n_copies = mem_size / chunk_size;
	    for (size_t i = 0; i < n_copies; i++) {
	        memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
	    }
	    // Some systems may prevent us from creating executable code on the fly.
	    int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
	    if (res == -1) {
	        // Capture errno in the exception before munmap() can overwrite it.
	        PyErr_SetFromErrno(PyExc_OSError);
	        munmap(memory, mem_size);
	        PyErr_FormatUnraisable("Failed to set mmap for perf trampoline to "
	                               "PROT_READ | PROT_EXEC");
	        return -1;
	    }

	#ifdef PY_HAVE_INVALIDATE_ICACHE
	    // Before the JIT can run a block of code that has been emitted it must invalidate
	    // the instruction cache on some platforms like arm and aarch64.
	    invalidate_icache(memory, memory + mem_size);
	#endif

	    code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
	    if (new_arena == NULL) {
	        PyErr_NoMemory();
	        munmap(memory, mem_size);
	        PyErr_FormatUnraisable("Failed to allocate new code arena struct for perf trampoline");
	        return -1;
	    }

	    new_arena->start_addr = memory;
	    new_arena->current_addr = memory;
	    new_arena->size = mem_size;
	    new_arena->size_left = mem_size;
	    new_arena->code_size = code_size;
	    // Push onto the list: the previous head becomes `prev`.
	    new_arena->prev = perf_code_arena;
	    perf_code_arena = new_arena;
	    return 0;
	}

	/*
	 * Unmap every arena and free its bookkeeping struct.
	 *
	 * The global list head is cleared first so that it never points at an
	 * arena that has already been released.
	 */
	static void
	free_code_arenas(void)
	{
	    code_arena_t *arena = perf_code_arena;
	    perf_code_arena = NULL; // invalid static pointer
	    while (arena != NULL) {
	        code_arena_t *older = arena->prev;
	        munmap(arena->start_addr, arena->size);
	        PyMem_RawFree(arena);
	        arena = older;
	    }
	}

	/*
	 * Hand out the next trampoline slot of `code_arena`.
	 *
	 * Returns the slot at `current_addr` and advances the arena by one slot
	 * (code size plus padding, rounded up to 16 bytes).  No capacity check is
	 * done here; the caller must make sure a slot is available.
	 */
	static inline py_trampoline
	code_arena_new_code(code_arena_t *code_arena)
	{
	    const size_t slot_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
	    char *slot = code_arena->current_addr;
	    code_arena->current_addr = slot + slot_size;
	    code_arena->size_left -= slot_size;
	    return (py_trampoline)slot;
	}

	/*
	 * Return a fresh trampoline, creating a new arena first when there is no
	 * arena yet or the current one cannot hold more than one further slot.
	 *
	 * Returns NULL if a new arena was needed and could not be created.
	 */
	static inline py_trampoline
	compile_trampoline(void)
	{
	    // Test the list head BEFORE reading through it: the slot size lives in
	    // the arena itself, so computing it first would dereference NULL when no
	    // arena exists.
	    int need_arena = (perf_code_arena == NULL);
	    if (!need_arena) {
	        size_t total_code_size = round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16);
	        need_arena = (perf_code_arena->size_left <= total_code_size);
	    }
	    if (need_arena && new_code_arena() < 0) {
	        return NULL;
	    }
	    assert(perf_code_arena->size_left <= perf_code_arena->size);
	    return code_arena_new_code(perf_code_arena);
	}

	/*
	 * Frame evaluation function installed while the perf trampoline is active.
	 *
	 * Looks up (or lazily creates and registers) the trampoline attached to the
	 * frame's code object and evaluates the frame through it.  Whenever the
	 * trampoline machinery is not usable the frame is evaluated directly with
	 * _PyEval_EvalFrameDefault().
	 */
	static PyObject *
	py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
	                        int throw)
	{
	    if (perf_status == PERF_STATUS_FAILED ||
	        perf_status == PERF_STATUS_NO_INIT) {
	        goto default_eval;
	    }
	    PyCodeObject *co = _PyFrame_GetCode(frame);
	    py_trampoline f = NULL;
	    assert(extra_code_index != -1);
	    // The code object and the output slot are passed by pointer.
	    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
	    if (ret != 0 || f == NULL) {
	        // This is the first time we see this code object so we need
	        // to compile a trampoline for it.
	        py_trampoline new_trampoline = compile_trampoline();
	        if (new_trampoline == NULL) {
	            goto default_eval;
	        }
	        trampoline_api.write_state(trampoline_api.state, new_trampoline,
	                                   perf_code_arena->code_size, co);
	        _PyCode_SetExtra((PyObject *)co, extra_code_index,
	                         (void *)new_trampoline);
	        f = new_trampoline;
	    }
	    assert(f != NULL);
	    return f(ts, frame, throw, _PyEval_EvalFrameDefault);
	default_eval:
	    // Something failed, fall back to the default evaluator.
	    return _PyEval_EvalFrameDefault(ts, frame, throw);
	}
	#endif // PY_HAVE_PERF_TRAMPOLINE

	/*
	 * Eagerly create and register a trampoline for the code object `co`.
	 *
	 * Does nothing if `co` already has a trampoline stored in its extra slot.
	 * Returns 0 when nothing had to be done, when no trampoline could be
	 * allocated, or when trampolines are not supported in this build;
	 * otherwise returns the result of _PyCode_SetExtra().
	 */
	int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    py_trampoline f = NULL;
	    assert(extra_code_index != -1);
	    int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
	    if (ret != 0 || f == NULL) {
	        py_trampoline new_trampoline = compile_trampoline();
	        if (new_trampoline == NULL) {
	            // Allocation failure is not reported as an error here.
	            return 0;
	        }
	        trampoline_api.write_state(trampoline_api.state, new_trampoline,
	                                   perf_code_arena->code_size, co);
	        return _PyCode_SetExtra((PyObject *)co, extra_code_index,
	                                (void *)new_trampoline);
	    }
	#endif // PY_HAVE_PERF_TRAMPOLINE
	    return 0;
	}

	/*
	 * Report whether the current thread's interpreter is using the perf
	 * trampoline evaluator. Returns 1 if so, 0 otherwise (always 0 when
	 * trampolines are not compiled in).
	 */
	int
	_PyIsPerfTrampolineActive(void)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    PyThreadState *current = _PyThreadState_GET();
	    if (current->interp->eval_frame == py_trampoline_evaluator) {
	        return 1;
	    }
	    return 0;
	#else
	    return 0;
	#endif
	}

	/*
	 * Copy the currently registered init/write/free callbacks into
	 * `*callbacks`. A NULL `callbacks` is ignored, and nothing is written
	 * when trampolines are not compiled in.
	 */
	void
	_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    if (callbacks != NULL) {
	        callbacks->init_state = trampoline_api.init_state;
	        callbacks->write_state = trampoline_api.write_state;
	        callbacks->free_state = trampoline_api.free_state;
	    }
	#else
	    (void)callbacks;
	#endif
	}

	/*
	 * Replace the registered init/write/free callbacks with those in
	 * `*callbacks`.
	 *
	 * Returns -1 if `callbacks` is NULL, 0 otherwise. If a state object is
	 * currently held, the trampoline is finalized before the callbacks are
	 * swapped; the stored state is reset to NULL afterwards.
	 */
	int
	_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
	{
	    if (!callbacks) {
	        return -1;
	    }
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    // Finalize with the old callbacks while they are still installed.
	    if (trampoline_api.state != NULL) {
	        _PyPerfTrampoline_Fini();
	    }
	    trampoline_api.state = NULL;
	    trampoline_api.free_state = callbacks->free_state;
	    trampoline_api.write_state = callbacks->write_state;
	    trampoline_api.init_state = callbacks->init_state;
	#endif
	    return 0;
	}

	/*
	 * Activate (activate != 0) or deactivate (activate == 0) the perf
	 * trampoline for the current thread's interpreter.
	 *
	 * Returns 0 on success. Returns -1 with RuntimeError set if a different
	 * custom frame evaluator is already installed, and -1 if a code arena or
	 * a code-extra index could not be obtained. On failure the interpreter's
	 * evaluator and the stored extra index are left as they were.
	 */
	int
	_PyPerfTrampoline_Init(int activate)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    PyThreadState *tstate = _PyThreadState_GET();
	    if (tstate->interp->eval_frame &&
	        tstate->interp->eval_frame != py_trampoline_evaluator) {
	        PyErr_SetString(PyExc_RuntimeError,
	                        "Trampoline cannot be initialized as a custom eval "
	                        "frame is already present");
	        return -1;
	    }
	    if (!activate) {
	        tstate->interp->eval_frame = NULL;
	        perf_status = PERF_STATUS_NO_INIT;
	        return 0;
	    }

	    if (new_code_arena() < 0) {
	        return -1;
	    }
	    // Request into a local so a failure cannot overwrite an index that is
	    // still in use from an earlier activation.
	    Py_ssize_t new_index = _PyEval_RequestCodeExtraIndex(NULL);
	    if (new_index == -1) {
	        return -1;
	    }
	    extra_code_index = new_index;
	    if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
	        trampoline_api.state = trampoline_api.init_state();
	    }
	    perf_status = PERF_STATUS_OK;
	    // Install the evaluator last, once everything it relies on is ready,
	    // so a failed activation never leaves it installed.
	    tstate->interp->eval_frame = py_trampoline_evaluator;
	#endif
	    return 0;
	}

	/*
	 * Tear down the perf trampoline: uninstall the evaluator from the current
	 * thread's interpreter, release the backend state through the registered
	 * free_state callback, and reset the module-level status.
	 *
	 * Does nothing unless the trampoline is in the OK state. Always returns 0.
	 * Code arenas are not released here.
	 */
	int
	_PyPerfTrampoline_Fini(void)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    if (perf_status != PERF_STATUS_OK) {
	        return 0;
	    }
	    PyThreadState *tstate = _PyThreadState_GET();
	    if (tstate->interp->eval_frame == py_trampoline_evaluator) {
	        tstate->interp->eval_frame = NULL;
	    }
	    // Callbacks can be replaced by the caller, so tolerate a missing one
	    // the same way initialization tolerates a missing init_state.
	    if (trampoline_api.free_state != NULL) {
	        trampoline_api.free_state(trampoline_api.state);
	    }
	    // The state has been released; clear it so a later initialization
	    // (which only calls init_state when state is NULL) sets it up again.
	    trampoline_api.state = NULL;
	    perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
	    extra_code_index = -1;
	    perf_status = PERF_STATUS_NO_INIT;
	#endif
	    return 0;
	}

	/*
	 * Release all trampoline code arenas. No-op when trampolines are not
	 * compiled in.
	 */
	void
	_PyPerfTrampoline_FreeArenas(void)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    free_code_arenas();
	#endif
	}

	/*
	 * Store `enable` as the persist-after-fork setting and return the stored
	 * value. Returns 0 without storing anything when trampolines are not
	 * compiled in.
	 */
	int
	PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    persist_after_fork = enable;
	    return persist_after_fork;
	#else
	    (void)enable;
	    return 0;
	#endif
	}

	/*
	 * Fix up perf trampoline state in the child process after fork().
	 *
	 * With persist-after-fork enabled, the trampoline is finalized and the
	 * parent's /tmp/perf-<ppid>.map file is copied for the child; this is only
	 * possible for the perf-map trampoline type. Otherwise the trampoline is
	 * finalized and, if it was active, initialized again in the child.
	 *
	 * Returns PyStatus_Ok() on success or a PyStatus error describing why the
	 * map file could not be copied.
	 */
	PyStatus
	_PyPerfTrampoline_AfterFork_Child(void)
	{
	#ifdef PY_HAVE_PERF_TRAMPOLINE
	    if (persist_after_fork) {
	        if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
	            return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
	        }
	        _PyPerfTrampoline_Fini();
	        char filename[256];
	        pid_t parent_pid = getppid();
	        // pid_t is not guaranteed to be int; widen explicitly so the
	        // argument always matches the conversion specifier.
	        snprintf(filename, sizeof(filename), "/tmp/perf-%ld.map",
	                 (long)parent_pid);
	        if (PyUnstable_CopyPerfMapFile(filename) != 0) {
	            return PyStatus_Error("Failed to copy perf map file.");
	        }
	    } else {
	        // Restart trampoline in file in child.
	        // Sample the active flag first: finalizing uninstalls the evaluator.
	        int was_active = _PyIsPerfTrampolineActive();
	        _PyPerfTrampoline_Fini();
	        if (was_active) {
	            _PyPerfTrampoline_Init(1);
	        }
	    }
	#endif
	    return PyStatus_Ok();
	}