python · cocolato · Mar 31, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 2, 2026
diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h
@@ -144,6 +144,7 @@ typedef struct _optimization_stats {
     uint64_t unknown_callee;
     uint64_t trace_immediately_deopts;
     uint64_t executors_invalidated;
+    uint64_t fitness_terminated_traces;
     UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
     uint64_t unsupported_opcode[256];
     uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];

diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
@@ -449,6 +449,9 @@ typedef struct _PyOptimizationConfig {
     uint16_t side_exit_initial_value;
     uint16_t side_exit_initial_backoff;
 
+    // Trace fitness thresholds
+    uint16_t fitness_initial;
+
     // Optimization flags
     bool specialization_enabled;
     bool uops_optimize_enabled;

diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
@@ -15,6 +15,51 @@ extern "C" {
 #include "pycore_optimizer_types.h"
 #include <stdbool.h>
 
+/* Fitness controls how long a trace can grow.
+ * It starts from opt_config.fitness_initial (default FITNESS_INITIAL, scaled
+ * down for chained side exits), then drops by per-bytecode buffer usage plus
+ * branch, backward-edge and frame penalties; tracing stops below exit_quality.
+ *
+ * Design targets for the constants below:
+ * 1. Reaching the abstract frame-depth limit should drop fitness below
+ *    EXIT_QUALITY_SPECIALIZABLE.
+ * 2. A backward edge should leave budget for roughly N_BACKWARD_SLACK more
+ *    bytecodes, assuming AVG_SLOTS_PER_INSTRUCTION.
+ * 3. Roughly seven balanced branches should reduce fitness to
+ *    EXIT_QUALITY_DEFAULT before per-slot costs.
+ * 4. A push followed by a matching return is net-zero on frame-specific
+ *    fitness, excluding per-slot costs.
+ */
+#define MAX_TARGET_LENGTH          400
+#define OPTIMIZER_EFFECTIVENESS    2
+#define FITNESS_INITIAL            (MAX_TARGET_LENGTH * OPTIMIZER_EFFECTIVENESS)
+
+/* Exit quality thresholds: the trace stops when fitness < exit_quality.
+ * A higher value makes the trace more willing to stop at that position. */
+#define EXIT_QUALITY_CLOSE_LOOP      (FITNESS_INITIAL / 2)
+#define EXIT_QUALITY_ENTER_EXECUTOR  (FITNESS_INITIAL * 3 / 8)
+#define EXIT_QUALITY_DEFAULT         (FITNESS_INITIAL / 8)
+#define EXIT_QUALITY_SPECIALIZABLE   (FITNESS_INITIAL / 80)
+
+/* Estimated buffer slots per bytecode, used only to derive the heuristics
+ * below. Runtime charging uses the buffer capacity each bytecode consumes. */
+#define AVG_SLOTS_PER_INSTRUCTION  6
+
+/* Heuristic backward-edge penalty: starting from FITNESS_INITIAL, leave room
+ * for about N_BACKWARD_SLACK more bytecodes (at AVG_SLOTS_PER_INSTRUCTION
+ * slots each) before fitness reaches EXIT_QUALITY_CLOSE_LOOP. */
+#define N_BACKWARD_SLACK           50
+#define FITNESS_BACKWARD_EDGE      (FITNESS_INITIAL - EXIT_QUALITY_CLOSE_LOOP \
+                                      - N_BACKWARD_SLACK * AVG_SLOTS_PER_INSTRUCTION)
+
+/* Backward edge penalty for JUMP_BACKWARD_NO_INTERRUPT (coroutines/yield-from).
+ * Smaller than FITNESS_BACKWARD_EDGE since these loops are very short. */
+#define FITNESS_BACKWARD_EDGE_COROUTINE  (FITNESS_BACKWARD_EDGE / 4)
+
+/* Penalty for a perfectly balanced (50/50) branch. Ignoring per-slot cost,
+ * 7 such branches take fitness from FITNESS_INITIAL to EXIT_QUALITY_DEFAULT. */
+#define FITNESS_BRANCH_BALANCED    ((FITNESS_INITIAL - EXIT_QUALITY_DEFAULT) / 7)
+
 
 typedef struct _PyJitUopBuffer {
     _PyUOpInstruction *start;
@@ -101,7 +146,8 @@ typedef struct _PyJitTracerPreviousState {
 } _PyJitTracerPreviousState;
 
 typedef struct _PyJitTracerTranslatorState {
-    int jump_backward_seen;
+    int32_t fitness;              // Remaining trace budget; trace stops when below exit quality
+    int frame_depth;              // Inlined frame depth (0 = root frame); never goes negative
 } _PyJitTracerTranslatorState;
 
 typedef struct _PyJitTracerState {

diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py
@@ -1358,9 +1358,13 @@ def testfunc(n):
             for _ in gen(n):
                 pass
         testfunc(TIER2_THRESHOLD * 2)
+        # The generator may be inlined into testfunc's trace, so fall
+        # back to testfunc's executor when gen has no executor of its own.
         gen_ex = get_first_executor(gen)
-        self.assertIsNotNone(gen_ex)
-        uops = get_opnames(gen_ex)
+        testfunc_ex = get_first_executor(testfunc)
+        ex = gen_ex or testfunc_ex
+        self.assertIsNotNone(ex)
+        uops = get_opnames(ex)
         self.assertNotIn("_MAKE_HEAP_SAFE", uops)
         self.assertIn("_YIELD_VALUE", uops)
 

diff --git a/Modules/_testinternalcapi/test_cases.c.h b/Modules/_testinternalcapi/test_cases.c.h
@@ -6345,7 +6345,13 @@ dummy_func(
             tracer->prev_state.instr_frame = frame;
             tracer->prev_state.instr_oparg = oparg;
             tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL();
-            if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) {
+            if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]
+                // Branch opcodes use the cache for branch history, not
+                // specialization counters.  Don't reset it.
+                && opcode != POP_JUMP_IF_FALSE
+                && opcode != POP_JUMP_IF_TRUE
+                && opcode != POP_JUMP_IF_NONE
+                && opcode != POP_JUMP_IF_NOT_NONE) {
                 (&next_instr[1])->counter = trigger_backoff_counter();
             }
 

diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h
@@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
 };
 
 
-#define CONFIDENCE_RANGE 1000
-#define CONFIDENCE_CUTOFF 333
 
 #ifdef Py_DEBUG
 #define DPRINTF(level, ...) \
@@ -598,6 +596,43 @@ add_to_trace(
     ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
 
 
+/* Penalty for tracing through a branch, scaled by its 16-bit history:
+ * 0 when always on-trace, FITNESS_BRANCH_BALANCED at 8/16, double at 0/16. */
+static inline int
+compute_branch_penalty(uint16_t history, bool branch_taken)
+{
+    const int history_bits = 16;
+    int set_bits = _Py_popcount32((uint32_t)history);
+    int against = branch_taken ? history_bits - set_bits : set_bits;
+    return against * FITNESS_BRANCH_BALANCED / (history_bits / 2);
+}
+
+/* Rate how good the current position is as a place to end the trace.
+ * A larger result means the trace is more willing to stop here. */
+static inline int32_t
+compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
+                     const _PyJitTracerState *tracer)
+{
+    bool closes_loop =
+        target_instr == tracer->initial_state.start_instr
+        || target_instr == tracer->initial_state.close_loop_instr;
+    if (closes_loop) {
+        return EXIT_QUALITY_CLOSE_LOOP;
+    }
+    if (target_instr->op.code == ENTER_EXECUTOR) {
+        return EXIT_QUALITY_ENTER_EXECUTOR;
+    }
+    bool has_caches = _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0;
+    return has_caches ? EXIT_QUALITY_SPECIALIZABLE : EXIT_QUALITY_DEFAULT;
+}
+
+/* Cost of one frame push: MAX_ABSTRACT_FRAME_DEPTH-1 pushes exceed the budget. */
+static inline int32_t
+compute_frame_penalty(uint16_t fitness_initial)
+{
+    return 1 + (int32_t)fitness_initial / (MAX_ABSTRACT_FRAME_DEPTH - 1);
+}
+
 static int
 is_terminator(const _PyUOpInstruction *uop)
 {
@@ -734,13 +769,11 @@ _PyJit_translate_single_bytecode_to_trace(
         DPRINTF(2, "Unsupported: oparg too large\n");
         unsupported:
         {
-            // Rewind to previous instruction and replace with _EXIT_TRACE.
             _PyUOpInstruction *curr = uop_buffer_last(trace);
             while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
                 trace->next--;
                 curr = uop_buffer_last(trace);
             }
-            assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
             if (curr->opcode == _SET_IP) {
                 int32_t old_target = (int32_t)uop_get_target(curr);
                 curr->opcode = _DEOPT;
@@ -763,6 +796,26 @@ _PyJit_translate_single_bytecode_to_trace(
         return 1;
     }
 
+    // Stop the trace if fitness has dropped below the exit quality threshold.
+    _PyJitTracerTranslatorState *ts = &tracer->translator_state;
+    int32_t eq = compute_exit_quality(target_instr, opcode, tracer);
+    DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n",
+            _PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth);
+
+    if (ts->fitness < eq) {
+        // Heuristic exit: leave operand1=0 so the side exit increments chain_depth.
+        ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
+        OPT_STAT_INC(fitness_terminated_traces);
+        DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n",
+                _PyOpcode_OpName[opcode], oparg, ts->fitness, eq);
+        goto done;
+    }
+
+    // Snapshot the buffer before reserving tail slots. The later charge
+    // includes both emitted uops and capacity reserved for exits/deopts/errors.
+    _PyUOpInstruction *next_before = trace->next;
+    _PyUOpInstruction *end_before = trace->end;
+
     // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
     trace->end -= 2;
 
@@ -789,12 +842,7 @@ _PyJit_translate_single_bytecode_to_trace(
     trace->end -= needs_guard_ip;
 
     int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
-    if (uop_buffer_remaining_space(trace) < space_needed) {
-        DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
-                space_needed, uop_buffer_remaining_space(trace));
-        OPT_STAT_INC(trace_too_long);
-        goto done;
-    }
+    assert(uop_buffer_remaining_space(trace) > space_needed);
 
     ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
 
@@ -816,36 +864,29 @@ _PyJit_translate_single_bytecode_to_trace(
             assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr));
             uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
             ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
+            int bp = compute_branch_penalty(target_instr[1].cache, jump_happened);
+            tracer->translator_state.fitness -= bp;
+            DPRINTF(3, "  branch penalty: -%d (history=0x%04x, taken=%d) -> fitness=%d\n",
+                    bp, target_instr[1].cache, jump_happened,
+                    tracer->translator_state.fitness);
+
             break;
         }
         case JUMP_BACKWARD_JIT:
             // This is possible as the JIT might have re-activated after it was disabled
         case JUMP_BACKWARD_NO_JIT:
         case JUMP_BACKWARD:
             ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
-            _Py_FALLTHROUGH;
+            tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE;
+            DPRINTF(3, "  backward edge penalty: -%d -> fitness=%d\n",
+                    FITNESS_BACKWARD_EDGE, tracer->translator_state.fitness);
+            break;
         case JUMP_BACKWARD_NO_INTERRUPT:
-        {
-            if ((next_instr != tracer->initial_state.close_loop_instr) &&
-                (next_instr != tracer->initial_state.start_instr) &&
-                uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
-                // For side exits, we don't want to terminate them early.
-                tracer->initial_state.exit == NULL &&
-                // These are coroutines, and we want to unroll those usually.
-                opcode != JUMP_BACKWARD_NO_INTERRUPT) {
-                // We encountered a JUMP_BACKWARD but not to the top of our own loop.
-                // We don't want to continue tracing as we might get stuck in the
-                // inner loop. Instead, end the trace where the executor of the
-                // inner loop might start and let the traces rejoin.
-                OPT_STAT_INC(inner_loop);
-                ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
-                uop_buffer_last(trace)->operand1 = true; // is_control_flow
-                DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr,
-                    tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr);
-                goto done;
-            }
+            tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE_COROUTINE;
+            DPRINTF(3, "  coroutine backward edge penalty: -%d -> fitness=%d\n",
+                    FITNESS_BACKWARD_EDGE_COROUTINE,
+                    tracer->translator_state.fitness);
             break;
-        }
 
         case RESUME:
         case RESUME_CHECK:
@@ -945,6 +986,36 @@ _PyJit_translate_single_bytecode_to_trace(
                     assert(next->op.code == STORE_FAST);
                     operand = next->op.arg;
                 }
+                else if (uop == _PUSH_FRAME) {
+                    _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
+                    ts_depth->frame_depth++;
+                    assert(ts_depth->frame_depth < MAX_ABSTRACT_FRAME_DEPTH);
+                    int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
+                    ts_depth->fitness -= frame_penalty;
+                    DPRINTF(3, "  _PUSH_FRAME: depth=%d, penalty=-%d -> fitness=%d\n",
+                            ts_depth->frame_depth, frame_penalty,
+                            ts_depth->fitness);
+                }
+                else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
+                    _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
+                    int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
+                    if (ts_depth->frame_depth <= 0) {
+                        // Returning from a frame we didn't enter — penalize.
+                        ts_depth->fitness -= frame_penalty;
+                        DPRINTF(3, "  %s: underflow penalty=-%d -> fitness=%d\n",
+                                _PyOpcode_uop_name[uop], frame_penalty,
+                                ts_depth->fitness);
+                    }
+                    else {
+                        // Symmetric with push: net-zero frame impact.
+                        ts_depth->fitness += frame_penalty;
+                        ts_depth->frame_depth--;
+                        DPRINTF(3, "  %s: return reward=+%d, depth=%d -> fitness=%d\n",
+                                _PyOpcode_uop_name[uop], frame_penalty,
+                                ts_depth->frame_depth,
+                                ts_depth->fitness);
+                    }
+                }
                 else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
                     PyObject *recorded_value = tracer->prev_state.recorded_value;
                     tracer->prev_state.recorded_value = NULL;
@@ -986,7 +1057,18 @@ _PyJit_translate_single_bytecode_to_trace(
         ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
         goto done;
     }
-    DPRINTF(2, "Trace continuing\n");
+    // Charge fitness by trace-buffer capacity consumed for this bytecode,
+    // including both emitted uops and tail reservations.
+    {
+        int32_t slots_fwd = (int32_t)(trace->next - next_before);
+        int32_t slots_rev = (int32_t)(end_before - trace->end);
+        int32_t slots_used = slots_fwd + slots_rev;
+        tracer->translator_state.fitness -= slots_used;
+        DPRINTF(3, "  per-insn cost: -%d (fwd=%d, rev=%d) -> fitness=%d\n",
+                slots_used, slots_fwd, slots_rev,
+                tracer->translator_state.fitness);
+    }
+    DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
     return 1;
 done:
     DPRINTF(2, "Trace done\n");
@@ -1069,6 +1151,16 @@ _PyJit_TryInitializeTracing(
     assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
     tracer->initial_state.jump_backward_instr = curr_instr;
 
+    // Reduce side-trace fitness as chain depth grows, but clamp the reduction
+    // after depth 4 so deeply chained exits still have at least half budget.
+    const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
+    _PyJitTracerTranslatorState *ts = &tracer->translator_state;
+    int effective_depth = Py_MIN(chain_depth, 4);
+    ts->fitness = (int32_t)((8 - effective_depth) * cfg->fitness_initial / 8);
+    ts->frame_depth = 0;
+    DPRINTF(3, "Fitness init: chain_depth=%d, fitness=%d\n",
+            chain_depth, ts->fitness);
+
     tracer->is_tracing = true;
     return 1;
 }
@@ -2101,7 +2193,11 @@ _PyDumpExecutors(FILE *out)
     fprintf(out, "    node [colorscheme=greys9]\n");
     PyInterpreterState *interp = PyInterpreterState_Get();
     for (size_t i = 0; i < interp->executor_count; i++) {
-        executor_to_gv(interp->executor_ptrs[i], out);
+        _PyExecutorObject *exec = interp->executor_ptrs[i];
+        if (exec->vm_data.code == NULL) {
+            continue;
+        }
+        executor_to_gv(exec, out);
     }
     fprintf(out, "}\n\n");
     return 0;

@@ -635,6 +635,11 @@ init_interpreter(PyInterpreterState *interp,
                 "PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
                 SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);
 
+    // Trace fitness configuration
+    init_policy(&interp->opt_config.fitness_initial,
+                "PYTHON_JIT_FITNESS_INITIAL",
+                FITNESS_INITIAL, EXIT_QUALITY_CLOSE_LOOP, UOP_MAX_TRACE_LENGTH - 1);
+
     interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
     interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
     if (interp != &runtime->_main_interpreter) {