From 023bbd4b85081835f4d442ad00e6202e1365673a Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Sun, 1 Mar 2026 15:44:14 +0900 Subject: [PATCH 1/6] Implement LOAD_ATTR inline caching with adaptive specialization Add type version counter (tp_version_tag) to PyType with subclass invalidation cascade. Add cache read/write methods (u16/u32/u64) to CodeUnits. Implement adaptive specialization in load_attr that replaces the opcode with specialized variants on first execution: - LoadAttrMethodNoDict: cached method lookup for slotted types - LoadAttrMethodWithValues: cached method with dict shadow check - LoadAttrInstanceValue: direct dict lookup skipping descriptors Specialized opcodes guard on type_version_tag and deoptimize back to generic LOAD_ATTR with backoff counter on cache miss. --- crates/codegen/src/ir.rs | 12 +- crates/compiler-core/src/bytecode.rs | 62 +++++++ crates/vm/src/builtins/type.rs | 42 ++++- crates/vm/src/frame.rs | 238 ++++++++++++++++++++++++++- crates/vm/src/object/core.rs | 3 + 5 files changed, 346 insertions(+), 11 deletions(-) diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs index 4363ffaa768..b21af84a51c 100644 --- a/crates/codegen/src/ir.rs +++ b/crates/codegen/src/ir.rs @@ -457,11 +457,13 @@ impl CodeInfo { .map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte)) .chain([CodeUnit { op, arg: lo_arg }]), ); - // Emit CACHE code units after the instruction - instructions.extend(core::iter::repeat_n( - CodeUnit::new(Instruction::Cache, 0.into()), - cache_count, - )); + // Emit CACHE code units after the instruction (all zeroed) + if cache_count > 0 { + instructions.extend(core::iter::repeat_n( + CodeUnit::new(Instruction::Cache, 0.into()), + cache_count, + )); + } current_offset = offset_after; } next_block = block.next; diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index cece1fb77fa..a2c764148ac 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -343,6 +343,11 @@ pub struct CodeUnit { const _: () = assert!(mem::size_of::() == 2); +/// Adaptive specialization: number of executions before attempting specialization. +pub const ADAPTIVE_WARMUP_VALUE: u16 = 50; +/// Adaptive specialization: backoff counter after de-optimization. +pub const ADAPTIVE_BACKOFF_VALUE: u16 = 250; + impl CodeUnit { pub const fn new(op: Instruction, arg: OpArgByte) -> Self { Self { op, arg } @@ -441,6 +446,63 @@ impl CodeUnits { core::ptr::write(op_ptr, new_op.into()); } } + + /// Write a u16 value into a CACHE code unit at `index`. + /// Each CodeUnit is 2 bytes (#[repr(C)]: op u8 + arg u8), so one u16 fits exactly. + /// + /// # Safety + /// - `index` must be in bounds and point to a CACHE entry. + /// - The caller must ensure no concurrent reads/writes to the same slot. + pub unsafe fn write_cache_u16(&self, index: usize, value: u16) { + unsafe { + let units = &mut *self.0.get(); + let ptr = units.as_mut_ptr().add(index) as *mut u8; + core::ptr::write_unaligned(ptr as *mut u16, value); + } + } + + /// Read a u16 value from a CACHE code unit at `index`. + pub fn read_cache_u16(&self, index: usize) -> u16 { + let units = unsafe { &*self.0.get() }; + let ptr = units.as_ptr().wrapping_add(index) as *const u8; + unsafe { core::ptr::read_unaligned(ptr as *const u16) } + } + + /// Write a u32 value across two consecutive CACHE code units starting at `index`. + /// + /// # Safety + /// Same requirements as `write_cache_u16`. + pub unsafe fn write_cache_u32(&self, index: usize, value: u32) { + unsafe { + self.write_cache_u16(index, value as u16); + self.write_cache_u16(index + 1, (value >> 16) as u16); + } + } + + /// Read a u32 value from two consecutive CACHE code units starting at `index`. + pub fn read_cache_u32(&self, index: usize) -> u32 { + let lo = self.read_cache_u16(index) as u32; + let hi = self.read_cache_u16(index + 1) as u32; + lo | (hi << 16) + } + + /// Write a u64 value across four consecutive CACHE code units starting at `index`. + /// + /// # Safety + /// Same requirements as `write_cache_u16`. + pub unsafe fn write_cache_u64(&self, index: usize, value: u64) { + unsafe { + self.write_cache_u32(index, value as u32); + self.write_cache_u32(index + 2, (value >> 32) as u32); + } + } + + /// Read a u64 value from four consecutive CACHE code units starting at `index`. + pub fn read_cache_u64(&self, index: usize) -> u64 { + let lo = self.read_cache_u32(index) as u64; + let hi = self.read_cache_u32(index + 2) as u64; + lo | (hi << 32) + } } /// A Constant (which usually encapsulates data within it) diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index 86865e9e083..da2510f4f9a 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -28,7 +28,14 @@ use crate::{ Representable, SLOT_DEFS, SetAttr, TypeDataRef, TypeDataRefMut, TypeDataSlot, }, }; -use core::{any::Any, borrow::Borrow, ops::Deref, pin::Pin, ptr::NonNull}; +use core::{ + any::Any, + borrow::Borrow, + ops::Deref, + pin::Pin, + ptr::NonNull, + sync::atomic::{AtomicU32, Ordering}, +}; use indexmap::{IndexMap, map::Entry}; use itertools::Itertools; use num_traits::ToPrimitive; @@ -44,8 +51,12 @@ pub struct PyType { pub attributes: PyRwLock, pub slots: PyTypeSlots, pub heaptype_ext: Option>>, + /// Type version tag for inline caching. 0 means unassigned/invalidated. + pub tp_version_tag: AtomicU32, } +static NEXT_TYPE_VERSION: AtomicU32 = AtomicU32::new(1); + unsafe impl crate::object::Traverse for PyType { fn traverse(&self, tracer_fn: &mut crate::object::TraverseFn<'_>) { self.base.traverse(tracer_fn); @@ -188,6 +199,27 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py, b: &Py) -> b } impl PyType { + /// Assign a fresh version tag. Returns 0 on overflow (all caches invalidated). + pub fn assign_version_tag(&self) -> u32 { + let v = NEXT_TYPE_VERSION.fetch_add(1, Ordering::Relaxed); + if v == 0 { + return 0; + } + self.tp_version_tag.store(v, Ordering::Release); + v + } + + /// Invalidate this type's version tag and cascade to all subclasses. + pub fn modified(&self) { + self.tp_version_tag.store(0, Ordering::Release); + let subclasses = self.subclasses.read(); + for weak_ref in subclasses.iter() { + if let Some(sub) = weak_ref.upgrade() { + sub.downcast_ref::().unwrap().modified(); + } + } + } + pub fn new_simple_heap( name: &str, base: &Py, @@ -365,6 +397,7 @@ impl PyType { attributes: PyRwLock::new(attrs), slots, heaptype_ext: Some(Pin::new(Box::new(heaptype_ext))), + tp_version_tag: AtomicU32::new(0), }, metaclass, None, @@ -418,6 +451,7 @@ impl PyType { attributes: PyRwLock::new(attrs), slots, heaptype_ext: None, + tp_version_tag: AtomicU32::new(0), }, metaclass, None, @@ -799,6 +833,9 @@ impl PyType { } update_mro_recursively(zelf, vm)?; + // Invalidate inline caches + zelf.modified(); + // TODO: do any old slots need to be cleaned up first? zelf.init_slots(&vm.ctx); @@ -1903,6 +1940,9 @@ impl SetAttr for PyType { ))); } } + // Invalidate inline caches that depend on this type's attributes + zelf.modified(); + if attr_name.as_wtf8().starts_with("__") && attr_name.as_wtf8().ends_with("__") { if assign { zelf.update_slot::(attr_name, &vm.ctx); diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 663885c579d..c872b1a7d12 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -4,9 +4,9 @@ use crate::{ AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, PyStackRef, TryFromObject, VirtualMachine, builtins::{ - PyBaseException, PyBaseExceptionRef, PyCode, PyCoroutine, PyDict, PyDictRef, PyGenerator, - PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback, - PyType, PyUtf8Str, + PyBaseException, PyBaseExceptionRef, PyBaseObject, PyCode, PyCoroutine, PyDict, PyDictRef, + PyGenerator, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, + PyTraceback, PyType, PyUtf8Str, asyncgenerator::PyAsyncGenWrappedValue, float::PyFloat, frame::stack_analysis, @@ -15,7 +15,9 @@ use crate::{ range::PyRangeIterator, tuple::{PyTuple, PyTupleRef}, }, - bytecode::{self, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod}, + bytecode::{ + self, ADAPTIVE_BACKOFF_VALUE, Arg, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod, + }, convert::{IntoObject, ToPyResult}, coroutine::Coro, exceptions::ExceptionCtor, @@ -34,7 +36,7 @@ use core::cell::UnsafeCell; use core::iter::zip; use core::sync::atomic; use core::sync::atomic::AtomicPtr; -use core::sync::atomic::Ordering::Relaxed; +use core::sync::atomic::Ordering::{Acquire, Relaxed}; use indexmap::IndexMap; use itertools::Itertools; use malachite_bigint::BigInt; @@ -2644,6 +2646,106 @@ impl ExecutingFrame<'_> { self.push_value(vm.ctx.new_bool(!value).into()); Ok(None) } + // Specialized LOAD_ATTR opcodes + Instruction::LoadAttrMethodNoDict => { + let oparg = LoadAttr::new(u32::from(arg)); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + + let owner = self.top_value(); + let type_version = self.code.instructions.read_cache_u32(cache_base + 1); + + if owner.class().tp_version_tag.load(Acquire) == type_version { + // Cache hit: load the cached method descriptor + let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); + let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + let owner = self.pop_value(); + self.push_value(func); + self.push_value(owner); + Ok(None) + } else { + // De-optimize + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); + self.code + .instructions + .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + self.load_attr_slow(vm, oparg) + } + } + Instruction::LoadAttrMethodWithValues => { + let oparg = LoadAttr::new(u32::from(arg)); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let attr_name = self.code.names[oparg.name_idx() as usize]; + + let owner = self.top_value(); + let type_version = self.code.instructions.read_cache_u32(cache_base + 1); + + if owner.class().tp_version_tag.load(Acquire) == type_version { + // Check instance dict doesn't shadow the method + let shadowed = if let Some(dict) = owner.dict() { + dict.get_item_opt(attr_name, vm).ok().flatten().is_some() + } else { + false + }; + + if !shadowed { + // Cache hit: load the cached method descriptor + let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); + let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + let owner = self.pop_value(); + self.push_value(func); + self.push_value(owner); + return Ok(None); + } + } + // De-optimize + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); + self.code + .instructions + .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + self.load_attr_slow(vm, oparg) + } + Instruction::LoadAttrInstanceValue => { + let oparg = LoadAttr::new(u32::from(arg)); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let attr_name = self.code.names[oparg.name_idx() as usize]; + + let owner = self.top_value(); + let type_version = self.code.instructions.read_cache_u32(cache_base + 1); + + if owner.class().tp_version_tag.load(Acquire) == type_version { + // Type version matches — no data descriptor for this attr. + // Try direct dict lookup, skipping full descriptor protocol. + if let Some(dict) = owner.dict() + && let Some(value) = dict.get_item_opt(attr_name, vm)? + { + self.pop_value(); + self.push_value(value); + return Ok(None); + } + // Not in instance dict — fall through to class lookup via slow path + } + // De-optimize + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); + self.code + .instructions + .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + self.load_attr_slow(vm, oparg) + } // All INSTRUMENTED_* opcodes delegate to a cold function to keep // the hot instruction loop free of monitoring overhead. _ => self.execute_instrumented(instruction, arg, vm), @@ -4111,6 +4213,132 @@ impl ExecutingFrame<'_> { } fn load_attr(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + + // Decrement adaptive counter + let counter = self.code.instructions.read_cache_u16(cache_base); + if counter > 0 { + unsafe { + self.code + .instructions + .write_cache_u16(cache_base, counter - 1); + } + } else { + // Counter reached 0: attempt specialization for future calls + self.specialize_load_attr(vm, oparg, instr_idx, cache_base); + } + + // Execute slow path for this call + self.load_attr_slow(vm, oparg) + } + + fn specialize_load_attr( + &mut self, + _vm: &VirtualMachine, + oparg: LoadAttr, + instr_idx: usize, + cache_base: usize, + ) { + let obj = self.top_value(); + let cls = obj.class(); + + // Only specialize if getattro is the default (PyBaseObject::getattro) + let is_default_getattro = cls + .slots + .getattro + .load() + .is_some_and(|f| f as usize == PyBaseObject::getattro as *const () as usize); + if !is_default_getattro { + unsafe { + self.code + .instructions + .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return; + } + + // Get or assign type version + let mut type_version = cls.tp_version_tag.load(Acquire); + if type_version == 0 { + type_version = cls.assign_version_tag(); + } + if type_version == 0 { + // Version counter overflow + return; + } + + let attr_name = self.code.names[oparg.name_idx() as usize]; + + // Look up attr in class via MRO + let cls_attr = cls.get_attr(attr_name); + let has_dict = obj.dict().is_some(); + + if oparg.is_method() { + // Method specialization + if let Some(ref descr) = cls_attr + && descr + .class() + .slots + .flags + .has_feature(PyTypeFlags::METHOD_DESCRIPTOR) + { + let descr_ptr = &**descr as *const PyObject as u64; + unsafe { + self.code + .instructions + .write_cache_u32(cache_base + 1, type_version); + self.code + .instructions + .write_cache_u64(cache_base + 5, descr_ptr); + } + + let new_op = if !has_dict { + Instruction::LoadAttrMethodNoDict + } else { + Instruction::LoadAttrMethodWithValues + }; + unsafe { + self.code.instructions.replace_op(instr_idx, new_op); + } + return; + } + // Can't specialize this method call + unsafe { + self.code + .instructions + .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } else { + // Regular attribute access + let has_data_descr = cls_attr.as_ref().is_some_and(|descr| { + let descr_cls = descr.class(); + descr_cls.slots.descr_get.load().is_some() + && descr_cls.slots.descr_set.load().is_some() + }); + + if !has_data_descr && has_dict { + // Instance attribute access — skip class descriptor check + unsafe { + self.code + .instructions + .write_cache_u32(cache_base + 1, type_version); + self.code + .instructions + .replace_op(instr_idx, Instruction::LoadAttrInstanceValue); + } + } else { + // Data descriptor or no dict — can't easily specialize + unsafe { + self.code + .instructions + .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + } + } + + fn load_attr_slow(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult { let attr_name = self.code.names[oparg.name_idx() as usize]; let parent = self.pop_value(); diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs index b48045f2163..41ddfa26b2e 100644 --- a/crates/vm/src/object/core.rs +++ b/crates/vm/src/object/core.rs @@ -1927,6 +1927,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) { attributes: PyRwLock::new(Default::default()), slots: PyType::make_slots(), heaptype_ext: None, + tp_version_tag: core::sync::atomic::AtomicU32::new(0), }; let object_payload = PyType { base: None, @@ -1936,6 +1937,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) { attributes: PyRwLock::new(Default::default()), slots: object::PyBaseObject::make_slots(), heaptype_ext: None, + tp_version_tag: core::sync::atomic::AtomicU32::new(0), }; let type_type_ptr = Box::into_raw(Box::new(partially_init!( PyInner:: { @@ -1997,6 +1999,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) { attributes: PyRwLock::default(), slots: PyWeak::make_slots(), heaptype_ext: None, + tp_version_tag: core::sync::atomic::AtomicU32::new(0), }; let weakref_type = PyRef::new_ref(weakref_type, type_type.clone(), None); // Static type: untrack from GC (was tracked by new_ref because PyType has HAS_TRAVERSE) From ff073c865dad8370afbe05685d999e282d81d7dd Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Sun, 1 Mar 2026 19:16:54 +0900 Subject: [PATCH 2/6] Add BINARY_OP and CALL adaptive specialization BINARY_OP: Specialize int add/subtract/multiply and float add/subtract/multiply with type guards and deoptimization. CALL: Add func_version to PyFunction, specialize simple function calls (CallPyExactArgs, CallBoundMethodExactArgs) with invoke_exact_args fast path that skips FuncArgs allocation and fill_locals_from_args. --- .cspell.json | 2 + crates/codegen/src/ir.rs | 6 +- crates/compiler-core/src/bytecode.rs | 52 +++- crates/vm/src/builtins/function.rs | 78 +++++- crates/vm/src/frame.rs | 371 +++++++++++++++++++++++++-- 5 files changed, 482 insertions(+), 27 deletions(-) diff --git a/.cspell.json b/.cspell.json index bbc13e6fded..0d41568618a 100644 --- a/.cspell.json +++ b/.cspell.json @@ -60,6 +60,7 @@ "dedentations", "dedents", "deduped", + "deoptimize", "downcastable", "downcasted", "dumpable", @@ -73,6 +74,7 @@ "interps", "jitted", "jitting", + "kwonly", "lossily", "makeunicodedata", "microbenchmark", diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs index b21af84a51c..a129b84956f 100644 --- a/crates/codegen/src/ir.rs +++ b/crates/codegen/src/ir.rs @@ -499,7 +499,11 @@ impl CodeInfo { qualname: qualname.unwrap_or(obj_name), max_stackdepth, - instructions: CodeUnits::from(instructions), + instructions: { + let units = CodeUnits::from(instructions); + units.init_adaptive_counters(); + units + }, locations: locations.into_boxed_slice(), constants: constants.into_iter().collect(), names: name_cache.into_iter().collect(), diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index a2c764148ac..c34617c686d 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -344,9 +344,9 @@ pub struct CodeUnit { const _: () = assert!(mem::size_of::() == 2); /// Adaptive specialization: number of executions before attempting specialization. -pub const ADAPTIVE_WARMUP_VALUE: u16 = 50; +pub const ADAPTIVE_WARMUP_VALUE: u8 = 50; /// Adaptive specialization: backoff counter after de-optimization. -pub const ADAPTIVE_BACKOFF_VALUE: u16 = 250; +pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250; impl CodeUnit { pub const fn new(op: Instruction, arg: OpArgByte) -> Self { @@ -396,7 +396,12 @@ impl TryFrom<&[u8]> for CodeUnits { return Err(Self::Error::InvalidBytecode); } - value.chunks_exact(2).map(CodeUnit::try_from).collect() + let units: Self = value + .chunks_exact(2) + .map(CodeUnit::try_from) + .collect::>()?; + units.init_adaptive_counters(); + Ok(units) } } @@ -503,6 +508,47 @@ impl CodeUnits { let hi = self.read_cache_u32(index + 2) as u64; lo | (hi << 32) } + + /// Read the adaptive counter from the first CACHE entry's `arg` byte. + /// This preserves `op = Instruction::Cache`, unlike `read_cache_u16`. + pub fn read_adaptive_counter(&self, index: usize) -> u8 { + let units = unsafe { &*self.0.get() }; + u8::from(units[index].arg) + } + + /// Write the adaptive counter to the first CACHE entry's `arg` byte. + /// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`. + /// + /// # Safety + /// - `index` must be in bounds and point to a CACHE entry. + pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) { + let units = unsafe { &mut *self.0.get() }; + units[index].arg = OpArgByte::from(value); + } + + /// Initialize adaptive warmup counters for all instructions that have caches. + /// The counter is stored in the `arg` byte of the first CACHE entry, + /// preserving `op = Instruction::Cache`. + pub fn init_adaptive_counters(&self) { + let units = unsafe { &*self.0.get() }; + let len = units.len(); + let mut i = 0; + while i < len { + let op = units[i].op; + let caches = op.cache_entries(); + if caches > 0 { + let cache_base = i + 1; + if cache_base < len { + unsafe { + self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE); + } + } + i += 1 + caches; + } else { + i += 1; + } + } + } } /// A Constant (which usually encapsulates data within it) diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 58f818cc7a7..84ef7933027 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -22,6 +22,7 @@ use crate::{ Callable, Comparable, Constructor, GetAttr, GetDescriptor, PyComparisonOp, Representable, }, }; +use core::sync::atomic::{AtomicU32, Ordering::Relaxed}; use itertools::Itertools; #[cfg(feature = "jit")] use rustpython_jit::CompiledCode; @@ -72,10 +73,13 @@ pub struct PyFunction { annotate: PyMutex>, module: PyMutex, doc: PyMutex, + func_version: AtomicU32, #[cfg(feature = "jit")] jitted_code: OnceCell, } +static FUNC_VERSION_COUNTER: AtomicU32 = AtomicU32::new(1); + unsafe impl Traverse for PyFunction { fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) { self.globals.traverse(tracer_fn); @@ -200,6 +204,7 @@ impl PyFunction { annotate: PyMutex::new(None), module: PyMutex::new(module), doc: PyMutex::new(doc), + func_version: AtomicU32::new(FUNC_VERSION_COUNTER.fetch_add(1, Relaxed)), #[cfg(feature = "jit")] jitted_code: OnceCell::new(), }; @@ -593,6 +598,66 @@ impl Py { pub fn invoke(&self, func_args: FuncArgs, vm: &VirtualMachine) -> PyResult { self.invoke_with_locals(func_args, None, vm) } + + /// Returns the function version, or 0 if invalidated. + #[inline] + pub fn func_version(&self) -> u32 { + self.func_version.load(Relaxed) + } + + /// Check if this function is eligible for exact-args call specialization. + /// Returns true if: no VARARGS, no VARKEYWORDS, no kwonly args, not generator/coroutine, + /// and effective_nargs matches co_argcount. + pub(crate) fn can_specialize_call(&self, effective_nargs: u32) -> bool { + let code = self.code.lock(); + let flags = code.flags; + !flags.intersects( + bytecode::CodeFlags::VARARGS + | bytecode::CodeFlags::VARKEYWORDS + | bytecode::CodeFlags::GENERATOR + | bytecode::CodeFlags::COROUTINE, + ) && code.kwonlyarg_count == 0 + && code.arg_count == effective_nargs + } + + /// Fast path for calling a simple function with exact positional args. + /// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args. + /// Only valid when: no VARARGS, no VARKEYWORDS, no kwonlyargs, not generator/coroutine, + /// and nargs == co_argcount. + pub fn invoke_exact_args(&self, args: &[PyObjectRef], vm: &VirtualMachine) -> PyResult { + let code = self.code.lock().clone(); + + let locals = ArgMapping::from_dict_exact(vm.ctx.new_dict()); + + let frame = Frame::new( + code.clone(), + Scope::new(Some(locals), self.globals.clone()), + self.builtins.clone(), + self.closure.as_ref().map_or(&[], |c| c.as_slice()), + Some(self.to_owned().into()), + vm, + ) + .into_ref(&vm.ctx); + + // Copy args directly into fastlocals + { + let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + for (i, arg) in args.iter().enumerate() { + fastlocals[i] = Some(arg.clone()); + } + } + + // Handle cell2arg + if let Some(cell2arg) = code.cell2arg.as_deref() { + let fastlocals = unsafe { frame.fastlocals.borrow_mut() }; + for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) { + let x = fastlocals[*arg_idx as usize].take(); + frame.set_cell_contents(cell_idx, x); + } + } + + vm.run_frame(frame) + } } impl PyPayload for PyFunction { @@ -615,12 +680,7 @@ impl PyFunction { #[pygetset(setter)] fn set___code__(&self, code: PyRef) { *self.code.lock() = code; - // TODO: jit support - // #[cfg(feature = "jit")] - // { - // // If available, clear cached compiled code. - // let _ = self.jitted_code.take(); - // } + self.func_version.store(0, Relaxed); } #[pygetset] @@ -629,7 +689,8 @@ impl PyFunction { } #[pygetset(setter)] fn set___defaults__(&self, defaults: Option) { - self.defaults_and_kwdefaults.lock().0 = defaults + self.defaults_and_kwdefaults.lock().0 = defaults; + self.func_version.store(0, Relaxed); } #[pygetset] @@ -638,7 +699,8 @@ impl PyFunction { } #[pygetset(setter)] fn set___kwdefaults__(&self, kwdefaults: Option) { - self.defaults_and_kwdefaults.lock().1 = kwdefaults + self.defaults_and_kwdefaults.lock().1 = kwdefaults; + self.func_version.store(0, Relaxed); } // {"__closure__", T_OBJECT, OFF(func_closure), READONLY}, diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index c872b1a7d12..c96065fd9e4 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -5,8 +5,8 @@ use crate::{ TryFromObject, VirtualMachine, builtins::{ PyBaseException, PyBaseExceptionRef, PyBaseObject, PyCode, PyCoroutine, PyDict, PyDictRef, - PyGenerator, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, - PyTraceback, PyType, PyUtf8Str, + PyFloat, PyGenerator, PyInt, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, + PyTemplate, PyTraceback, PyType, PyUtf8Str, asyncgenerator::PyAsyncGenWrappedValue, float::PyFloat, frame::stack_analysis, @@ -1126,7 +1126,24 @@ impl ExecutingFrame<'_> { } match instruction { - Instruction::BinaryOp { op } => self.execute_bin_op(vm, op.get(arg)), + Instruction::BinaryOp { op } => { + let op_val = op.get(arg); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + + let counter = self.code.instructions.read_adaptive_counter(cache_base); + if counter > 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, counter - 1); + } + } else { + self.specialize_binary_op(vm, op_val, instr_idx, cache_base); + } + + self.execute_bin_op(vm, op_val) + } // TODO: In CPython, this does in-place unicode concatenation when // refcount is 1. Falls back to regular iadd for now. Instruction::BinaryOpInplaceAddUnicode => { @@ -1241,7 +1258,20 @@ impl ExecutingFrame<'_> { } Instruction::Call { nargs } => { // Stack: [callable, self_or_null, arg1, ..., argN] - let args = self.collect_positional_args(nargs.get(arg)); + let nargs_val = nargs.get(arg); + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let counter = self.code.instructions.read_adaptive_counter(cache_base); + if counter > 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, counter - 1); + } + } else { + self.specialize_call(vm, nargs_val, instr_idx, cache_base); + } + let args = self.collect_positional_args(nargs_val); self.execute_call(args, vm) } Instruction::CallKw { nargs } => { @@ -2655,7 +2685,7 @@ impl ExecutingFrame<'_> { let owner = self.top_value(); let type_version = self.code.instructions.read_cache_u32(cache_base + 1); - if owner.class().tp_version_tag.load(Acquire) == type_version { + if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { // Cache hit: load the cached method descriptor let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); @@ -2671,7 +2701,7 @@ impl ExecutingFrame<'_> { .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); self.code .instructions - .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); } self.load_attr_slow(vm, oparg) } @@ -2685,7 +2715,7 @@ impl ExecutingFrame<'_> { let owner = self.top_value(); let type_version = self.code.instructions.read_cache_u32(cache_base + 1); - if owner.class().tp_version_tag.load(Acquire) == type_version { + if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { // Check instance dict doesn't shadow the method let shadowed = if let Some(dict) = owner.dict() { dict.get_item_opt(attr_name, vm).ok().flatten().is_some() @@ -2710,7 +2740,7 @@ impl ExecutingFrame<'_> { .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); self.code .instructions - .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); } self.load_attr_slow(vm, oparg) } @@ -2723,7 +2753,7 @@ impl ExecutingFrame<'_> { let owner = self.top_value(); let type_version = self.code.instructions.read_cache_u32(cache_base + 1); - if owner.class().tp_version_tag.load(Acquire) == type_version { + if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { // Type version matches — no data descriptor for this attr. // Try direct dict lookup, skipping full descriptor protocol. if let Some(dict) = owner.dict() @@ -2742,10 +2772,186 @@ impl ExecutingFrame<'_> { .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() }); self.code .instructions - .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); } self.load_attr_slow(vm, oparg) } + // Specialized BINARY_OP opcodes + Instruction::BinaryOpAddInt => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_int), Some(b_int)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_int.as_bigint() + b_int.as_bigint(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_bigint(&result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Add); + self.execute_bin_op(vm, bytecode::BinaryOperator::Add) + } + } + Instruction::BinaryOpSubtractInt => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_int), Some(b_int)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_int.as_bigint() - b_int.as_bigint(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_bigint(&result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract); + self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract) + } + } + Instruction::BinaryOpMultiplyInt => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_int), Some(b_int)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_int.as_bigint() * b_int.as_bigint(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_bigint(&result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply); + self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply) + } + } + Instruction::BinaryOpAddFloat => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_f), Some(b_f)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_f.to_f64() + b_f.to_f64(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_float(result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Add); + self.execute_bin_op(vm, bytecode::BinaryOperator::Add) + } + } + Instruction::BinaryOpSubtractFloat => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_f), Some(b_f)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_f.to_f64() - b_f.to_f64(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_float(result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract); + self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract) + } + } + Instruction::BinaryOpMultiplyFloat => { + let b = self.top_value(); + let a = self.nth_value(1); + if let (Some(a_f), Some(b_f)) = ( + a.downcast_ref_if_exact::(vm), + b.downcast_ref_if_exact::(vm), + ) { + let result = a_f.to_f64() * b_f.to_f64(); + self.pop_value(); + self.pop_value(); + self.push_value(vm.ctx.new_float(result).into()); + Ok(None) + } else { + self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply); + self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply) + } + } + Instruction::CallPyExactArgs => { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let cached_version = self.code.instructions.read_cache_u32(cache_base + 1); + let nargs: u32 = arg.into(); + // Stack: [callable, self_or_null, arg1, ..., argN] + let callable = self.nth_value(nargs + 1); + if let Some(func) = callable.downcast_ref::() + && func.func_version() == cached_version + && cached_version != 0 + { + let args: Vec = self.pop_multiple(nargs as usize).collect(); + let _null = self.pop_value_opt(); // self_or_null (NULL) + let callable = self.pop_value(); + let func = callable.downcast_ref::().unwrap(); + let result = func.invoke_exact_args(&args, vm)?; + self.push_value(result); + Ok(None) + } else { + // Deoptimize + unsafe { + self.code.instructions.replace_op( + instr_idx, + Instruction::Call { + nargs: Arg::marker(), + }, + ); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + let args = self.collect_positional_args(nargs); + self.execute_call(args, vm) + } + } + Instruction::CallBoundMethodExactArgs => { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + let cached_version = self.code.instructions.read_cache_u32(cache_base + 1); + let nargs: u32 = arg.into(); + // Stack: [callable, self_val, arg1, ..., argN] + let callable = self.nth_value(nargs + 1); + if let Some(func) = callable.downcast_ref::() + && func.func_version() == cached_version + && cached_version != 0 + { + let pos_args: Vec = self.pop_multiple(nargs as usize).collect(); + let self_val = self.pop_value(); + let callable = self.pop_value(); + let func = callable.downcast_ref::().unwrap(); + let mut all_args = Vec::with_capacity(pos_args.len() + 1); + all_args.push(self_val); + all_args.extend(pos_args); + let result = func.invoke_exact_args(&all_args, vm)?; + self.push_value(result); + Ok(None) + } else { + // Deoptimize + unsafe { + self.code.instructions.replace_op( + instr_idx, + Instruction::Call { + nargs: Arg::marker(), + }, + ); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + let args = self.collect_positional_args(nargs); + self.execute_call(args, vm) + } + } // All INSTRUMENTED_* opcodes delegate to a cold function to keep // the hot instruction loop free of monitoring overhead. _ => self.execute_instrumented(instruction, arg, vm), @@ -4217,12 +4423,12 @@ impl ExecutingFrame<'_> { let cache_base = instr_idx + 1; // Decrement adaptive counter - let counter = self.code.instructions.read_cache_u16(cache_base); + let counter = self.code.instructions.read_adaptive_counter(cache_base); if counter > 0 { unsafe { self.code .instructions - .write_cache_u16(cache_base, counter - 1); + .write_adaptive_counter(cache_base, counter - 1); } } else { // Counter reached 0: attempt specialization for future calls @@ -4253,7 +4459,7 @@ impl ExecutingFrame<'_> { unsafe { self.code .instructions - .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); } return; } @@ -4307,7 +4513,7 @@ impl ExecutingFrame<'_> { unsafe { self.code .instructions - .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); } } else { // Regular attribute access @@ -4332,7 +4538,7 @@ impl ExecutingFrame<'_> { unsafe { self.code .instructions - .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE); + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); } } } @@ -4363,6 +4569,141 @@ impl ExecutingFrame<'_> { Ok(None) } + fn specialize_binary_op( + &mut self, + vm: &VirtualMachine, + op: bytecode::BinaryOperator, + instr_idx: usize, + cache_base: usize, + ) { + let b = self.top_value(); + let a = self.nth_value(1); + + let new_op = match op { + bytecode::BinaryOperator::Add => { + if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpAddInt) + } else if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpAddFloat) + } else { + None + } + } + bytecode::BinaryOperator::Subtract => { + if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpSubtractInt) + } else if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpSubtractFloat) + } else { + None + } + } + bytecode::BinaryOperator::Multiply => { + if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpMultiplyInt) + } else if a.downcast_ref_if_exact::(vm).is_some() + && b.downcast_ref_if_exact::(vm).is_some() + { + Some(Instruction::BinaryOpMultiplyFloat) + } else { + None + } + } + _ => None, + }; + + if let Some(new_op) = new_op { + unsafe { + self.code.instructions.replace_op(instr_idx, new_op); + } + } else { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + } + + fn deoptimize_binary_op(&mut self, _op: bytecode::BinaryOperator) { + let instr_idx = self.lasti() as usize - 1; + let cache_base = instr_idx + 1; + unsafe { + self.code + .instructions + .replace_op(instr_idx, Instruction::BinaryOp { op: Arg::marker() }); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + + fn specialize_call( + &mut self, + _vm: &VirtualMachine, + nargs: u32, + instr_idx: usize, + cache_base: usize, + ) { + // Stack: [callable, self_or_null, arg1, ..., argN] + // callable is at position nargs + 1 from top + // self_or_null is at position nargs from top + let stack = &self.state.stack; + let stack_len = stack.len(); + let self_or_null_is_some = stack[stack_len - nargs as usize - 1].is_some(); + let callable = self.nth_value(nargs + 1); + + if let Some(func) = callable.downcast_ref::() { + let version = func.func_version(); + if version == 0 { + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return; + } + + let effective_nargs = if self_or_null_is_some { + nargs + 1 + } else { + nargs + }; + + if func.can_specialize_call(effective_nargs) { + let new_op = if self_or_null_is_some { + Instruction::CallBoundMethodExactArgs + } else { + Instruction::CallPyExactArgs + }; + unsafe { + self.code.instructions.replace_op(instr_idx, new_op); + // Store func_version in cache (after counter) + self.code + .instructions + .write_cache_u32(cache_base + 1, version); + } + return; + } + } + + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + } + fn load_super_attr(&mut self, vm: &VirtualMachine, oparg: LoadSuperAttr) -> FrameResult { let attr_name = self.code.names[oparg.name_idx() as usize]; From 0fa6fa7dfd283e8af2946d36d5bde477c4f20e4f Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Sun, 1 Mar 2026 21:50:27 +0900 Subject: [PATCH 3/6] Lazy quickening for adaptive specialization counters Move counter initialization from compile-time to RESUME execution, matching CPython's _PyCode_Quicken pattern. Store counter in CACHE entry's arg byte to preserve op=Instruction::Cache for dis/JIT. Add PyCode.quickened flag for one-time initialization. --- crates/codegen/src/ir.rs | 6 +----- crates/compiler-core/src/bytecode.rs | 9 ++++----- crates/vm/src/builtins/code.rs | 3 +++ crates/vm/src/frame.rs | 11 ++++++++--- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs index a129b84956f..b21af84a51c 100644 --- a/crates/codegen/src/ir.rs +++ b/crates/codegen/src/ir.rs @@ -499,11 +499,7 @@ impl CodeInfo { qualname: qualname.unwrap_or(obj_name), max_stackdepth, - instructions: { - let units = CodeUnits::from(instructions); - units.init_adaptive_counters(); - units - }, + instructions: CodeUnits::from(instructions), locations: locations.into_boxed_slice(), constants: constants.into_iter().collect(), names: name_cache.into_iter().collect(), diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index c34617c686d..3b5d684435c 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -400,7 +400,6 @@ impl TryFrom<&[u8]> for CodeUnits { .chunks_exact(2) .map(CodeUnit::try_from) .collect::>()?; - units.init_adaptive_counters(); Ok(units) } } @@ -526,10 +525,10 @@ impl CodeUnits { units[index].arg = OpArgByte::from(value); } - /// Initialize adaptive warmup counters for all instructions that have caches. - /// The counter is stored in the `arg` byte of the first CACHE entry, - /// preserving `op = Instruction::Cache`. - pub fn init_adaptive_counters(&self) { + /// Initialize adaptive warmup counters for all cacheable instructions. + /// Called lazily at RESUME (first execution of a code object). + /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`. + pub fn quicken(&self) { let units = unsafe { &*self.0.get() }; let len = units.len(); let mut i = 0; diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs index 1708477004e..f04f6607417 100644 --- a/crates/vm/src/builtins/code.rs +++ b/crates/vm/src/builtins/code.rs @@ -346,6 +346,8 @@ pub struct PyCode { pub instrumentation_version: AtomicU64, /// Side-table for INSTRUMENTED_LINE / INSTRUMENTED_INSTRUCTION. pub monitoring_data: PyMutex>, + /// Whether adaptive counters have been initialized (lazy quickening). + pub quickened: core::sync::atomic::AtomicBool, } impl Deref for PyCode { @@ -363,6 +365,7 @@ impl PyCode { source_path: AtomicPtr::new(sp), instrumentation_version: AtomicU64::new(0), monitoring_data: PyMutex::new(None), + quickened: core::sync::atomic::AtomicBool::new(false), } } diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index c96065fd9e4..c84723be139 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -2314,6 +2314,14 @@ impl ExecutingFrame<'_> { } Instruction::RaiseVarargs { kind } => self.execute_raise(vm, kind.get(arg)), Instruction::Resume { .. } => { + // Lazy quickening: initialize adaptive counters on first execution + if !self + .code + .quickened + .swap(true, atomic::Ordering::Relaxed) + { + self.code.instructions.quicken(); + } // Check if bytecode needs re-instrumentation let global_ver = vm .state @@ -4422,7 +4430,6 @@ impl ExecutingFrame<'_> { let instr_idx = self.lasti() as usize - 1; let cache_base = instr_idx + 1; - // Decrement adaptive counter let counter = self.code.instructions.read_adaptive_counter(cache_base); if counter > 0 { unsafe { @@ -4431,11 +4438,9 @@ impl ExecutingFrame<'_> { .write_adaptive_counter(cache_base, counter - 1); } } else { - // Counter reached 0: attempt specialization for future calls self.specialize_load_attr(vm, oparg, instr_idx, cache_base); } - // Execute slow path for this call self.load_attr_slow(vm, oparg) } From 5f231618eb9cf42e471adfe1d9c4436b252486ce Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Sun, 1 Mar 2026 23:16:22 +0900 Subject: [PATCH 4/6] Add Instruction::deoptimize() and CodeUnits::original_bytes() - deoptimize() maps specialized opcodes back to their base adaptive variant - original_bytes() produces deoptimized bytecode with zeroed CACHE entries - co_code now returns deoptimized bytes, _co_code_adaptive returns current bytes - Marshal serialization uses original_bytes() instead of raw transmute --- crates/compiler-core/src/bytecode.rs | 24 ++++ .../compiler-core/src/bytecode/instruction.rs | 113 ++++++++++++++++++ crates/compiler-core/src/marshal.rs | 5 +- crates/vm/src/builtins/code.rs | 13 +- crates/vm/src/frame.rs | 6 +- 5 files changed, 146 insertions(+), 15 deletions(-) diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index 3b5d684435c..d7bcaa2c6ef 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -525,6 +525,30 @@ impl CodeUnits { units[index].arg = OpArgByte::from(value); } + /// Produce a clean copy of the bytecode suitable for serialization + /// (marshal) and `co_code`. Specialized opcodes are mapped back to their + /// base variants via `deoptimize()` and all CACHE entries are zeroed. + pub fn original_bytes(&self) -> Vec { + let units = unsafe { &*self.0.get() }; + let mut out = Vec::with_capacity(units.len() * 2); + let len = units.len(); + let mut i = 0; + while i < len { + let op = units[i].op.deoptimize(); + let caches = op.cache_entries(); + out.push(u8::from(op)); + out.push(u8::from(units[i].arg)); + // Zero-fill all CACHE entries (counter + cached data) + for _ in 0..caches { + i += 1; + out.push(0); // op = Cache = 0 + out.push(0); // arg = 0 + } + i += 1; + } + out + } + /// Initialize adaptive warmup counters for all cacheable instructions. /// Called lazily at RESUME (first execution of a code object). /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`. diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs index c1c5e8cd847..c93feaaa9ef 100644 --- a/crates/compiler-core/src/bytecode/instruction.rs +++ b/crates/compiler-core/src/bytecode/instruction.rs @@ -512,6 +512,119 @@ impl Instruction { }) } + /// Map a specialized opcode back to its adaptive (base) variant. + /// `_PyOpcode_Deopt` + pub fn deoptimize(self) -> Self { + match self { + // LOAD_ATTR specializations + Self::LoadAttrClass + | Self::LoadAttrClassWithMetaclassCheck + | Self::LoadAttrGetattributeOverridden + | Self::LoadAttrInstanceValue + | Self::LoadAttrMethodLazyDict + | Self::LoadAttrMethodNoDict + | Self::LoadAttrMethodWithValues + | Self::LoadAttrModule + | Self::LoadAttrNondescriptorNoDict + | Self::LoadAttrNondescriptorWithValues + | Self::LoadAttrProperty + | Self::LoadAttrSlot + | Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() }, + // BINARY_OP specializations + Self::BinaryOpAddFloat + | Self::BinaryOpAddInt + | Self::BinaryOpAddUnicode + | Self::BinaryOpExtend + | Self::BinaryOpInplaceAddUnicode + | Self::BinaryOpMultiplyFloat + | Self::BinaryOpMultiplyInt + | Self::BinaryOpSubscrDict + | Self::BinaryOpSubscrGetitem + | Self::BinaryOpSubscrListInt + | Self::BinaryOpSubscrListSlice + | Self::BinaryOpSubscrStrInt + | Self::BinaryOpSubscrTupleInt + | Self::BinaryOpSubtractFloat + | Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() }, + // CALL specializations + Self::CallAllocAndEnterInit + | Self::CallBoundMethodExactArgs + | Self::CallBoundMethodGeneral + | Self::CallBuiltinClass + | Self::CallBuiltinFast + | Self::CallBuiltinFastWithKeywords + | Self::CallBuiltinO + | Self::CallIsinstance + | Self::CallLen + | Self::CallListAppend + | Self::CallMethodDescriptorFast + | Self::CallMethodDescriptorFastWithKeywords + | Self::CallMethodDescriptorNoargs + | Self::CallMethodDescriptorO + | Self::CallNonPyGeneral + | Self::CallPyExactArgs + | Self::CallPyGeneral + | Self::CallStr1 + | Self::CallTuple1 + | Self::CallType1 => Self::Call { + nargs: Arg::marker(), + }, + // CALL_KW specializations + Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw { + nargs: Arg::marker(), + }, + // TO_BOOL specializations + Self::ToBoolAlwaysTrue + | Self::ToBoolBool + | Self::ToBoolInt + | Self::ToBoolList + | Self::ToBoolNone + | Self::ToBoolStr => Self::ToBool, + // COMPARE_OP specializations + Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => { + Self::CompareOp { op: Arg::marker() } + } + // CONTAINS_OP specializations + Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()), + // FOR_ITER specializations + Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => { + Self::ForIter { + target: Arg::marker(), + } + } + // LOAD_GLOBAL specializations + Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()), + // STORE_ATTR specializations + Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => { + Self::StoreAttr { idx: Arg::marker() } + } + // LOAD_SUPER_ATTR specializations + Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => { + Self::LoadSuperAttr { arg: Arg::marker() } + } + // STORE_SUBSCR specializations + Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr, + // UNPACK_SEQUENCE specializations + Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => { + Self::UnpackSequence { + size: Arg::marker(), + } + } + // SEND specializations + Self::SendGen => Self::Send { + target: Arg::marker(), + }, + // LOAD_CONST specializations + Self::LoadConstImmortal | Self::LoadConstMortal => { + Self::LoadConst { idx: Arg::marker() } + } + // RESUME specializations + Self::ResumeCheck => Self::Resume { arg: Arg::marker() }, + // Everything else maps to itself + _ => self, + } + } + /// Number of CACHE code units that follow this instruction. /// _PyOpcode_Caches pub fn cache_entries(self) -> usize { diff --git a/crates/compiler-core/src/marshal.rs b/crates/compiler-core/src/marshal.rs index 11df127920a..310bad9d868 100644 --- a/crates/compiler-core/src/marshal.rs +++ b/crates/compiler-core/src/marshal.rs @@ -662,9 +662,8 @@ pub fn serialize_value( pub fn serialize_code(buf: &mut W, code: &CodeObject) { write_len(buf, code.instructions.len()); - // SAFETY: it's ok to transmute CodeUnit to [u8; 2] - let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() }; - buf.write_slice(instructions_bytes); + let original = code.instructions.original_bytes(); + buf.write_slice(&original); write_len(buf, code.locations.len()); for (start, end) in &*code.locations { diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs index f04f6607417..126d0216546 100644 --- a/crates/vm/src/builtins/code.rs +++ b/crates/vm/src/builtins/code.rs @@ -684,7 +684,12 @@ impl PyCode { #[pygetset] pub fn co_code(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef { - // SAFETY: CodeUnit is #[repr(C)] with size 2, so we can safely transmute to bytes + vm.ctx.new_bytes(self.code.instructions.original_bytes()) + } + + #[pygetset] + pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef { + // Return current (possibly quickened/specialized) bytecode let bytes = unsafe { core::slice::from_raw_parts( self.code.instructions.as_ptr() as *const u8, @@ -694,12 +699,6 @@ impl PyCode { vm.ctx.new_bytes(bytes.to_vec()) } - #[pygetset] - pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef { - // RustPython doesn't have adaptive/specialized bytecode, so return regular co_code - self.co_code(vm) - } - #[pygetset] pub fn co_freevars(&self, vm: &VirtualMachine) -> PyTupleRef { let names = self diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index c84723be139..656be91ad4a 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -2315,11 +2315,7 @@ impl ExecutingFrame<'_> { Instruction::RaiseVarargs { kind } => self.execute_raise(vm, kind.get(arg)), Instruction::Resume { .. } => { // Lazy quickening: initialize adaptive counters on first execution - if !self - .code - .quickened - .swap(true, atomic::Ordering::Relaxed) - { + if !self.code.quickened.swap(true, atomic::Ordering::Relaxed) { self.code.instructions.quicken(); } // Check if bytecode needs re-instrumentation From a831e4f6938e6809eed3dd60c153d46d8bedd507 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Mon, 2 Mar 2026 01:21:31 +0900 Subject: [PATCH 5/6] Fix monitoring and specialization interaction - cache_entries() returns correct count for instrumented opcodes - deoptimize() maps instrumented opcodes back to base - quicken() skips adaptive counter for instrumented opcodes - instrument_code Phase 3 deoptimizes specialized opcodes and clears CACHE entries to prevent stale pointer dereferences --- .cspell.dict/cpython.txt | 1 + crates/compiler-core/src/bytecode.rs | 12 +++++--- .../compiler-core/src/bytecode/instruction.rs | 14 +++++++--- crates/vm/src/stdlib/sys/monitoring.rs | 28 +++++++++++++++---- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/.cspell.dict/cpython.txt b/.cspell.dict/cpython.txt index 819d6875b58..7681760ea65 100644 --- a/.cspell.dict/cpython.txt +++ b/.cspell.dict/cpython.txt @@ -44,6 +44,7 @@ copyslot cpucount defaultdict denom +deopt dictbytype DICTFLAG dictoffset diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index d7bcaa2c6ef..3f50120df17 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -560,10 +560,14 @@ impl CodeUnits { let op = units[i].op; let caches = op.cache_entries(); if caches > 0 { - let cache_base = i + 1; - if cache_base < len { - unsafe { - self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE); + // Don't write adaptive counter for instrumented opcodes; + // specialization is skipped while monitoring is active. + if !op.is_instrumented() { + let cache_base = i + 1; + if cache_base < len { + unsafe { + self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE); + } } } i += 1 + caches; diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs index c93feaaa9ef..a7d030b6094 100644 --- a/crates/compiler-core/src/bytecode/instruction.rs +++ b/crates/compiler-core/src/bytecode/instruction.rs @@ -620,8 +620,11 @@ impl Instruction { } // RESUME specializations Self::ResumeCheck => Self::Resume { arg: Arg::marker() }, - // Everything else maps to itself - _ => self, + // Instrumented opcodes map back to their base + _ => match self.to_base() { + Some(base) => base, + None => self, + }, } } @@ -739,8 +742,11 @@ impl Instruction { | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => 1, - // Everything else: 0 cache entries - _ => 0, + // Instrumented opcodes have the same cache entries as their base + _ => match self.to_base() { + Some(base) => base.cache_entries(), + None => 0, + }, } } } diff --git a/crates/vm/src/stdlib/sys/monitoring.rs b/crates/vm/src/stdlib/sys/monitoring.rs index e223e249e54..a57a23fd9b3 100644 --- a/crates/vm/src/stdlib/sys/monitoring.rs +++ b/crates/vm/src/stdlib/sys/monitoring.rs @@ -270,13 +270,29 @@ pub fn instrument_code(code: &PyCode, events: u32) { } } - // Phase 3: Remove regular INSTRUMENTED_* → restore base opcodes - for i in 0..len { - let op = code.code.instructions[i].op; - if let Some(base) = op.to_base() { - unsafe { - code.code.instructions.replace_op(i, base); + // Phase 3: Remove regular INSTRUMENTED_* and specialized opcodes → restore base opcodes. + // Also clear all CACHE entries so specialization starts fresh. + { + let mut i = 0; + while i < len { + let op = code.code.instructions[i].op; + let de_opt = op.deoptimize(); + if u8::from(de_opt) != u8::from(op) { + unsafe { + code.code.instructions.replace_op(i, de_opt); + } + } + let caches = de_opt.cache_entries(); + // Zero all CACHE entries (the op+arg bytes may have been overwritten + // by specialization with arbitrary data like pointers). + for c in 1..=caches { + if i + c < len { + unsafe { + code.code.instructions.write_cache_u16(i + c, 0); + } + } } + i += 1 + caches; } } From 79ca9c11249699fc364bd586b94aeb7531351dcc Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Mon, 2 Mar 2026 11:50:30 +0900 Subject: [PATCH 6/6] Address review: bounds checks, UB fix, version overflow, error handling - Add bounds checks to read_cache_u16/u32/u64 - Fix quicken() aliasing UB by using &mut directly - Add JumpBackwardJit/JumpBackwardNoJit to deoptimize() - Guard can_specialize_call with NEWLOCALS flag check - Use compare_exchange_weak for version tag to prevent wraparound - Propagate dict lookup errors in LoadAttrMethodWithValues - Apply adaptive backoff on version tag assignment failure - Remove duplicate imports in frame.rs --- crates/compiler-core/src/bytecode.rs | 16 ++++++++--- .../compiler-core/src/bytecode/instruction.rs | 4 +++ crates/vm/src/builtins/function.rs | 14 +++++----- crates/vm/src/builtins/type.rs | 17 ++++++++---- crates/vm/src/frame.rs | 27 ++++++++++++++++--- crates/vm/src/stdlib/sys/monitoring.rs | 8 +++--- 6 files changed, 63 insertions(+), 23 deletions(-) diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index 3f50120df17..cec04b9edd9 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -466,8 +466,12 @@ impl CodeUnits { } /// Read a u16 value from a CACHE code unit at `index`. + /// + /// # Panics + /// Panics if `index` is out of bounds. pub fn read_cache_u16(&self, index: usize) -> u16 { let units = unsafe { &*self.0.get() }; + assert!(index < units.len(), "read_cache_u16: index out of bounds"); let ptr = units.as_ptr().wrapping_add(index) as *const u8; unsafe { core::ptr::read_unaligned(ptr as *const u16) } } @@ -484,6 +488,9 @@ impl CodeUnits { } /// Read a u32 value from two consecutive CACHE code units starting at `index`. + /// + /// # Panics + /// Panics if `index + 1` is out of bounds. pub fn read_cache_u32(&self, index: usize) -> u32 { let lo = self.read_cache_u16(index) as u32; let hi = self.read_cache_u16(index + 1) as u32; @@ -502,6 +509,9 @@ impl CodeUnits { } /// Read a u64 value from four consecutive CACHE code units starting at `index`. + /// + /// # Panics + /// Panics if `index + 3` is out of bounds. pub fn read_cache_u64(&self, index: usize) -> u64 { let lo = self.read_cache_u32(index) as u64; let hi = self.read_cache_u32(index + 2) as u64; @@ -553,7 +563,7 @@ impl CodeUnits { /// Called lazily at RESUME (first execution of a code object). /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`. pub fn quicken(&self) { - let units = unsafe { &*self.0.get() }; + let units = unsafe { &mut *self.0.get() }; let len = units.len(); let mut i = 0; while i < len { @@ -565,9 +575,7 @@ impl CodeUnits { if !op.is_instrumented() { let cache_base = i + 1; if cache_base < len { - unsafe { - self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE); - } + units[cache_base].arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE); } } i += 1 + caches; diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs index a7d030b6094..e7b13ff21d2 100644 --- a/crates/compiler-core/src/bytecode/instruction.rs +++ b/crates/compiler-core/src/bytecode/instruction.rs @@ -620,6 +620,10 @@ impl Instruction { } // RESUME specializations Self::ResumeCheck => Self::Resume { arg: Arg::marker() }, + // JUMP_BACKWARD specializations + Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward { + target: Arg::marker(), + }, // Instrumented opcodes map back to their base _ => match self.to_base() { Some(base) => base, diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs index 84ef7933027..489482d1933 100644 --- a/crates/vm/src/builtins/function.rs +++ b/crates/vm/src/builtins/function.rs @@ -611,12 +611,14 @@ impl Py { pub(crate) fn can_specialize_call(&self, effective_nargs: u32) -> bool { let code = self.code.lock(); let flags = code.flags; - !flags.intersects( - bytecode::CodeFlags::VARARGS - | bytecode::CodeFlags::VARKEYWORDS - | bytecode::CodeFlags::GENERATOR - | bytecode::CodeFlags::COROUTINE, - ) && code.kwonlyarg_count == 0 + flags.contains(bytecode::CodeFlags::NEWLOCALS) + && !flags.intersects( + bytecode::CodeFlags::VARARGS + | bytecode::CodeFlags::VARKEYWORDS + | bytecode::CodeFlags::GENERATOR + | bytecode::CodeFlags::COROUTINE, + ) + && code.kwonlyarg_count == 0 && code.arg_count == effective_nargs } diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index da2510f4f9a..b3a3c206c68 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -201,12 +201,19 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py, b: &Py) -> b impl PyType { /// Assign a fresh version tag. Returns 0 on overflow (all caches invalidated). pub fn assign_version_tag(&self) -> u32 { - let v = NEXT_TYPE_VERSION.fetch_add(1, Ordering::Relaxed); - if v == 0 { - return 0; + loop { + let current = NEXT_TYPE_VERSION.load(Ordering::Relaxed); + let Some(next) = current.checked_add(1) else { + return 0; // Overflow: version space exhausted + }; + if NEXT_TYPE_VERSION + .compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + self.tp_version_tag.store(current, Ordering::Release); + return current; + } } - self.tp_version_tag.store(v, Ordering::Release); - v } /// Invalidate this type's version tag and cascade to all subclasses. diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 656be91ad4a..08ce117fd48 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -8,10 +8,8 @@ use crate::{ PyFloat, PyGenerator, PyInt, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback, PyType, PyUtf8Str, asyncgenerator::PyAsyncGenWrappedValue, - float::PyFloat, frame::stack_analysis, function::{PyCell, PyCellRef, PyFunction}, - int::PyInt, range::PyRangeIterator, tuple::{PyTuple, PyTupleRef}, }, @@ -2722,7 +2720,23 @@ impl ExecutingFrame<'_> { if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { // Check instance dict doesn't shadow the method let shadowed = if let Some(dict) = owner.dict() { - dict.get_item_opt(attr_name, vm).ok().flatten().is_some() + match dict.get_item_opt(attr_name, vm) { + Ok(Some(_)) => true, + Ok(None) => false, + Err(_) => { + // Dict lookup error → deoptimize to safe path + unsafe { + self.code.instructions.replace_op( + instr_idx, + Instruction::LoadAttr { idx: Arg::marker() }, + ); + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } + return self.load_attr_slow(vm, oparg); + } + } } else { false }; @@ -4471,7 +4485,12 @@ impl ExecutingFrame<'_> { type_version = cls.assign_version_tag(); } if type_version == 0 { - // Version counter overflow + // Version counter overflow — backoff to avoid re-attempting every execution + unsafe { + self.code + .instructions + .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE); + } return; } diff --git a/crates/vm/src/stdlib/sys/monitoring.rs b/crates/vm/src/stdlib/sys/monitoring.rs index a57a23fd9b3..858ea83b8a7 100644 --- a/crates/vm/src/stdlib/sys/monitoring.rs +++ b/crates/vm/src/stdlib/sys/monitoring.rs @@ -276,13 +276,13 @@ pub fn instrument_code(code: &PyCode, events: u32) { let mut i = 0; while i < len { let op = code.code.instructions[i].op; - let de_opt = op.deoptimize(); - if u8::from(de_opt) != u8::from(op) { + let base_op = op.deoptimize(); + if u8::from(base_op) != u8::from(op) { unsafe { - code.code.instructions.replace_op(i, de_opt); + code.code.instructions.replace_op(i, base_op); } } - let caches = de_opt.cache_entries(); + let caches = base_op.cache_entries(); // Zero all CACHE entries (the op+arg bytes may have been overwritten // by specialization with arbitrary data like pointers). for c in 1..=caches {