From 023bbd4b85081835f4d442ad00e6202e1365673a Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Sun, 1 Mar 2026 15:44:14 +0900
Subject: [PATCH 1/6] Implement LOAD_ATTR inline caching with adaptive
 specialization

Add type version counter (tp_version_tag) to PyType with subclass
invalidation cascade. Add cache read/write methods (u16/u32/u64)
to CodeUnits. Implement adaptive specialization in load_attr that
replaces the opcode with specialized variants on first execution:

- LoadAttrMethodNoDict: cached method lookup for slotted types
- LoadAttrMethodWithValues: cached method with dict shadow check
- LoadAttrInstanceValue: direct dict lookup skipping descriptors

Specialized opcodes guard on tp_version_tag and deoptimize back
to generic LOAD_ATTR with backoff counter on cache miss.
---
 crates/codegen/src/ir.rs             |  12 +-
 crates/compiler-core/src/bytecode.rs |  62 +++++++
 crates/vm/src/builtins/type.rs       |  42 ++++-
 crates/vm/src/frame.rs               | 238 ++++++++++++++++++++++++++-
 crates/vm/src/object/core.rs         |   3 +
 5 files changed, 346 insertions(+), 11 deletions(-)
diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs
index 4363ffaa768..b21af84a51c 100644
--- a/crates/codegen/src/ir.rs
+++ b/crates/codegen/src/ir.rs
@@ -457,11 +457,13 @@ impl CodeInfo {
                             .map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte))
                             .chain([CodeUnit { op, arg: lo_arg }]),
                     );
-                    // Emit CACHE code units after the instruction
-                    instructions.extend(core::iter::repeat_n(
-                        CodeUnit::new(Instruction::Cache, 0.into()),
-                        cache_count,
-                    ));
+                    // Emit CACHE code units after the instruction (all zeroed)
+                    if cache_count > 0 {
+                        instructions.extend(core::iter::repeat_n(
+                            CodeUnit::new(Instruction::Cache, 0.into()),
+                            cache_count,
+                        ));
+                    }
                     current_offset = offset_after;
                 }
                 next_block = block.next;
diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index cece1fb77fa..a2c764148ac 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -343,6 +343,11 @@ pub struct CodeUnit {
 
 const _: () = assert!(mem::size_of::<CodeUnit>() == 2);
 
+/// Adaptive specialization: number of executions before attempting specialization.
+pub const ADAPTIVE_WARMUP_VALUE: u16 = 50;
+/// Adaptive specialization: backoff counter after de-optimization.
+pub const ADAPTIVE_BACKOFF_VALUE: u16 = 250;
+
 impl CodeUnit {
     pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
         Self { op, arg }
@@ -441,6 +446,63 @@ impl CodeUnits {
             core::ptr::write(op_ptr, new_op.into());
         }
     }
+
+    /// Write a u16 value into a CACHE code unit at `index`.
+    /// Each CodeUnit is 2 bytes (#[repr(C)]: op u8 + arg u8), so one u16 fits exactly.
+    ///
+    /// # Safety
+    /// - `index` must be in bounds and point to a CACHE entry.
+    /// - The caller must ensure no concurrent reads/writes to the same slot.
+    pub unsafe fn write_cache_u16(&self, index: usize, value: u16) {
+        unsafe {
+            let units = &mut *self.0.get();
+            let ptr = units.as_mut_ptr().add(index) as *mut u8;
+            core::ptr::write_unaligned(ptr as *mut u16, value);
+        }
+    }
+
+    /// Read a u16 value from a CACHE code unit at `index` (`index` is not bounds-checked).
+    pub fn read_cache_u16(&self, index: usize) -> u16 {
+        let units = unsafe { &*self.0.get() };
+        let ptr = units.as_ptr().wrapping_add(index) as *const u8;
+        unsafe { core::ptr::read_unaligned(ptr as *const u16) }
+    }
+
+    /// Write a u32 value across two consecutive CACHE code units starting at `index`.
+    ///
+    /// # Safety
+    /// Same requirements as `write_cache_u16`.
+    pub unsafe fn write_cache_u32(&self, index: usize, value: u32) {
+        unsafe {
+            self.write_cache_u16(index, value as u16);
+            self.write_cache_u16(index + 1, (value >> 16) as u16);
+        }
+    }
+
+    /// Read a u32 value from two consecutive CACHE code units starting at `index`.
+    pub fn read_cache_u32(&self, index: usize) -> u32 {
+        let lo = self.read_cache_u16(index) as u32;
+        let hi = self.read_cache_u16(index + 1) as u32;
+        lo | (hi << 16)
+    }
+
+    /// Write a u64 value across four consecutive CACHE code units starting at `index`.
+    ///
+    /// # Safety
+    /// Same requirements as `write_cache_u16`.
+    pub unsafe fn write_cache_u64(&self, index: usize, value: u64) {
+        unsafe {
+            self.write_cache_u32(index, value as u32);
+            self.write_cache_u32(index + 2, (value >> 32) as u32);
+        }
+    }
+
+    /// Read a u64 value from four consecutive CACHE code units starting at `index`.
+    pub fn read_cache_u64(&self, index: usize) -> u64 {
+        let lo = self.read_cache_u32(index) as u64;
+        let hi = self.read_cache_u32(index + 2) as u64;
+        lo | (hi << 32)
+    }
 }
 
 /// A Constant (which usually encapsulates data within it)
diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs
index 86865e9e083..da2510f4f9a 100644
--- a/crates/vm/src/builtins/type.rs
+++ b/crates/vm/src/builtins/type.rs
@@ -28,7 +28,14 @@ use crate::{
         Representable, SLOT_DEFS, SetAttr, TypeDataRef, TypeDataRefMut, TypeDataSlot,
     },
 };
-use core::{any::Any, borrow::Borrow, ops::Deref, pin::Pin, ptr::NonNull};
+use core::{
+    any::Any,
+    borrow::Borrow,
+    ops::Deref,
+    pin::Pin,
+    ptr::NonNull,
+    sync::atomic::{AtomicU32, Ordering},
+};
 use indexmap::{IndexMap, map::Entry};
 use itertools::Itertools;
 use num_traits::ToPrimitive;
@@ -44,8 +51,12 @@ pub struct PyType {
     pub attributes: PyRwLock<PyAttributes>,
     pub slots: PyTypeSlots,
     pub heaptype_ext: Option<Pin<Box<HeapTypeExt>>>,
+    /// Type version tag for inline caching. 0 means unassigned/invalidated.
+    pub tp_version_tag: AtomicU32,
 }
 
+static NEXT_TYPE_VERSION: AtomicU32 = AtomicU32::new(1);
+
 unsafe impl crate::object::Traverse for PyType {
     fn traverse(&self, tracer_fn: &mut crate::object::TraverseFn<'_>) {
         self.base.traverse(tracer_fn);
@@ -188,6 +199,27 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py<PyType>, b: &Py<PyType>) -> b
 }
 
 impl PyType {
+    /// Assign a fresh version tag. Returns 0 without storing a tag if the counter wrapped to 0.
+    pub fn assign_version_tag(&self) -> u32 {
+        let v = NEXT_TYPE_VERSION.fetch_add(1, Ordering::Relaxed);
+        if v == 0 {
+            return 0;
+        }
+        self.tp_version_tag.store(v, Ordering::Release);
+        v
+    }
+
+    /// Invalidate this type's version tag and cascade to all subclasses.
+    pub fn modified(&self) {
+        self.tp_version_tag.store(0, Ordering::Release);
+        let subclasses = self.subclasses.read();
+        for weak_ref in subclasses.iter() {
+            if let Some(sub) = weak_ref.upgrade() {
+                sub.downcast_ref::<PyType>().unwrap().modified();
+            }
+        }
+    }
+
     pub fn new_simple_heap(
         name: &str,
         base: &Py<PyType>,
@@ -365,6 +397,7 @@ impl PyType {
                 attributes: PyRwLock::new(attrs),
                 slots,
                 heaptype_ext: Some(Pin::new(Box::new(heaptype_ext))),
+                tp_version_tag: AtomicU32::new(0),
             },
             metaclass,
             None,
@@ -418,6 +451,7 @@ impl PyType {
                 attributes: PyRwLock::new(attrs),
                 slots,
                 heaptype_ext: None,
+                tp_version_tag: AtomicU32::new(0),
             },
             metaclass,
             None,
@@ -799,6 +833,9 @@ impl PyType {
         }
         update_mro_recursively(zelf, vm)?;
 
+        // Invalidate inline caches
+        zelf.modified();
+
         // TODO: do any old slots need to be cleaned up first?
         zelf.init_slots(&vm.ctx);
 
@@ -1903,6 +1940,9 @@ impl SetAttr for PyType {
                 )));
             }
         }
+        // Invalidate inline caches that depend on this type's attributes
+        zelf.modified();
+
         if attr_name.as_wtf8().starts_with("__") && attr_name.as_wtf8().ends_with("__") {
             if assign {
                 zelf.update_slot::<true>(attr_name, &vm.ctx);
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index 663885c579d..c872b1a7d12 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -4,9 +4,9 @@ use crate::{
     AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, PyStackRef,
     TryFromObject, VirtualMachine,
     builtins::{
-        PyBaseException, PyBaseExceptionRef, PyCode, PyCoroutine, PyDict, PyDictRef, PyGenerator,
-        PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback,
-        PyType, PyUtf8Str,
+        PyBaseException, PyBaseExceptionRef, PyBaseObject, PyCode, PyCoroutine, PyDict, PyDictRef,
+        PyGenerator, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate,
+        PyTraceback, PyType, PyUtf8Str,
         asyncgenerator::PyAsyncGenWrappedValue,
         float::PyFloat,
         frame::stack_analysis,
@@ -15,7 +15,9 @@ use crate::{
         range::PyRangeIterator,
         tuple::{PyTuple, PyTupleRef},
     },
-    bytecode::{self, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod},
+    bytecode::{
+        self, ADAPTIVE_BACKOFF_VALUE, Arg, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod,
+    },
     convert::{IntoObject, ToPyResult},
     coroutine::Coro,
     exceptions::ExceptionCtor,
@@ -34,7 +36,7 @@ use core::cell::UnsafeCell;
 use core::iter::zip;
 use core::sync::atomic;
 use core::sync::atomic::AtomicPtr;
-use core::sync::atomic::Ordering::Relaxed;
+use core::sync::atomic::Ordering::{Acquire, Relaxed};
 use indexmap::IndexMap;
 use itertools::Itertools;
 use malachite_bigint::BigInt;
@@ -2644,6 +2646,106 @@ impl ExecutingFrame<'_> {
                 self.push_value(vm.ctx.new_bool(!value).into());
                 Ok(None)
             }
+            // Specialized LOAD_ATTR opcodes
+            Instruction::LoadAttrMethodNoDict => {
+                let oparg = LoadAttr::new(u32::from(arg));
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+
+                let owner = self.top_value();
+                let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
+
+                if owner.class().tp_version_tag.load(Acquire) == type_version {
+                    // Cache hit: load the cached method descriptor
+                    let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5);
+                    let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned();
+                    let owner = self.pop_value();
+                    self.push_value(func);
+                    self.push_value(owner);
+                    Ok(None)
+                } else {
+                    // De-optimize
+                    unsafe {
+                        self.code
+                            .instructions
+                            .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
+                        self.code
+                            .instructions
+                            .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    }
+                    self.load_attr_slow(vm, oparg)
+                }
+            }
+            Instruction::LoadAttrMethodWithValues => {
+                let oparg = LoadAttr::new(u32::from(arg));
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let attr_name = self.code.names[oparg.name_idx() as usize];
+
+                let owner = self.top_value();
+                let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
+
+                if owner.class().tp_version_tag.load(Acquire) == type_version {
+                    // Check instance dict doesn't shadow the method
+                    let shadowed = if let Some(dict) = owner.dict() {
+                        dict.get_item_opt(attr_name, vm).ok().flatten().is_some()
+                    } else {
+                        false
+                    };
+
+                    if !shadowed {
+                        // Cache hit: load the cached method descriptor
+                        let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5);
+                        let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned();
+                        let owner = self.pop_value();
+                        self.push_value(func);
+                        self.push_value(owner);
+                        return Ok(None);
+                    }
+                }
+                // De-optimize
+                unsafe {
+                    self.code
+                        .instructions
+                        .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
+                    self.code
+                        .instructions
+                        .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                }
+                self.load_attr_slow(vm, oparg)
+            }
+            Instruction::LoadAttrInstanceValue => {
+                let oparg = LoadAttr::new(u32::from(arg));
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let attr_name = self.code.names[oparg.name_idx() as usize];
+
+                let owner = self.top_value();
+                let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
+
+                if owner.class().tp_version_tag.load(Acquire) == type_version {
+                    // Type version matches — no data descriptor for this attr.
+                    // Try direct dict lookup, skipping full descriptor protocol.
+                    if let Some(dict) = owner.dict()
+                        && let Some(value) = dict.get_item_opt(attr_name, vm)?
+                    {
+                        self.pop_value();
+                        self.push_value(value);
+                        return Ok(None);
+                    }
+                    // Not in instance dict — fall through to class lookup via slow path
+                }
+                // De-optimize
+                unsafe {
+                    self.code
+                        .instructions
+                        .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
+                    self.code
+                        .instructions
+                        .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                }
+                self.load_attr_slow(vm, oparg)
+            }
             // All INSTRUMENTED_* opcodes delegate to a cold function to keep
             // the hot instruction loop free of monitoring overhead.
             _ => self.execute_instrumented(instruction, arg, vm),
@@ -4111,6 +4213,132 @@ impl ExecutingFrame<'_> {
     }
 
     fn load_attr(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult {
+        let instr_idx = self.lasti() as usize - 1;
+        let cache_base = instr_idx + 1;
+
+        // Decrement adaptive counter
+        let counter = self.code.instructions.read_cache_u16(cache_base);
+        if counter > 0 {
+            unsafe {
+                self.code
+                    .instructions
+                    .write_cache_u16(cache_base, counter - 1);
+            }
+        } else {
+            // Counter reached 0: attempt specialization for future calls
+            self.specialize_load_attr(vm, oparg, instr_idx, cache_base);
+        }
+
+        // Execute slow path for this call
+        self.load_attr_slow(vm, oparg)
+    }
+
+    fn specialize_load_attr(
+        &mut self,
+        _vm: &VirtualMachine,
+        oparg: LoadAttr,
+        instr_idx: usize,
+        cache_base: usize,
+    ) {
+        let obj = self.top_value();
+        let cls = obj.class();
+
+        // Only specialize if getattro is the default (PyBaseObject::getattro)
+        let is_default_getattro = cls
+            .slots
+            .getattro
+            .load()
+            .is_some_and(|f| f as usize == PyBaseObject::getattro as *const () as usize);
+        if !is_default_getattro {
+            unsafe {
+                self.code
+                    .instructions
+                    .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+            }
+            return;
+        }
+
+        // Get or assign type version
+        let mut type_version = cls.tp_version_tag.load(Acquire);
+        if type_version == 0 {
+            type_version = cls.assign_version_tag();
+        }
+        if type_version == 0 {
+            // Version counter overflow
+            return;
+        }
+
+        let attr_name = self.code.names[oparg.name_idx() as usize];
+
+        // Look up attr in class via MRO
+        let cls_attr = cls.get_attr(attr_name);
+        let has_dict = obj.dict().is_some();
+
+        if oparg.is_method() {
+            // Method specialization
+            if let Some(ref descr) = cls_attr
+                && descr
+                    .class()
+                    .slots
+                    .flags
+                    .has_feature(PyTypeFlags::METHOD_DESCRIPTOR)
+            {
+                let descr_ptr = &**descr as *const PyObject as u64;
+                unsafe {
+                    self.code
+                        .instructions
+                        .write_cache_u32(cache_base + 1, type_version);
+                    self.code
+                        .instructions
+                        .write_cache_u64(cache_base + 5, descr_ptr);
+                }
+
+                let new_op = if !has_dict {
+                    Instruction::LoadAttrMethodNoDict
+                } else {
+                    Instruction::LoadAttrMethodWithValues
+                };
+                unsafe {
+                    self.code.instructions.replace_op(instr_idx, new_op);
+                }
+                return;
+            }
+            // Can't specialize this method call
+            unsafe {
+                self.code
+                    .instructions
+                    .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+            }
+        } else {
+            // Regular attribute access
+            let has_data_descr = cls_attr.as_ref().is_some_and(|descr| {
+                let descr_cls = descr.class();
+                descr_cls.slots.descr_get.load().is_some()
+                    && descr_cls.slots.descr_set.load().is_some()
+            });
+
+            if !has_data_descr && has_dict {
+                // Instance attribute access — skip class descriptor check
+                unsafe {
+                    self.code
+                        .instructions
+                        .write_cache_u32(cache_base + 1, type_version);
+                    self.code
+                        .instructions
+                        .replace_op(instr_idx, Instruction::LoadAttrInstanceValue);
+                }
+            } else {
+                // Data descriptor or no dict — can't easily specialize
+                unsafe {
+                    self.code
+                        .instructions
+                        .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                }
+            }
+        }
+    }
+
+    fn load_attr_slow(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult {
         let attr_name = self.code.names[oparg.name_idx() as usize];
         let parent = self.pop_value();
 
diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs
index b48045f2163..41ddfa26b2e 100644
--- a/crates/vm/src/object/core.rs
+++ b/crates/vm/src/object/core.rs
@@ -1927,6 +1927,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) {
             attributes: PyRwLock::new(Default::default()),
             slots: PyType::make_slots(),
             heaptype_ext: None,
+            tp_version_tag: core::sync::atomic::AtomicU32::new(0),
         };
         let object_payload = PyType {
             base: None,
@@ -1936,6 +1937,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) {
             attributes: PyRwLock::new(Default::default()),
             slots: object::PyBaseObject::make_slots(),
             heaptype_ext: None,
+            tp_version_tag: core::sync::atomic::AtomicU32::new(0),
         };
         let type_type_ptr = Box::into_raw(Box::new(partially_init!(
             PyInner::<PyType> {
@@ -1997,6 +1999,7 @@ pub(crate) fn init_type_hierarchy() -> (PyTypeRef, PyTypeRef, PyTypeRef) {
         attributes: PyRwLock::default(),
         slots: PyWeak::make_slots(),
         heaptype_ext: None,
+        tp_version_tag: core::sync::atomic::AtomicU32::new(0),
     };
     let weakref_type = PyRef::new_ref(weakref_type, type_type.clone(), None);
     // Static type: untrack from GC (was tracked by new_ref because PyType has HAS_TRAVERSE)

From ff073c865dad8370afbe05685d999e282d81d7dd Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Sun, 1 Mar 2026 19:16:54 +0900
Subject: [PATCH 2/6] Add BINARY_OP and CALL adaptive specialization

BINARY_OP: Specialize int add/subtract/multiply and float
add/subtract/multiply with type guards and deoptimization.

CALL: Add func_version to PyFunction, specialize simple
function calls (CallPyExactArgs, CallBoundMethodExactArgs)
with invoke_exact_args fast path that skips FuncArgs
allocation and fill_locals_from_args.
---
 .cspell.json                         |   2 +
 crates/codegen/src/ir.rs             |   6 +-
 crates/compiler-core/src/bytecode.rs |  52 +++-
 crates/vm/src/builtins/function.rs   |  78 +++++-
 crates/vm/src/frame.rs               | 371 +++++++++++++++++++++++++--
 5 files changed, 482 insertions(+), 27 deletions(-)

diff --git a/.cspell.json b/.cspell.json
index bbc13e6fded..0d41568618a 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -60,6 +60,7 @@
     "dedentations",
     "dedents",
     "deduped",
+    "deoptimize",
     "downcastable",
     "downcasted",
     "dumpable",
@@ -73,6 +74,7 @@
     "interps",
     "jitted",
     "jitting",
+    "kwonly",
     "lossily",
     "makeunicodedata",
     "microbenchmark",
diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs
index b21af84a51c..a129b84956f 100644
--- a/crates/codegen/src/ir.rs
+++ b/crates/codegen/src/ir.rs
@@ -499,7 +499,11 @@ impl CodeInfo {
             qualname: qualname.unwrap_or(obj_name),
 
             max_stackdepth,
-            instructions: CodeUnits::from(instructions),
+            instructions: {
+                let units = CodeUnits::from(instructions);
+                units.init_adaptive_counters();
+                units
+            },
             locations: locations.into_boxed_slice(),
             constants: constants.into_iter().collect(),
             names: name_cache.into_iter().collect(),
diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index a2c764148ac..c34617c686d 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -344,9 +344,9 @@ pub struct CodeUnit {
 const _: () = assert!(mem::size_of::<CodeUnit>() == 2);
 
 /// Adaptive specialization: number of executions before attempting specialization.
-pub const ADAPTIVE_WARMUP_VALUE: u16 = 50;
+pub const ADAPTIVE_WARMUP_VALUE: u8 = 50;
 /// Adaptive specialization: backoff counter after de-optimization.
-pub const ADAPTIVE_BACKOFF_VALUE: u16 = 250;
+pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250;
 
 impl CodeUnit {
     pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
@@ -396,7 +396,12 @@ impl TryFrom<&[u8]> for CodeUnits {
             return Err(Self::Error::InvalidBytecode);
         }
 
-        value.chunks_exact(2).map(CodeUnit::try_from).collect()
+        let units: Self = value
+            .chunks_exact(2)
+            .map(CodeUnit::try_from)
+            .collect::<Result<_, _>>()?;
+        units.init_adaptive_counters();
+        Ok(units)
     }
 }
 
@@ -503,6 +508,47 @@ impl CodeUnits {
         let hi = self.read_cache_u32(index + 2) as u64;
         lo | (hi << 32)
     }
+
+    /// Read the adaptive counter from the first CACHE entry's `arg` byte.
+    /// Only the `arg` byte is read; `read_cache_u16` would also read the `op` byte.
+    pub fn read_adaptive_counter(&self, index: usize) -> u8 {
+        let units = unsafe { &*self.0.get() };
+        u8::from(units[index].arg)
+    }
+
+    /// Write the adaptive counter to the first CACHE entry's `arg` byte.
+    /// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`.
+    ///
+    /// # Safety
+    /// - `index` must be in bounds and point to a CACHE entry.
+    pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) {
+        let units = unsafe { &mut *self.0.get() };
+        units[index].arg = OpArgByte::from(value);
+    }
+
+    /// Initialize adaptive warmup counters for all instructions that have caches.
+    /// The counter is stored in the `arg` byte of the first CACHE entry,
+    /// preserving `op = Instruction::Cache`.
+    pub fn init_adaptive_counters(&self) {
+        let units = unsafe { &*self.0.get() };
+        let len = units.len();
+        let mut i = 0;
+        while i < len {
+            let op = units[i].op;
+            let caches = op.cache_entries();
+            if caches > 0 {
+                let cache_base = i + 1;
+                if cache_base < len {
+                    unsafe {
+                        self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE);
+                    }
+                }
+                i += 1 + caches;
+            } else {
+                i += 1;
+            }
+        }
+    }
 }
 
 /// A Constant (which usually encapsulates data within it)
diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs
index 58f818cc7a7..84ef7933027 100644
--- a/crates/vm/src/builtins/function.rs
+++ b/crates/vm/src/builtins/function.rs
@@ -22,6 +22,7 @@ use crate::{
         Callable, Comparable, Constructor, GetAttr, GetDescriptor, PyComparisonOp, Representable,
     },
 };
+use core::sync::atomic::{AtomicU32, Ordering::Relaxed};
 use itertools::Itertools;
 #[cfg(feature = "jit")]
 use rustpython_jit::CompiledCode;
@@ -72,10 +73,13 @@ pub struct PyFunction {
     annotate: PyMutex<Option<PyObjectRef>>,
     module: PyMutex<PyObjectRef>,
     doc: PyMutex<PyObjectRef>,
+    func_version: AtomicU32,
     #[cfg(feature = "jit")]
     jitted_code: OnceCell<CompiledCode>,
 }
 
+static FUNC_VERSION_COUNTER: AtomicU32 = AtomicU32::new(1);
+
 unsafe impl Traverse for PyFunction {
     fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) {
         self.globals.traverse(tracer_fn);
@@ -200,6 +204,7 @@ impl PyFunction {
             annotate: PyMutex::new(None),
             module: PyMutex::new(module),
             doc: PyMutex::new(doc),
+            func_version: AtomicU32::new(FUNC_VERSION_COUNTER.fetch_add(1, Relaxed)),
             #[cfg(feature = "jit")]
             jitted_code: OnceCell::new(),
         };
@@ -593,6 +598,66 @@ impl Py<PyFunction> {
     pub fn invoke(&self, func_args: FuncArgs, vm: &VirtualMachine) -> PyResult {
         self.invoke_with_locals(func_args, None, vm)
     }
+
+    /// Returns the function version, or 0 if invalidated.
+    #[inline]
+    pub fn func_version(&self) -> u32 {
+        self.func_version.load(Relaxed)
+    }
+
+    /// Check if this function is eligible for exact-args call specialization.
+    /// Returns true if: no VARARGS, no VARKEYWORDS, no kwonly args, not generator/coroutine,
+    /// and effective_nargs matches co_argcount.
+    pub(crate) fn can_specialize_call(&self, effective_nargs: u32) -> bool {
+        let code = self.code.lock();
+        let flags = code.flags;
+        !flags.intersects(
+            bytecode::CodeFlags::VARARGS
+                | bytecode::CodeFlags::VARKEYWORDS
+                | bytecode::CodeFlags::GENERATOR
+                | bytecode::CodeFlags::COROUTINE,
+        ) && code.kwonlyarg_count == 0
+            && code.arg_count == effective_nargs
+    }
+
+    /// Fast path for calling a simple function with exact positional args.
+    /// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args.
+    /// Only valid when: no VARARGS, no VARKEYWORDS, no kwonlyargs, not generator/coroutine,
+    /// and nargs == co_argcount.
+    pub fn invoke_exact_args(&self, args: &[PyObjectRef], vm: &VirtualMachine) -> PyResult {
+        let code = self.code.lock().clone();
+
+        let locals = ArgMapping::from_dict_exact(vm.ctx.new_dict());
+
+        let frame = Frame::new(
+            code.clone(),
+            Scope::new(Some(locals), self.globals.clone()),
+            self.builtins.clone(),
+            self.closure.as_ref().map_or(&[], |c| c.as_slice()),
+            Some(self.to_owned().into()),
+            vm,
+        )
+        .into_ref(&vm.ctx);
+
+        // Copy args directly into fastlocals
+        {
+            let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
+            for (i, arg) in args.iter().enumerate() {
+                fastlocals[i] = Some(arg.clone());
+            }
+        }
+
+        // Handle cell2arg
+        if let Some(cell2arg) = code.cell2arg.as_deref() {
+            let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
+            for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
+                let x = fastlocals[*arg_idx as usize].take();
+                frame.set_cell_contents(cell_idx, x);
+            }
+        }
+
+        vm.run_frame(frame)
+    }
 }
 
 impl PyPayload for PyFunction {
@@ -615,12 +680,7 @@ impl PyFunction {
     #[pygetset(setter)]
     fn set___code__(&self, code: PyRef<PyCode>) {
         *self.code.lock() = code;
-        // TODO: jit support
-        // #[cfg(feature = "jit")]
-        // {
-        //     // If available, clear cached compiled code.
-        //     let _ = self.jitted_code.take();
-        // }
+        self.func_version.store(0, Relaxed);
     }
 
     #[pygetset]
@@ -629,7 +689,8 @@ impl PyFunction {
     }
     #[pygetset(setter)]
     fn set___defaults__(&self, defaults: Option<PyTupleRef>) {
-        self.defaults_and_kwdefaults.lock().0 = defaults
+        self.defaults_and_kwdefaults.lock().0 = defaults;
+        self.func_version.store(0, Relaxed);
     }
 
     #[pygetset]
@@ -638,7 +699,8 @@ impl PyFunction {
     }
     #[pygetset(setter)]
     fn set___kwdefaults__(&self, kwdefaults: Option<PyDictRef>) {
-        self.defaults_and_kwdefaults.lock().1 = kwdefaults
+        self.defaults_and_kwdefaults.lock().1 = kwdefaults;
+        self.func_version.store(0, Relaxed);
     }
 
     // {"__closure__",   T_OBJECT,     OFF(func_closure), READONLY},
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index c872b1a7d12..c96065fd9e4 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -5,8 +5,8 @@ use crate::{
     TryFromObject, VirtualMachine,
     builtins::{
         PyBaseException, PyBaseExceptionRef, PyBaseObject, PyCode, PyCoroutine, PyDict, PyDictRef,
-        PyGenerator, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate,
-        PyTraceback, PyType, PyUtf8Str,
+        PyFloat, PyGenerator, PyInt, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned,
+        PyTemplate, PyTraceback, PyType, PyUtf8Str,
         asyncgenerator::PyAsyncGenWrappedValue,
         float::PyFloat,
         frame::stack_analysis,
@@ -1126,7 +1126,24 @@ impl ExecutingFrame<'_> {
         }
 
         match instruction {
-            Instruction::BinaryOp { op } => self.execute_bin_op(vm, op.get(arg)),
+            Instruction::BinaryOp { op } => {
+                let op_val = op.get(arg);
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+
+                let counter = self.code.instructions.read_adaptive_counter(cache_base);
+                if counter > 0 {
+                    unsafe {
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, counter - 1);
+                    }
+                } else {
+                    self.specialize_binary_op(vm, op_val, instr_idx, cache_base);
+                }
+
+                self.execute_bin_op(vm, op_val)
+            }
             // TODO: In CPython, this does in-place unicode concatenation when
             // refcount is 1. Falls back to regular iadd for now.
             Instruction::BinaryOpInplaceAddUnicode => {
@@ -1241,7 +1258,20 @@ impl ExecutingFrame<'_> {
             }
             Instruction::Call { nargs } => {
                 // Stack: [callable, self_or_null, arg1, ..., argN]
-                let args = self.collect_positional_args(nargs.get(arg));
+                let nargs_val = nargs.get(arg);
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let counter = self.code.instructions.read_adaptive_counter(cache_base);
+                if counter > 0 {
+                    unsafe {
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, counter - 1);
+                    }
+                } else {
+                    self.specialize_call(vm, nargs_val, instr_idx, cache_base);
+                }
+                let args = self.collect_positional_args(nargs_val);
                 self.execute_call(args, vm)
             }
             Instruction::CallKw { nargs } => {
@@ -2655,7 +2685,7 @@ impl ExecutingFrame<'_> {
                 let owner = self.top_value();
                 let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
 
-                if owner.class().tp_version_tag.load(Acquire) == type_version {
+                if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
                     // Cache hit: load the cached method descriptor
                     let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5);
                     let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned();
@@ -2671,7 +2701,7 @@ impl ExecutingFrame<'_> {
                             .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
                         self.code
                             .instructions
-                            .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                            .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
                     }
                     self.load_attr_slow(vm, oparg)
                 }
@@ -2685,7 +2715,7 @@ impl ExecutingFrame<'_> {
                 let owner = self.top_value();
                 let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
 
-                if owner.class().tp_version_tag.load(Acquire) == type_version {
+                if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
                     // Check instance dict doesn't shadow the method
                     let shadowed = if let Some(dict) = owner.dict() {
                         dict.get_item_opt(attr_name, vm).ok().flatten().is_some()
@@ -2710,7 +2740,7 @@ impl ExecutingFrame<'_> {
                         .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
                     self.code
                         .instructions
-                        .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
                 }
                 self.load_attr_slow(vm, oparg)
             }
@@ -2723,7 +2753,7 @@ impl ExecutingFrame<'_> {
                 let owner = self.top_value();
                 let type_version = self.code.instructions.read_cache_u32(cache_base + 1);
 
-                if owner.class().tp_version_tag.load(Acquire) == type_version {
+                if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
                     // Type version matches — no data descriptor for this attr.
                     // Try direct dict lookup, skipping full descriptor protocol.
                     if let Some(dict) = owner.dict()
@@ -2742,10 +2772,186 @@ impl ExecutingFrame<'_> {
                         .replace_op(instr_idx, Instruction::LoadAttr { idx: Arg::marker() });
                     self.code
                         .instructions
-                        .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
                 }
                 self.load_attr_slow(vm, oparg)
             }
+            // Specialized BINARY_OP opcodes
+            Instruction::BinaryOpAddInt => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_int), Some(b_int)) = (
+                    a.downcast_ref_if_exact::<PyInt>(vm),
+                    b.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    let result = a_int.as_bigint() + b_int.as_bigint();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_bigint(&result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Add);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Add)
+                }
+            }
+            Instruction::BinaryOpSubtractInt => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_int), Some(b_int)) = (
+                    a.downcast_ref_if_exact::<PyInt>(vm),
+                    b.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    let result = a_int.as_bigint() - b_int.as_bigint();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_bigint(&result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract)
+                }
+            }
+            Instruction::BinaryOpMultiplyInt => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_int), Some(b_int)) = (
+                    a.downcast_ref_if_exact::<PyInt>(vm),
+                    b.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    let result = a_int.as_bigint() * b_int.as_bigint();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_bigint(&result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply)
+                }
+            }
+            Instruction::BinaryOpAddFloat => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_f), Some(b_f)) = (
+                    a.downcast_ref_if_exact::<PyFloat>(vm),
+                    b.downcast_ref_if_exact::<PyFloat>(vm),
+                ) {
+                    let result = a_f.to_f64() + b_f.to_f64();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_float(result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Add);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Add)
+                }
+            }
+            Instruction::BinaryOpSubtractFloat => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_f), Some(b_f)) = (
+                    a.downcast_ref_if_exact::<PyFloat>(vm),
+                    b.downcast_ref_if_exact::<PyFloat>(vm),
+                ) {
+                    let result = a_f.to_f64() - b_f.to_f64();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_float(result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Subtract);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Subtract)
+                }
+            }
+            Instruction::BinaryOpMultiplyFloat => {
+                let b = self.top_value();
+                let a = self.nth_value(1);
+                if let (Some(a_f), Some(b_f)) = (
+                    a.downcast_ref_if_exact::<PyFloat>(vm),
+                    b.downcast_ref_if_exact::<PyFloat>(vm),
+                ) {
+                    let result = a_f.to_f64() * b_f.to_f64();
+                    self.pop_value();
+                    self.pop_value();
+                    self.push_value(vm.ctx.new_float(result).into());
+                    Ok(None)
+                } else {
+                    self.deoptimize_binary_op(bytecode::BinaryOperator::Multiply);
+                    self.execute_bin_op(vm, bytecode::BinaryOperator::Multiply)
+                }
+            }
+            Instruction::CallPyExactArgs => {
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let cached_version = self.code.instructions.read_cache_u32(cache_base + 1);
+                let nargs: u32 = arg.into();
+                // Stack: [callable, self_or_null, arg1, ..., argN]
+                // This form passes no `self`, so self_or_null must still be NULL.
+                let callable = self.nth_value(nargs + 1);
+                if let Some(func) = callable.downcast_ref::<PyFunction>()
+                    && func.func_version() == cached_version
+                    && cached_version != 0
+                    && self.state.stack[self.state.stack.len() - nargs as usize - 1].is_none()
+                {
+                    let args: Vec<PyObjectRef> = self.pop_multiple(nargs as usize).collect();
+                    let _null = self.pop_value_opt(); // self_or_null (NULL, checked above)
+                    let callable = self.pop_value();
+                    let func = callable.downcast_ref::<PyFunction>().unwrap();
+                    let result = func.invoke_exact_args(&args, vm)?;
+                    self.push_value(result);
+                    Ok(None)
+                } else {
+                    // Guard miss: restore the generic CALL and restart the backoff counter.
+                    let generic = Instruction::Call {
+                        nargs: Arg::marker(),
+                    };
+                    unsafe {
+                        self.code.instructions.replace_op(instr_idx, generic);
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    }
+                    let args = self.collect_positional_args(nargs);
+                    self.execute_call(args, vm)
+                }
+            }
+            Instruction::CallBoundMethodExactArgs => {
+                let instr_idx = self.lasti() as usize - 1;
+                let cache_base = instr_idx + 1;
+                let cached_version = self.code.instructions.read_cache_u32(cache_base + 1);
+                let nargs: u32 = arg.into();
+                // Stack: [callable, self_val, arg1, ..., argN]
+                // This form prepends `self`, so the self_or_null slot must be occupied.
+                let callable = self.nth_value(nargs + 1);
+                if let Some(func) = callable.downcast_ref::<PyFunction>()
+                    && func.func_version() == cached_version
+                    && cached_version != 0
+                    && self.state.stack[self.state.stack.len() - nargs as usize - 1].is_some()
+                {
+                    let pos_args: Vec<PyObjectRef> = self.pop_multiple(nargs as usize).collect();
+                    let self_val = self.pop_value();
+                    let callable = self.pop_value();
+                    let func = callable.downcast_ref::<PyFunction>().unwrap();
+                    let mut all_args = Vec::with_capacity(pos_args.len() + 1);
+                    all_args.push(self_val);
+                    all_args.extend(pos_args);
+                    let result = func.invoke_exact_args(&all_args, vm)?;
+                    self.push_value(result);
+                    Ok(None)
+                } else {
+                    // Guard miss: restore the generic CALL and restart the backoff counter.
+                    let generic = Instruction::Call {
+                        nargs: Arg::marker(),
+                    };
+                    unsafe {
+                        self.code.instructions.replace_op(instr_idx, generic);
+                        self.code
+                            .instructions
+                            .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    }
+                    let args = self.collect_positional_args(nargs);
+                    self.execute_call(args, vm)
+                }
+            }
             // All INSTRUMENTED_* opcodes delegate to a cold function to keep
             // the hot instruction loop free of monitoring overhead.
             _ => self.execute_instrumented(instruction, arg, vm),
@@ -4217,12 +4423,12 @@ impl ExecutingFrame<'_> {
         let cache_base = instr_idx + 1;
 
         // Decrement adaptive counter
-        let counter = self.code.instructions.read_cache_u16(cache_base);
+        let counter = self.code.instructions.read_adaptive_counter(cache_base);
         if counter > 0 {
             unsafe {
                 self.code
                     .instructions
-                    .write_cache_u16(cache_base, counter - 1);
+                    .write_adaptive_counter(cache_base, counter - 1);
             }
         } else {
             // Counter reached 0: attempt specialization for future calls
@@ -4253,7 +4459,7 @@ impl ExecutingFrame<'_> {
             unsafe {
                 self.code
                     .instructions
-                    .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
             }
             return;
         }
@@ -4307,7 +4513,7 @@ impl ExecutingFrame<'_> {
             unsafe {
                 self.code
                     .instructions
-                    .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                    .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
             }
         } else {
             // Regular attribute access
@@ -4332,7 +4538,7 @@ impl ExecutingFrame<'_> {
                 unsafe {
                     self.code
                         .instructions
-                        .write_cache_u16(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
                 }
             }
         }
@@ -4363,6 +4569,141 @@ impl ExecutingFrame<'_> {
         Ok(None)
     }
 
+    /// Attempt to replace the generic `BinaryOp` at `instr_idx` with a specialized
+    /// opcode, based on the exact types of the two operands on top of the stack.
+    ///
+    /// `Add`, `Subtract` and `Multiply` are specialized when both operands are exact
+    /// `PyInt` or both are exact `PyFloat`; every other operator or operand
+    /// combination counts as a miss.
+    ///
+    /// On success only the opcode is rewritten. On a miss the opcode is kept and
+    /// `ADAPTIVE_BACKOFF_VALUE` is stored in the adaptive counter at `cache_base`.
+    fn specialize_binary_op(
+        &mut self,
+        vm: &VirtualMachine,
+        op: bytecode::BinaryOperator,
+        instr_idx: usize,
+        cache_base: usize,
+    ) {
+        let rhs = self.top_value();
+        let lhs = self.nth_value(1);
+
+        // Operand-type predicates; closures so they only run for supported operators.
+        let both_exact_int = || {
+            lhs.downcast_ref_if_exact::<PyInt>(vm).is_some()
+                && rhs.downcast_ref_if_exact::<PyInt>(vm).is_some()
+        };
+        let both_exact_float = || {
+            lhs.downcast_ref_if_exact::<PyFloat>(vm).is_some()
+                && rhs.downcast_ref_if_exact::<PyFloat>(vm).is_some()
+        };
+
+        // (int variant, float variant) for each operator that has specializations.
+        let candidates = match op {
+            bytecode::BinaryOperator::Add => {
+                Some((Instruction::BinaryOpAddInt, Instruction::BinaryOpAddFloat))
+            }
+            bytecode::BinaryOperator::Subtract => {
+                Some((Instruction::BinaryOpSubtractInt, Instruction::BinaryOpSubtractFloat))
+            }
+            bytecode::BinaryOperator::Multiply => {
+                Some((Instruction::BinaryOpMultiplyInt, Instruction::BinaryOpMultiplyFloat))
+            }
+            _ => None,
+        };
+
+        // The int check runs first; the float check only runs when it fails.
+        let specialized = candidates.and_then(|(int_op, float_op)| {
+            if both_exact_int() {
+                Some(int_op)
+            } else if both_exact_float() {
+                Some(float_op)
+            } else {
+                None
+            }
+        });
+
+        match specialized {
+            Some(new_op) => unsafe {
+                self.code.instructions.replace_op(instr_idx, new_op);
+            },
+            None => unsafe {
+                self.code
+                    .instructions
+                    .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+            },
+        }
+    }
+
+    /// Rewrite the instruction at `lasti() - 1` back to the generic `BinaryOp` and
+    /// store `ADAPTIVE_BACKOFF_VALUE` in its adaptive counter. `_op` is not used.
+    fn deoptimize_binary_op(&mut self, _op: bytecode::BinaryOperator) {
+        let instr_idx = self.lasti() as usize - 1;
+        let generic = Instruction::BinaryOp { op: Arg::marker() };
+        let code_units = &self.code.instructions;
+        // The adaptive counter is the cache entry directly after the instruction.
+        unsafe {
+            code_units.replace_op(instr_idx, generic);
+            code_units.write_adaptive_counter(instr_idx + 1, ADAPTIVE_BACKOFF_VALUE);
+        }
+    }
+
+    /// Try to specialize the `Call` at `instr_idx` for a Python function callee.
+    ///
+    /// Expects the stack to be `[callable, self_or_null, arg1, ..., argN]`.
+    ///
+    /// The opcode is rewritten only when the callable is a `PyFunction` whose
+    /// `func_version()` is non-zero and whose `can_specialize_call` accepts the
+    /// effective argument count (`nargs`, plus one when `self_or_null` is present):
+    /// - `self_or_null` present: `CallBoundMethodExactArgs`
+    /// - `self_or_null` absent: `CallPyExactArgs`
+    ///
+    /// The function version is then stored at `cache_base + 1`. In every other case
+    /// the opcode is kept and `ADAPTIVE_BACKOFF_VALUE` is written to the counter.
+    fn specialize_call(
+        &mut self,
+        _vm: &VirtualMachine,
+        nargs: u32,
+        instr_idx: usize,
+        cache_base: usize,
+    ) {
+        // self_or_null sits `nargs` slots below the top; the callable one slot deeper.
+        let slots = &self.state.stack;
+        let has_self = slots[slots.len() - nargs as usize - 1].is_some();
+        let callee = self.nth_value(nargs + 1);
+
+        // (specialized opcode, version to cache), or None when the site is not eligible.
+        let plan = callee.downcast_ref::<PyFunction>().and_then(|func| {
+            let version = func.func_version();
+            if version == 0 {
+                return None;
+            }
+            // A present `self` counts as one extra positional argument.
+            let (new_op, effective_nargs) = if has_self {
+                (Instruction::CallBoundMethodExactArgs, nargs + 1)
+            } else {
+                (Instruction::CallPyExactArgs, nargs)
+            };
+            func.can_specialize_call(effective_nargs)
+                .then_some((new_op, version))
+        });
+
+        match plan {
+            Some((new_op, version)) => unsafe {
+                self.code.instructions.replace_op(instr_idx, new_op);
+                // Store func_version in cache (after counter)
+                self.code
+                    .instructions
+                    .write_cache_u32(cache_base + 1, version);
+            },
+            None => unsafe {
+                self.code
+                    .instructions
+                    .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+            },
+        }
+    }
+
     fn load_super_attr(&mut self, vm: &VirtualMachine, oparg: LoadSuperAttr) -> FrameResult {
         let attr_name = self.code.names[oparg.name_idx() as usize];
 

From 0fa6fa7dfd283e8af2946d36d5bde477c4f20e4f Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Sun, 1 Mar 2026 21:50:27 +0900
Subject: [PATCH 3/6] Lazy quickening for adaptive specialization counters

Move counter initialization from compile-time to RESUME execution,
matching CPython's _PyCode_Quicken pattern. Store counter in CACHE
entry's arg byte to preserve op=Instruction::Cache for dis/JIT.
Add PyCode.quickened flag for one-time initialization.
---
 crates/codegen/src/ir.rs             |  6 +-----
 crates/compiler-core/src/bytecode.rs |  9 ++++-----
 crates/vm/src/builtins/code.rs       |  3 +++
 crates/vm/src/frame.rs               | 11 ++++++++---
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/crates/codegen/src/ir.rs b/crates/codegen/src/ir.rs
index a129b84956f..b21af84a51c 100644
--- a/crates/codegen/src/ir.rs
+++ b/crates/codegen/src/ir.rs
@@ -499,11 +499,7 @@ impl CodeInfo {
             qualname: qualname.unwrap_or(obj_name),
 
             max_stackdepth,
-            instructions: {
-                let units = CodeUnits::from(instructions);
-                units.init_adaptive_counters();
-                units
-            },
+            instructions: CodeUnits::from(instructions),
             locations: locations.into_boxed_slice(),
             constants: constants.into_iter().collect(),
             names: name_cache.into_iter().collect(),
diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index c34617c686d..3b5d684435c 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -400,7 +400,6 @@ impl TryFrom<&[u8]> for CodeUnits {
             .chunks_exact(2)
             .map(CodeUnit::try_from)
             .collect::<Result<_, _>>()?;
-        units.init_adaptive_counters();
         Ok(units)
     }
 }
@@ -526,10 +525,10 @@ impl CodeUnits {
         units[index].arg = OpArgByte::from(value);
     }
 
-    /// Initialize adaptive warmup counters for all instructions that have caches.
-    /// The counter is stored in the `arg` byte of the first CACHE entry,
-    /// preserving `op = Instruction::Cache`.
-    pub fn init_adaptive_counters(&self) {
+    /// Initialize adaptive warmup counters for all cacheable instructions.
+    /// Called lazily at RESUME (first execution of a code object).
+    /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`.
+    pub fn quicken(&self) {
         let units = unsafe { &*self.0.get() };
         let len = units.len();
         let mut i = 0;
diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs
index 1708477004e..f04f6607417 100644
--- a/crates/vm/src/builtins/code.rs
+++ b/crates/vm/src/builtins/code.rs
@@ -346,6 +346,8 @@ pub struct PyCode {
     pub instrumentation_version: AtomicU64,
     /// Side-table for INSTRUMENTED_LINE / INSTRUMENTED_INSTRUCTION.
     pub monitoring_data: PyMutex<Option<CoMonitoringData>>,
+    /// Whether adaptive counters have been initialized (lazy quickening).
+    pub quickened: core::sync::atomic::AtomicBool,
 }
 
 impl Deref for PyCode {
@@ -363,6 +365,7 @@ impl PyCode {
             source_path: AtomicPtr::new(sp),
             instrumentation_version: AtomicU64::new(0),
             monitoring_data: PyMutex::new(None),
+            quickened: core::sync::atomic::AtomicBool::new(false),
         }
     }
 
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index c96065fd9e4..c84723be139 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -2314,6 +2314,14 @@ impl ExecutingFrame<'_> {
             }
             Instruction::RaiseVarargs { kind } => self.execute_raise(vm, kind.get(arg)),
             Instruction::Resume { .. } => {
+                // Lazy quickening: initialize adaptive counters on first execution
+                if !self
+                    .code
+                    .quickened
+                    .swap(true, atomic::Ordering::Relaxed)
+                {
+                    self.code.instructions.quicken();
+                }
                 // Check if bytecode needs re-instrumentation
                 let global_ver = vm
                     .state
@@ -4422,7 +4430,6 @@ impl ExecutingFrame<'_> {
         let instr_idx = self.lasti() as usize - 1;
         let cache_base = instr_idx + 1;
 
-        // Decrement adaptive counter
         let counter = self.code.instructions.read_adaptive_counter(cache_base);
         if counter > 0 {
             unsafe {
@@ -4431,11 +4438,9 @@ impl ExecutingFrame<'_> {
                     .write_adaptive_counter(cache_base, counter - 1);
             }
         } else {
-            // Counter reached 0: attempt specialization for future calls
             self.specialize_load_attr(vm, oparg, instr_idx, cache_base);
         }
 
-        // Execute slow path for this call
         self.load_attr_slow(vm, oparg)
     }
 

From 5f231618eb9cf42e471adfe1d9c4436b252486ce Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Sun, 1 Mar 2026 23:16:22 +0900
Subject: [PATCH 4/6] Add Instruction::deoptimize() and
 CodeUnits::original_bytes()

- deoptimize() maps specialized opcodes back to their base adaptive variant
- original_bytes() produces deoptimized bytecode with zeroed CACHE entries
- co_code now returns deoptimized bytes, _co_code_adaptive returns current bytes
- Marshal serialization uses original_bytes() instead of raw transmute
---
 crates/compiler-core/src/bytecode.rs          |  24 ++++
 .../compiler-core/src/bytecode/instruction.rs | 113 ++++++++++++++++++
 crates/compiler-core/src/marshal.rs           |   5 +-
 crates/vm/src/builtins/code.rs                |  13 +-
 crates/vm/src/frame.rs                        |   6 +-
 5 files changed, 146 insertions(+), 15 deletions(-)

diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index 3b5d684435c..d7bcaa2c6ef 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -525,6 +525,30 @@ impl CodeUnits {
         units[index].arg = OpArgByte::from(value);
     }
 
+    /// Produce a clean copy of the bytecode for serialization (marshal) and
+    /// `co_code`: specialized opcodes go back through `deoptimize()`, CACHE entries
+    /// are zeroed, and the output is always exactly `2 * len` bytes long.
+    pub fn original_bytes(&self) -> Vec<u8> {
+        let units = unsafe { &*self.0.get() };
+        let mut out = Vec::with_capacity(units.len() * 2);
+        let len = units.len();
+        let mut i = 0;
+        while i < len {
+            let op = units[i].op.deoptimize();
+            let caches = op.cache_entries().min(len - i - 1);
+            out.push(u8::from(op));
+            out.push(u8::from(units[i].arg));
+            // Zero-fill the CACHE entries (counter + cached data); clamped to the units left.
+            for _ in 0..caches {
+                i += 1;
+                out.push(0); // op = Cache = 0
+                out.push(0); // arg = 0
+            }
+            i += 1;
+        }
+        out
+    }
+
     /// Initialize adaptive warmup counters for all cacheable instructions.
     /// Called lazily at RESUME (first execution of a code object).
     /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`.
diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs
index c1c5e8cd847..c93feaaa9ef 100644
--- a/crates/compiler-core/src/bytecode/instruction.rs
+++ b/crates/compiler-core/src/bytecode/instruction.rs
@@ -512,6 +512,119 @@ impl Instruction {
         })
     }
 
+    /// Map a specialized opcode back to its adaptive (base) variant; every other
+    /// opcode is returned unchanged. Counterpart of CPython's `_PyOpcode_Deopt`.
+    pub fn deoptimize(self) -> Self {
+        match self {
+            // LOAD_ATTR specializations
+            Self::LoadAttrClass
+            | Self::LoadAttrClassWithMetaclassCheck
+            | Self::LoadAttrGetattributeOverridden
+            | Self::LoadAttrInstanceValue
+            | Self::LoadAttrMethodLazyDict
+            | Self::LoadAttrMethodNoDict
+            | Self::LoadAttrMethodWithValues
+            | Self::LoadAttrModule
+            | Self::LoadAttrNondescriptorNoDict
+            | Self::LoadAttrNondescriptorWithValues
+            | Self::LoadAttrProperty
+            | Self::LoadAttrSlot
+            | Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() },
+            // BINARY_OP specializations
+            Self::BinaryOpAddFloat
+            | Self::BinaryOpAddInt
+            | Self::BinaryOpAddUnicode
+            | Self::BinaryOpExtend
+            | Self::BinaryOpInplaceAddUnicode
+            | Self::BinaryOpMultiplyFloat
+            | Self::BinaryOpMultiplyInt
+            | Self::BinaryOpSubscrDict
+            | Self::BinaryOpSubscrGetitem
+            | Self::BinaryOpSubscrListInt
+            | Self::BinaryOpSubscrListSlice
+            | Self::BinaryOpSubscrStrInt
+            | Self::BinaryOpSubscrTupleInt
+            | Self::BinaryOpSubtractFloat
+            | Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() },
+            // CALL specializations
+            Self::CallAllocAndEnterInit
+            | Self::CallBoundMethodExactArgs
+            | Self::CallBoundMethodGeneral
+            | Self::CallBuiltinClass
+            | Self::CallBuiltinFast
+            | Self::CallBuiltinFastWithKeywords
+            | Self::CallBuiltinO
+            | Self::CallIsinstance
+            | Self::CallLen
+            | Self::CallListAppend
+            | Self::CallMethodDescriptorFast
+            | Self::CallMethodDescriptorFastWithKeywords
+            | Self::CallMethodDescriptorNoargs
+            | Self::CallMethodDescriptorO
+            | Self::CallNonPyGeneral
+            | Self::CallPyExactArgs
+            | Self::CallPyGeneral
+            | Self::CallStr1
+            | Self::CallTuple1
+            | Self::CallType1 => Self::Call {
+                nargs: Arg::marker(),
+            },
+            // CALL_KW specializations
+            Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw {
+                nargs: Arg::marker(),
+            },
+            // TO_BOOL specializations
+            Self::ToBoolAlwaysTrue
+            | Self::ToBoolBool
+            | Self::ToBoolInt
+            | Self::ToBoolList
+            | Self::ToBoolNone
+            | Self::ToBoolStr => Self::ToBool,
+            // COMPARE_OP specializations
+            Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => {
+                Self::CompareOp { op: Arg::marker() }
+            }
+            // CONTAINS_OP specializations
+            Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()),
+            // FOR_ITER specializations
+            Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => {
+                Self::ForIter {
+                    target: Arg::marker(),
+                }
+            }
+            // LOAD_GLOBAL specializations
+            Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()),
+            // STORE_ATTR specializations
+            Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => {
+                Self::StoreAttr { idx: Arg::marker() }
+            }
+            // LOAD_SUPER_ATTR specializations
+            Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => {
+                Self::LoadSuperAttr { arg: Arg::marker() }
+            }
+            // STORE_SUBSCR specializations
+            Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr,
+            // UNPACK_SEQUENCE specializations
+            Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => {
+                Self::UnpackSequence {
+                    size: Arg::marker(),
+                }
+            }
+            // SEND specializations
+            Self::SendGen => Self::Send {
+                target: Arg::marker(),
+            },
+            // LOAD_CONST specializations
+            Self::LoadConstImmortal | Self::LoadConstMortal => {
+                Self::LoadConst { idx: Arg::marker() }
+            }
+            // RESUME specializations
+            Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
+            // Everything else maps to itself
+            _ => self,
+        }
+    }
+
     /// Number of CACHE code units that follow this instruction.
     /// _PyOpcode_Caches
     pub fn cache_entries(self) -> usize {
diff --git a/crates/compiler-core/src/marshal.rs b/crates/compiler-core/src/marshal.rs
index 11df127920a..310bad9d868 100644
--- a/crates/compiler-core/src/marshal.rs
+++ b/crates/compiler-core/src/marshal.rs
@@ -662,9 +662,8 @@ pub fn serialize_value<W: Write, D: Dumpable>(
 
 pub fn serialize_code<W: Write, C: Constant>(buf: &mut W, code: &CodeObject<C>) {
     write_len(buf, code.instructions.len());
-    // SAFETY: it's ok to transmute CodeUnit to [u8; 2]
-    let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() };
-    buf.write_slice(instructions_bytes);
+    let original = code.instructions.original_bytes();
+    buf.write_slice(&original);
 
     write_len(buf, code.locations.len());
     for (start, end) in &*code.locations {
diff --git a/crates/vm/src/builtins/code.rs b/crates/vm/src/builtins/code.rs
index f04f6607417..126d0216546 100644
--- a/crates/vm/src/builtins/code.rs
+++ b/crates/vm/src/builtins/code.rs
@@ -684,7 +684,12 @@ impl PyCode {
 
     #[pygetset]
     pub fn co_code(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef {
-        // SAFETY: CodeUnit is #[repr(C)] with size 2, so we can safely transmute to bytes
+        vm.ctx.new_bytes(self.code.instructions.original_bytes())
+    }
+
+    #[pygetset]
+    pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef {
+        // Return current (possibly quickened/specialized) bytecode
         let bytes = unsafe {
             core::slice::from_raw_parts(
                 self.code.instructions.as_ptr() as *const u8,
@@ -694,12 +699,6 @@ impl PyCode {
         vm.ctx.new_bytes(bytes.to_vec())
     }
 
-    #[pygetset]
-    pub fn _co_code_adaptive(&self, vm: &VirtualMachine) -> crate::builtins::PyBytesRef {
-        // RustPython doesn't have adaptive/specialized bytecode, so return regular co_code
-        self.co_code(vm)
-    }
-
     #[pygetset]
     pub fn co_freevars(&self, vm: &VirtualMachine) -> PyTupleRef {
         let names = self
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index c84723be139..656be91ad4a 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -2315,11 +2315,7 @@ impl ExecutingFrame<'_> {
             Instruction::RaiseVarargs { kind } => self.execute_raise(vm, kind.get(arg)),
             Instruction::Resume { .. } => {
                 // Lazy quickening: initialize adaptive counters on first execution
-                if !self
-                    .code
-                    .quickened
-                    .swap(true, atomic::Ordering::Relaxed)
-                {
+                if !self.code.quickened.swap(true, atomic::Ordering::Relaxed) {
                     self.code.instructions.quicken();
                 }
                 // Check if bytecode needs re-instrumentation

From a831e4f6938e6809eed3dd60c153d46d8bedd507 Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Mon, 2 Mar 2026 01:21:31 +0900
Subject: [PATCH 5/6] Fix monitoring and specialization interaction

- cache_entries() returns correct count for instrumented opcodes
- deoptimize() maps instrumented opcodes back to base
- quicken() skips adaptive counter for instrumented opcodes
- instrument_code Phase 3 deoptimizes specialized opcodes and
  clears CACHE entries to prevent stale pointer dereferences
---
 .cspell.dict/cpython.txt                      |  1 +
 crates/compiler-core/src/bytecode.rs          | 12 +++++---
 .../compiler-core/src/bytecode/instruction.rs | 14 +++++++---
 crates/vm/src/stdlib/sys/monitoring.rs        | 28 +++++++++++++++----
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/.cspell.dict/cpython.txt b/.cspell.dict/cpython.txt
index 819d6875b58..7681760ea65 100644
--- a/.cspell.dict/cpython.txt
+++ b/.cspell.dict/cpython.txt
@@ -44,6 +44,7 @@ copyslot
 cpucount
 defaultdict
 denom
+deopt
 dictbytype
 DICTFLAG
 dictoffset
diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index d7bcaa2c6ef..3f50120df17 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -560,10 +560,14 @@ impl CodeUnits {
             let op = units[i].op;
             let caches = op.cache_entries();
             if caches > 0 {
-                let cache_base = i + 1;
-                if cache_base < len {
-                    unsafe {
-                        self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE);
+                // Don't write adaptive counter for instrumented opcodes;
+                // specialization is skipped while monitoring is active.
+                if !op.is_instrumented() {
+                    let cache_base = i + 1;
+                    if cache_base < len {
+                        unsafe {
+                            self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE);
+                        }
                     }
                 }
                 i += 1 + caches;
diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs
index c93feaaa9ef..a7d030b6094 100644
--- a/crates/compiler-core/src/bytecode/instruction.rs
+++ b/crates/compiler-core/src/bytecode/instruction.rs
@@ -620,8 +620,11 @@ impl Instruction {
             }
             // RESUME specializations
             Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
-            // Everything else maps to itself
-            _ => self,
+            // Instrumented opcodes map back to their base
+            _ => match self.to_base() {
+                Some(base) => base,
+                None => self,
+            },
         }
     }
 
@@ -739,8 +742,11 @@ impl Instruction {
             | Self::UnpackSequenceTuple
             | Self::UnpackSequenceTwoTuple => 1,
 
-            // Everything else: 0 cache entries
-            _ => 0,
+            // Instrumented opcodes have the same cache entries as their base
+            _ => match self.to_base() {
+                Some(base) => base.cache_entries(),
+                None => 0,
+            },
         }
     }
 }
diff --git a/crates/vm/src/stdlib/sys/monitoring.rs b/crates/vm/src/stdlib/sys/monitoring.rs
index e223e249e54..a57a23fd9b3 100644
--- a/crates/vm/src/stdlib/sys/monitoring.rs
+++ b/crates/vm/src/stdlib/sys/monitoring.rs
@@ -270,13 +270,29 @@ pub fn instrument_code(code: &PyCode, events: u32) {
         }
     }
 
-    // Phase 3: Remove regular INSTRUMENTED_* → restore base opcodes
-    for i in 0..len {
-        let op = code.code.instructions[i].op;
-        if let Some(base) = op.to_base() {
-            unsafe {
-                code.code.instructions.replace_op(i, base);
+    // Phase 3: Remove regular INSTRUMENTED_* and specialized opcodes → restore base opcodes.
+    // Also clear all CACHE entries so specialization starts fresh.
+    {
+        let mut i = 0;
+        while i < len {
+            let op = code.code.instructions[i].op;
+            let de_opt = op.deoptimize();
+            if u8::from(de_opt) != u8::from(op) {
+                unsafe {
+                    code.code.instructions.replace_op(i, de_opt);
+                }
+            }
+            let caches = de_opt.cache_entries();
+            // Zero all CACHE entries (the op+arg bytes may have been overwritten
+            // by specialization with arbitrary data like pointers).
+            for c in 1..=caches {
+                if i + c < len {
+                    unsafe {
+                        code.code.instructions.write_cache_u16(i + c, 0);
+                    }
+                }
             }
+            i += 1 + caches;
         }
     }
 

From 79ca9c11249699fc364bd586b94aeb7531351dcc Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Mon, 2 Mar 2026 11:50:30 +0900
Subject: [PATCH 6/6] Address review: bounds checks, UB fix, version overflow,
 error handling

- Add bounds checks to read_cache_u16/u32/u64
- Fix quicken() aliasing UB by using &mut directly
- Add JumpBackwardJit/JumpBackwardNoJit to deoptimize()
- Guard can_specialize_call with NEWLOCALS flag check
- Use compare_exchange_weak for version tag to prevent wraparound
- Deoptimize LoadAttrMethodWithValues to generic LOAD_ATTR when the
  instance dict lookup fails, instead of swallowing the error
- Apply adaptive backoff on version tag assignment failure
- Remove duplicate imports in frame.rs
---
 crates/compiler-core/src/bytecode.rs          | 16 ++++++++---
 .../compiler-core/src/bytecode/instruction.rs |  4 +++
 crates/vm/src/builtins/function.rs            | 14 +++++-----
 crates/vm/src/builtins/type.rs                | 17 ++++++++----
 crates/vm/src/frame.rs                        | 27 ++++++++++++++++---
 crates/vm/src/stdlib/sys/monitoring.rs        |  8 +++---
 6 files changed, 63 insertions(+), 23 deletions(-)

diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs
index 3f50120df17..cec04b9edd9 100644
--- a/crates/compiler-core/src/bytecode.rs
+++ b/crates/compiler-core/src/bytecode.rs
@@ -466,8 +466,12 @@ impl CodeUnits {
     }
 
     /// Read a u16 value from a CACHE code unit at `index`.
+    ///
+    /// # Panics
+    /// Panics if `index` is out of bounds.
     pub fn read_cache_u16(&self, index: usize) -> u16 {
         let units = unsafe { &*self.0.get() };
+        assert!(index < units.len(), "read_cache_u16: index out of bounds");
         let ptr = units.as_ptr().wrapping_add(index) as *const u8;
         unsafe { core::ptr::read_unaligned(ptr as *const u16) }
     }
@@ -484,6 +488,9 @@ impl CodeUnits {
     }
 
     /// Read a u32 value from two consecutive CACHE code units starting at `index`.
+    ///
+    /// # Panics
+    /// Panics if `index + 1` is out of bounds.
     pub fn read_cache_u32(&self, index: usize) -> u32 {
         let lo = self.read_cache_u16(index) as u32;
         let hi = self.read_cache_u16(index + 1) as u32;
@@ -502,6 +509,9 @@ impl CodeUnits {
     }
 
     /// Read a u64 value from four consecutive CACHE code units starting at `index`.
+    ///
+    /// # Panics
+    /// Panics if `index + 3` is out of bounds.
     pub fn read_cache_u64(&self, index: usize) -> u64 {
         let lo = self.read_cache_u32(index) as u64;
         let hi = self.read_cache_u32(index + 2) as u64;
@@ -553,7 +563,7 @@ impl CodeUnits {
     /// Called lazily at RESUME (first execution of a code object).
     /// Uses the `arg` byte of the first CACHE entry, preserving `op = Instruction::Cache`.
     pub fn quicken(&self) {
-        let units = unsafe { &*self.0.get() };
+        let units = unsafe { &mut *self.0.get() };
         let len = units.len();
         let mut i = 0;
         while i < len {
@@ -565,9 +575,7 @@ impl CodeUnits {
                 if !op.is_instrumented() {
                     let cache_base = i + 1;
                     if cache_base < len {
-                        unsafe {
-                            self.write_adaptive_counter(cache_base, ADAPTIVE_WARMUP_VALUE);
-                        }
+                        units[cache_base].arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE);
                     }
                 }
                 i += 1 + caches;
diff --git a/crates/compiler-core/src/bytecode/instruction.rs b/crates/compiler-core/src/bytecode/instruction.rs
index a7d030b6094..e7b13ff21d2 100644
--- a/crates/compiler-core/src/bytecode/instruction.rs
+++ b/crates/compiler-core/src/bytecode/instruction.rs
@@ -620,6 +620,10 @@ impl Instruction {
             }
             // RESUME specializations
             Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
+            // JUMP_BACKWARD specializations
+            Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward {
+                target: Arg::marker(),
+            },
             // Instrumented opcodes map back to their base
             _ => match self.to_base() {
                 Some(base) => base,
diff --git a/crates/vm/src/builtins/function.rs b/crates/vm/src/builtins/function.rs
index 84ef7933027..489482d1933 100644
--- a/crates/vm/src/builtins/function.rs
+++ b/crates/vm/src/builtins/function.rs
@@ -611,12 +611,14 @@ impl Py<PyFunction> {
     pub(crate) fn can_specialize_call(&self, effective_nargs: u32) -> bool {
         let code = self.code.lock();
         let flags = code.flags;
-        !flags.intersects(
-            bytecode::CodeFlags::VARARGS
-                | bytecode::CodeFlags::VARKEYWORDS
-                | bytecode::CodeFlags::GENERATOR
-                | bytecode::CodeFlags::COROUTINE,
-        ) && code.kwonlyarg_count == 0
+        flags.contains(bytecode::CodeFlags::NEWLOCALS)
+            && !flags.intersects(
+                bytecode::CodeFlags::VARARGS
+                    | bytecode::CodeFlags::VARKEYWORDS
+                    | bytecode::CodeFlags::GENERATOR
+                    | bytecode::CodeFlags::COROUTINE,
+            )
+            && code.kwonlyarg_count == 0
             && code.arg_count == effective_nargs
     }
 
diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs
index da2510f4f9a..b3a3c206c68 100644
--- a/crates/vm/src/builtins/type.rs
+++ b/crates/vm/src/builtins/type.rs
@@ -201,12 +201,19 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py<PyType>, b: &Py<PyType>) -> b
 impl PyType {
     /// Assign a fresh version tag. Returns 0 on overflow (all caches invalidated).
     pub fn assign_version_tag(&self) -> u32 {
-        let v = NEXT_TYPE_VERSION.fetch_add(1, Ordering::Relaxed);
-        if v == 0 {
-            return 0;
+        loop {
+            let current = NEXT_TYPE_VERSION.load(Ordering::Relaxed);
+            let Some(next) = current.checked_add(1) else {
+                return 0; // Overflow: version space exhausted
+            };
+            if NEXT_TYPE_VERSION
+                .compare_exchange_weak(current, next, Ordering::Relaxed, Ordering::Relaxed)
+                .is_ok()
+            {
+                self.tp_version_tag.store(current, Ordering::Release);
+                return current;
+            }
         }
-        self.tp_version_tag.store(v, Ordering::Release);
-        v
     }
 
     /// Invalidate this type's version tag and cascade to all subclasses.
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index 656be91ad4a..08ce117fd48 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -8,10 +8,8 @@ use crate::{
         PyFloat, PyGenerator, PyInt, PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned,
         PyTemplate, PyTraceback, PyType, PyUtf8Str,
         asyncgenerator::PyAsyncGenWrappedValue,
-        float::PyFloat,
         frame::stack_analysis,
         function::{PyCell, PyCellRef, PyFunction},
-        int::PyInt,
         range::PyRangeIterator,
         tuple::{PyTuple, PyTupleRef},
     },
@@ -2722,7 +2720,23 @@ impl ExecutingFrame<'_> {
                 if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version {
                     // Check instance dict doesn't shadow the method
                     let shadowed = if let Some(dict) = owner.dict() {
-                        dict.get_item_opt(attr_name, vm).ok().flatten().is_some()
+                        match dict.get_item_opt(attr_name, vm) {
+                            Ok(Some(_)) => true,
+                            Ok(None) => false,
+                            Err(_) => {
+                                // Dict lookup error → deoptimize to safe path
+                                unsafe {
+                                    self.code.instructions.replace_op(
+                                        instr_idx,
+                                        Instruction::LoadAttr { idx: Arg::marker() },
+                                    );
+                                    self.code
+                                        .instructions
+                                        .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+                                }
+                                return self.load_attr_slow(vm, oparg);
+                            }
+                        }
                     } else {
                         false
                     };
@@ -4471,7 +4485,12 @@ impl ExecutingFrame<'_> {
             type_version = cls.assign_version_tag();
         }
         if type_version == 0 {
-            // Version counter overflow
+            // Version counter overflow — back off to avoid re-attempting every execution
+            unsafe {
+                self.code
+                    .instructions
+                    .write_adaptive_counter(cache_base, ADAPTIVE_BACKOFF_VALUE);
+            }
             return;
         }
 
diff --git a/crates/vm/src/stdlib/sys/monitoring.rs b/crates/vm/src/stdlib/sys/monitoring.rs
index a57a23fd9b3..858ea83b8a7 100644
--- a/crates/vm/src/stdlib/sys/monitoring.rs
+++ b/crates/vm/src/stdlib/sys/monitoring.rs
@@ -276,13 +276,13 @@ pub fn instrument_code(code: &PyCode, events: u32) {
         let mut i = 0;
         while i < len {
             let op = code.code.instructions[i].op;
-            let de_opt = op.deoptimize();
-            if u8::from(de_opt) != u8::from(op) {
+            let base_op = op.deoptimize();
+            if u8::from(base_op) != u8::from(op) {
                 unsafe {
-                    code.code.instructions.replace_op(i, de_opt);
+                    code.code.instructions.replace_op(i, base_op);
                 }
             }
-            let caches = de_opt.cache_entries();
+            let caches = base_op.cache_entries();
             // Zero all CACHE entries (the op+arg bytes may have been overwritten
             // by specialization with arbitrary data like pointers).
             for c in 1..=caches {