diff --git a/.claude/knowledge/agnostic-surface-cpu-matrix.md b/.claude/knowledge/agnostic-surface-cpu-matrix.md
index 93e97b07..7ab9bca2 100644
--- a/.claude/knowledge/agnostic-surface-cpu-matrix.md
+++ b/.claude/knowledge/agnostic-surface-cpu-matrix.md
@@ -23,7 +23,7 @@ Same set as `td-simd-cpu-dispatch-matrix.md` § "Master matrix — x86_64" and
 | Z5   | `znver5` / `Zen4Avx512` (same dispatch) | AMD 2024           | same as Z4 + minor uarch       |
 | ARL  | `arrowlake` / `ArrowLake`               | Intel 2024         | AVX2+FMA + AVX-VNNI+VNNI-INT8  |
 | HSW  | `x86-64-v3` / `HaswellAvx2`             | Intel 2013→2021    | AVX2+FMA (no VNNI/AVX-512)     |
-| A76  | `cortex-a76` / `A76DotProd`             | ARMv8.2 (Pi 5, M1) | NEON+dotprod+bf16+fp16         |
+| A76  | `cortex-a76` / `A76DotProd`             | ARMv8.2 (Pi 5)     | NEON+dotprod+fp16 (no bf16 / i8mm — those are V8.6+, see § M) |
 | A72  | `cortex-a72` / `A72Fast`                | ARMv8.0 (Pi 4)     | NEON only (no dotprod)         |
 | A53  | `cortex-a53` / `A53Baseline`            | ARMv8.0 (Pi 3/Z2W) | NEON, lower IPC                |
 | SCA  | scalar fallback                         | wasm32/riscv/i686  | no SIMD                        |
@@ -530,6 +530,76 @@ verifies that no per-CPU regression has crept in vs the historical baseline:
    `crate::simd::*`, this table must grow a row. Reviewers should reject
    PRs that add a public symbol without a corresponding matrix entry.
 
+## M. AArch64 ground-truth core enumeration (GCC source)
+
+The matrix above uses three aarch64 columns (A53 / A72 / A76) that
+each cover a *dispatch tier* — multiple physical cores share the same
+SIMD primitive set. The authoritative per-core feature membership is
+in GCC's `gcc/config/aarch64/aarch64-cores.def`, scraped 2026-05-21:
+
+| Core | GCC arch | Explicit feature flags |
+|---|---|---|
+| **A53/A72/A76 tier** (baseline NEON, optional dotprod+fp16, NO bf16) | | |
+| `cortex-a53` | V8-A | `(CRC)` |
+| `cortex-a72` | V8-A | `(CRC)` |
+| `cortex-a76` | V8.2-A | `F16, RCPC, DOTPROD` |
+| `cortex-a78` | V8.2-A | `F16, RCPC, DOTPROD, SSBS, PROFILE` |
+| `cortex-x1`  | V8.2-A | `F16, RCPC, DOTPROD, SSBS, PROFILE` |
+| `neoverse-n1`| V8.2-A | `F16, RCPC, DOTPROD, PROFILE` |
+| `apple-m1`   | V8.5-A | `()` — V8.5 baseline includes F16+dotprod, NO bf16/i8mm |
+| **V8.6-A tier** (BF16 + I8MM via baseline) | | |
+| `apple-m2`   | V8.6-A | `()` — V8.6 baseline → bf16, i8mm (SVE/SVE2 are not part of the V8.6-A baseline) |
+| `apple-m3`   | V8.6-A | same |
+| `oryon-1`    | V8.6-A | `CRYPTO, SM4, SHA3, F16` (Snapdragon X Elite/Plus) |
+| `ampere1`    | V8.6-A | `F16, RNG, AES, SHA3` |
+| `ampere1a`   | V8.6-A | `F16, RNG, AES, SHA3, SM4, MEMTAG` |
+| **V8.7-A tier** (V8.6 baseline carried forward; LS64 is an optional extension, MOPS arrives with V8.8-A) | | |
+| `apple-m4`   | V8.7-A | `()` |
+| `ampere1b`   | V8.7-A | `F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC` |
+| **V9.0-A tier** (SVE2 baseline + explicit bf16/i8mm) | | |
+| `cortex-a510`| V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` |
+| `cortex-a710`| V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` |
+| `cortex-a715`| V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` |
+| `cortex-x2`  | V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` |
+| `cortex-x3`  | V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` |
+| `neoverse-n2`| V9-A | `I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE` |
+| `neoverse-v2`| V9-A | `I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE` (Graviton 4) |
+| `grace`      | V9-A | `I8MM, BF16, SVE2_BITPERM, SVE2_AES, SVE2_SHA3, SVE2_SM4, PROFILE` |
+| **V8.4-A SVE tier** (Graviton 3's odd one) | | |
+| `neoverse-v1`| V8.4-A | `SVE, I8MM, BF16, PROFILE, SSBS, RNG` |
+| **V9.2-A tier** (V9 + V8.7 features) | | |
+| `cortex-a520`| V9.2-A | `SVE2_BITPERM, MEMTAG` |
+| `cortex-a720`| V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` |
+| `cortex-a725`| V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` |
+| `cortex-x4`  | V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` |
+| `cortex-x925`| V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` |
+| `neoverse-n3`| V9.2-A | `SVE2_BITPERM, RNG, MEMTAG, PROFILE` |
+| `neoverse-v3`| V9.2-A | `SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE` |
+
+**Dispatch tier mapping (which matrix column each core lands in):**
+
+| Tier (matrix col.) | Cores |
+|---|---|
+| A53 | `cortex-a53`, older V8.0-A |
+| A72 | `cortex-a72`, V8.0-A + CRC |
+| A76 (V8.2 with dotprod+fp16, NO bf16/i8mm) | `cortex-a76`, `cortex-a78`, `cortex-x1`, `neoverse-n1`, `apple-m1` |
+| **(new tier — V8.6+/V9 with bf16+i8mm)** | `apple-m2`+, `oryon-1` (Snapdragon X), `cortex-a510`+, `neoverse-n2`/`v2`/`grace`, `ampere1`+ |
+| **(new tier — V8.4-A + SVE + bf16+i8mm)** | `neoverse-v1` (Graviton 3 — only V8.4-A core with explicit SVE+bf16+i8mm) |
+
+The matrix's three aarch64 columns cover the bottom of the dispatch
+ladder. The bf16/i8mm tier (which would carry NEON BFMMLA / BFDOT /
+USDOT / FMLA.8h) needs its own column in a future revision — when the
+NEON BF16 asm-byte arm lands (Phase 3b in § J), every V8.6+ core
+listed above gets covered by the same dispatch arm.
+
+**Source provenance:** scraped from
+`https://raw.githubusercontent.com/gcc-mirror/gcc/master/gcc/config/aarch64/aarch64-cores.def`
+(GCC trunk, 2026-05-21). The `AARCH64_CORE(...)` macro emits the
+canonical name → arch → feature-string mapping; GCC's
+`(define_insn ...)` patterns in `aarch64-simd.md` give the bit
+encodings for the asm-byte rule (`.inst 0xXXXXXXXX`) that Phase 3b
+will use for BFMMLA / BFDOT / FMLA.8h / USDOT.
+
 ## L. Provenance
 
 - CPU feature presence: sourced from `td-simd-cpu-dispatch-matrix.md`.
diff --git a/src/simd_runtime/add_mul.rs b/src/simd_runtime/add_mul.rs
index 799f65a5..ff7d83fd 100644
--- a/src/simd_runtime/add_mul.rs
+++ b/src/simd_runtime/add_mul.rs
@@ -228,6 +228,51 @@ unsafe fn add_mul_f64_scalar(acc: &mut [f64], a: &[f64], b: &[f64]) {
     }
 }
 
+// ────────────────────────────────────────────────────────────────────────
+// CpuOps DTO entry points — pub(super) forwarding wrappers so cpu_ops.rs
+// can name the tier-specific kernels in its static `CpuOps` tables.
+// Despite the `_safe` suffix each wrapper is an `unsafe fn`: whoever calls
+// through a table must make sure the host CPU supports that tier's kernel.
+// ────────────────────────────────────────────────────────────────────────
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f32_avx512_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_avx512(acc, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f64_avx512_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_avx512(acc, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f32_avx2_fma_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_avx2_fma(acc, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f64_avx2_fma_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_avx2_fma(acc, a, b)
+}
+
+#[cfg(target_arch = "aarch64")]
+pub(super) unsafe fn add_mul_f32_neon_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_neon(acc, a, b)
+}
+
+#[cfg(target_arch = "aarch64")]
+pub(super) unsafe fn add_mul_f64_neon_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_neon(acc, a, b)
+}
+
+pub(super) unsafe fn add_mul_f32_scalar_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_scalar(acc, a, b)
+}
+
+pub(super) unsafe fn add_mul_f64_scalar_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_scalar(acc, a, b)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/simd_runtime/cpu_ops.rs b/src/simd_runtime/cpu_ops.rs
new file mode 100644
index 00000000..c1fe43d7
--- /dev/null
+++ b/src/simd_runtime/cpu_ops.rs
@@ -0,0 +1,379 @@
+//! Per-CPU operations DTO — the *third* dispatch pattern.
+//!
+//! **Pattern 1** (`crate::simd::*`): compile-time `#[cfg(target_feature)]`
+//! cascade. Direct monomorphized call. No runtime branch.
+//!
+//! **Pattern 2** (`crate::simd_runtime::vnni_dot::vnni_dot_u8_i8`,
+//! `crate::simd_runtime::add_mul::*`): per-op `LazyLock<fn ptr>`
+//! trampoline. One atomic-load + CPUID per op the first time it's
+//! called.
+//!
+//! **Pattern 3 (this module)**: per-CPU [`CpuOps`] DTO selected once
+//! at first access. Every op is a field/method on the struct. Consumers
+//! that touch N different SIMD ops pay ONE LazyLock load total, not
+//! N. The OpenBLAS / MKL dispatch model — also what GCC's own libgcc
+//! uses for its multi-versioned function tables.
+//!
+//! # Why three patterns?
+//!
+//! - **Pattern 1** wins on bench / fixed-target builds — direct call
+//!   sites, no indirection, full inlining.
+//! - **Pattern 2** wins for sparse-op consumers — pay LazyLock cost
+//!   only for ops you actually call.
+//! - **Pattern 3** wins for dense-op consumers (linear-algebra
+//!   pipelines that touch every BLAS-1 + BLAS-2 + GEMM kernel) — the
+//!   single LazyLock load amortizes across all calls; cache-resident
+//!   `&'static CpuOps` keeps every op's fn-ptr in L1.
+//!
+//! All three coexist. Consumers pick by import path:
+//!
+//! ```ignore
+//! // Pattern 1: same call works on every target_feature config
+//! crate::simd_ops::add_mul_f32(acc, a, b);
+//!
+//! // Pattern 2: one runtime CPUID per op, first call
+//! crate::simd_runtime::add_mul_f32(acc, a, b);
+//!
+//! // Pattern 3: one runtime CPUID total, then per-op fn-ptr deref
+//! let ops = crate::simd_runtime::cpu_ops();
+//! unsafe { (ops.add_mul_f32)(acc, a, b); }
+//! ```
+//!
+//! # The naughty bit — data-driven lookup
+//!
+//! The [`CpuOps`] selection is driven by data scraped from GCC's
+//! `aarch64-cores.def` and `i386.h` (per the matrix doc § M). The
+//! per-tier static `CpuOps` instances are baked in below; the
+//! [`cpu_ops_for_tier`] lookup table maps a tier name → `&'static
+//! CpuOps` for debugging and explicit-tier-pinning ("I want to force
+//! the AVX2 arm even though AMX is available, to measure overhead").
+//!
+//! Future work: code-gen the table from a build.rs that fetches GCC's
+//! latest core list. For now the table is hand-rolled from the scrape
+//! recorded in `.claude/knowledge/agnostic-surface-cpu-matrix.md § M`.
+
+use std::sync::LazyLock;
+
+/// Per-CPU operations DTO — a table of `unsafe fn` pointers for the
+/// `vnni_dot_u8_i8`, `add_mul_f32` and `add_mul_f64` kernels of one tier.
+///
+/// `Copy + Clone`. Consumers hold it as `&'static CpuOps`; every instance
+/// in this module is a `static` item. The fn pointers are `unsafe` to
+/// call: the caller must make sure the host CPU supports the tier.
+#[derive(Copy, Clone)]
+pub struct CpuOps {
+    /// Human-readable tier name (`"amx_int8"`, `"avx512vnni"`,
+    /// `"avx512f"`, `"avxvnni"`, `"avx2_fma"`, `"neon"`, `"scalar"`).
+    /// These are the same strings `cpu_ops_for_tier` accepts as keys.
+    pub tier: &'static str,
+
+    /// Architecture string (`"x86_64"`, `"aarch64"`, `"scalar"`).
+    pub arch: &'static str,
+
+    /// `u8 × i8 → i32` dot product.
+    pub vnni_dot_u8_i8: unsafe fn(&[u8], &[i8]) -> i32,
+
+    /// `acc[i] += a[i] * b[i]` (f32, single-rounded FMA).
+    pub add_mul_f32: unsafe fn(&mut [f32], &[f32], &[f32]),
+
+    /// `acc[i] += a[i] * b[i]` (f64, single-rounded FMA).
+    pub add_mul_f64: unsafe fn(&mut [f64], &[f64], &[f64]),
+}
+
+// ────────────────────────────────────────────────────────────────────────
+// Per-tier static instances
+// ────────────────────────────────────────────────────────────────────────
+//
+// Each `CPU_OPS_*` is a `static CpuOps` initialized at compile time; the
+// lookup functions below hand out `&'static` references to them. The fn
+// ptrs name the `pub(super)` wrappers in `super::{vnni_dot, add_mul}`.
+// No kernel code is duplicated here; this module is pure dispatch glue.
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AMX_INT8: CpuOps = CpuOps {
+    tier: "amx_int8",
+    arch: "x86_64",
+    // Slice-level dot uses the AVX-512 VNNI kernel even on AMX hosts:
+    // TDPBUSD operates on tile registers, not single rows. Every entry
+    // in this table is therefore an AVX-512 kernel, not an AMX one.
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_avx512_with_tail_safe,
+    add_mul_f32: super::add_mul::add_mul_f32_avx512_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx512_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVX512_VNNI: CpuOps = CpuOps {
+    tier: "avx512vnni",
+    arch: "x86_64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_avx512_with_tail_safe,
+    add_mul_f32: super::add_mul::add_mul_f32_avx512_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx512_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVX512F: CpuOps = CpuOps {
+    tier: "avx512f",
+    arch: "x86_64",
+    // No VNNI on this tier — the u8×i8 dot falls back to the scalar wrapper.
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_avx512_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx512_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVXVNNI: CpuOps = CpuOps {
+    tier: "avxvnni",
+    arch: "x86_64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni2_dot_u8_i8_safe,
+    add_mul_f32: super::add_mul::add_mul_f32_avx2_fma_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx2_fma_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVX2_FMA: CpuOps = CpuOps {
+    tier: "avx2_fma",
+    arch: "x86_64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_avx2_fma_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx2_fma_safe,
+};
+
+#[cfg(target_arch = "aarch64")]
+static CPU_OPS_NEON: CpuOps = CpuOps {
+    tier: "neon",
+    arch: "aarch64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_neon_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_neon_safe,
+};
+
+/// Universal scalar fallback. Always available on every target.
+static CPU_OPS_SCALAR: CpuOps = CpuOps {
+    tier: "scalar",
+    arch: "scalar",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_scalar_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_scalar_safe,
+};
+
+// ────────────────────────────────────────────────────────────────────────
+// Selection
+// ────────────────────────────────────────────────────────────────────────
+
+/// Lazily-selected per-CPU ops table for the current host.
+///
+/// The first call runs feature detection via [`simd_caps`] inside a
+/// `LazyLock`; every later call returns the cached `&'static CpuOps`,
+/// so a consumer touching many ops pays one lazy load, not one per op.
+///
+/// Selection order: AMX-INT8 → AVX-512 VNNI → AVX-512F → AVX-VNNI →
+/// AVX2+FMA on x86_64, NEON on aarch64, scalar otherwise. A tier is
+/// chosen only if every feature its table's kernels rely on is
+/// reported: the AMX table holds AVX-512 VNNI kernels, so it also
+/// needs `avx512f` + `avx512vnni`; the AVX-VNNI table holds the
+/// AVX2+FMA `add_mul_*` kernels, so it also needs `fma`.
+///
+/// [`simd_caps`]: crate::hpc::simd_caps::simd_caps
+pub fn cpu_ops() -> &'static CpuOps {
+    static SELECTED: LazyLock<&'static CpuOps> = LazyLock::new(|| {
+        let _caps = crate::hpc::simd_caps::simd_caps();
+
+        #[cfg(target_arch = "x86_64")]
+        {
+            if _caps.amx_int8 && _caps.avx512f && _caps.avx512vnni {
+                return &CPU_OPS_AMX_INT8;
+            }
+            if _caps.avx512f && _caps.avx512vnni {
+                return &CPU_OPS_AVX512_VNNI;
+            }
+            if _caps.avx512f {
+                return &CPU_OPS_AVX512F;
+            }
+            if _caps.avx2 && _caps.fma && _caps.avxvnniint8 {
+                return &CPU_OPS_AVXVNNI;
+            }
+            if _caps.avx2 && _caps.fma {
+                return &CPU_OPS_AVX2_FMA;
+            }
+        }
+
+        #[cfg(target_arch = "aarch64")]
+        {
+            if _caps.neon {
+                return &CPU_OPS_NEON;
+            }
+        }
+
+        &CPU_OPS_SCALAR
+    });
+    *SELECTED
+}
+
+/// Looks up a `&'static CpuOps` by tier name, e.g. to pin a narrower
+/// tier than auto-detection would pick (AVX2 on an AMX host) for
+/// benchmarking, or to resolve a name read from `cpu_ops().tier`.
+///
+/// No CPU feature check happens here: the fn pointers in the returned
+/// table are `unsafe` and must only be called on a host that supports
+/// that tier. Returns `None` for unknown names and for tiers that are
+/// not compiled for the current `target_arch`. Known tier strings:
+///
+///   x86_64: `"amx_int8"`, `"avx512vnni"`, `"avx512f"`, `"avxvnni"`,
+///           `"avx2_fma"`
+///   aarch64: `"neon"`
+///   universal: `"scalar"`
+///
+/// NEON BF16 / dotprod tiers are planned (Phase 3b in the matrix doc).
+pub fn cpu_ops_for_tier(name: &str) -> Option<&'static CpuOps> {
+    match name {
+        #[cfg(target_arch = "x86_64")]
+        "amx_int8" => Some(&CPU_OPS_AMX_INT8),
+        #[cfg(target_arch = "x86_64")]
+        "avx512vnni" => Some(&CPU_OPS_AVX512_VNNI),
+        #[cfg(target_arch = "x86_64")]
+        "avx512f" => Some(&CPU_OPS_AVX512F),
+        #[cfg(target_arch = "x86_64")]
+        "avxvnni" => Some(&CPU_OPS_AVXVNNI),
+        #[cfg(target_arch = "x86_64")]
+        "avx2_fma" => Some(&CPU_OPS_AVX2_FMA),
+        #[cfg(target_arch = "aarch64")]
+        "neon" => Some(&CPU_OPS_NEON),
+        "scalar" => Some(&CPU_OPS_SCALAR),
+        _ => None,
+    }
+}
+
+/// Lookup by GCC CPU codename (e.g. `"sapphirerapids"`,
+/// `"neoverse-v2"`, `"apple-m2"`). Maps the name to its dispatch tier
+/// via `cpu_to_tier`, then resolves that tier with
+/// [`cpu_ops_for_tier`]; the mapping data comes from the scrape
+/// recorded in the matrix doc § M.
+///
+/// Used for "what would this CPU pick?" introspection without
+/// touching CPUID on the running host. No feature check is done, so
+/// the returned fn pointers must only be called on a host that really
+/// supports the tier.
+///
+/// Returns `None` for unknown CPU names, and also for known CPUs
+/// whose tier is not compiled for the current `target_arch` (e.g.
+/// `"apple-m2"` → `"neon"` on an x86_64 build). For anything unmapped
+/// callers can fall back to `cpu_ops_for_tier("scalar")`.
+pub fn cpu_ops_for_cpu(name: &str) -> Option<&'static CpuOps> {
+    cpu_ops_for_tier(cpu_to_tier(name)?)
+}
+
+/// Maps a GCC CPU codename to its [`CpuOps`] tier name (`None` if unlisted).
+/// Data: `.claude/knowledge/agnostic-surface-cpu-matrix.md` § M (aarch64)
+/// plus the GCC i386 cpu definitions for x86_64. Pure string lookup.
+fn cpu_to_tier(cpu: &str) -> Option<&'static str> {
+    Some(match cpu {
+        // x86_64 — AMX-INT8 hosts
+        "sapphirerapids" | "graniterapids" | "graniterapids-d" | "emeraldrapids" => "amx_int8",
+
+        // x86_64 — AVX-512 + VNNI (no AMX)
+        "cascadelake" | "cooperlake" | "icelake-client" | "icelake-server" | "tigerlake" | "rocketlake" | "znver4"
+        | "znver5" => "avx512vnni",
+
+        // x86_64 — AVX-512F only (no VNNI)
+        "skylake-avx512" => "avx512f",
+
+        // x86_64 — AVX-VNNI, no AVX-512. NOTE(review): cpu_ops() gates this tier on avxvnniint8.
+        "alderlake" | "raptorlake" | "meteorlake" | "arrowlake" | "arrowlake-s" | "lunarlake" | "pantherlake"
+        | "sierraforest" | "grandridge" => "avxvnni",
+
+        // x86_64 — plain AVX2 + FMA (Haswell baseline, Zen 1-3)
+        "haswell" | "broadwell" | "skylake" | "kaby-lake" | "comet-lake" | "znver1" | "znver2" | "znver3" => "avx2_fma",
+
+        // aarch64 — every listed core (V8.0-A through V9.2-A) lands at
+        // the generic NEON tier today; the V8.6+/V9 cores would move to
+        // neon_bf16 once that tier lands.
+        "apple-m1" | "apple-m2" | "apple-m3" | "apple-m4" | "oryon-1" | "ampere1" | "ampere1a" | "ampere1b"
+        | "cortex-a76" | "cortex-a78" | "cortex-a510" | "cortex-a520" | "cortex-a710" | "cortex-a715"
+        | "cortex-a720" | "cortex-a725" | "cortex-x1" | "cortex-x2" | "cortex-x3" | "cortex-x4" | "cortex-x925"
+        | "neoverse-n1" | "neoverse-n2" | "neoverse-n3" | "neoverse-v1" | "neoverse-v2" | "neoverse-v3" | "grace"
+        | "cortex-a72" | "cortex-a53" => "neon",
+
+        // Unknown CPU
+        _ => return None,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cpu_ops_resolves_on_this_host() {
+        let ops = cpu_ops();
+        // Tier name is non-empty.
+        assert!(!ops.tier.is_empty(), "tier name is empty");
+        // Arch is one of the known classes.
+        assert!(matches!(ops.arch, "x86_64" | "aarch64" | "scalar"));
+        eprintln!("cpu_ops() resolved tier={} arch={}", ops.tier, ops.arch);
+    }
+
+    #[test]
+    fn cpu_ops_stable_across_calls() {
+        let a = cpu_ops() as *const _ as usize;
+        let b = cpu_ops() as *const _ as usize;
+        assert_eq!(a, b, "cpu_ops() returned different pointers across calls");
+    }
+
+    #[test]
+    fn cpu_ops_for_tier_known_names() {
+        assert!(cpu_ops_for_tier("scalar").is_some());
+        #[cfg(target_arch = "x86_64")]
+        {
+            for tier in ["amx_int8", "avx512vnni", "avx512f", "avxvnni", "avx2_fma"] {
+                assert_eq!(cpu_ops_for_tier(tier).map(|ops| ops.tier), Some(tier));
+            }
+        }
+        #[cfg(target_arch = "aarch64")]
+        {
+            assert!(cpu_ops_for_tier("neon").is_some());
+        }
+        assert!(cpu_ops_for_tier("nonsense_tier").is_none());
+    }
+
+    #[test]
+    fn cpu_ops_for_cpu_data_driven_lookup() {
+        // Spot-check the GCC-scraped mapping (matrix doc § M).
+        assert_eq!(cpu_to_tier("sapphirerapids"), Some("amx_int8"));
+        assert_eq!(cpu_to_tier("graniterapids"), Some("amx_int8"));
+        assert_eq!(cpu_to_tier("cascadelake"), Some("avx512vnni"));
+        assert_eq!(cpu_to_tier("znver4"), Some("avx512vnni"));
+        assert_eq!(cpu_to_tier("znver5"), Some("avx512vnni"));
+        assert_eq!(cpu_to_tier("alderlake"), Some("avxvnni"));
+        assert_eq!(cpu_to_tier("arrowlake"), Some("avxvnni"));
+        assert_eq!(cpu_to_tier("haswell"), Some("avx2_fma"));
+        assert_eq!(cpu_to_tier("znver3"), Some("avx2_fma"));
+
+        assert_eq!(cpu_to_tier("apple-m2"), Some("neon"));
+        assert_eq!(cpu_to_tier("neoverse-v2"), Some("neon"));
+        assert_eq!(cpu_to_tier("oryon-1"), Some("neon"));
+        assert_eq!(cpu_to_tier("grace"), Some("neon"));
+
+        assert_eq!(cpu_to_tier("totally-fake-cpu"), None);
+    }
+
+    #[test]
+    fn cpu_ops_call_through_dto() {
+        // The whole point: call ops through the DTO and verify it
+        // produces correct results. Exercises the indirect-call
+        // through the fn ptr without going through the per-op LazyLock.
+        let ops = cpu_ops();
+        let a: Vec<u8> = (0..100).map(|i| (i % 256) as u8).collect();
+        let b: Vec<i8> = (0..100).map(|i| ((i * 3) % 256) as u8 as i8).collect();
+        let got = unsafe { (ops.vnni_dot_u8_i8)(&a, &b) };
+        let expected: i32 = (0..100).map(|i| a[i] as i32 * b[i] as i32).sum();
+        assert_eq!(got, expected, "vnni_dot via CpuOps DTO mismatch");
+
+        let mut acc = vec![1.0f32; 32];
+        let xa = vec![2.0f32; 32];
+        let xb = vec![3.0f32; 32];
+        unsafe { (ops.add_mul_f32)(&mut acc, &xa, &xb) };
+        for &v in &acc {
+            assert!((v - 7.0).abs() < 1e-6, "add_mul_f32 via DTO: got {v}");
+        }
+    }
+}
diff --git a/src/simd_runtime/mod.rs b/src/simd_runtime/mod.rs
index 428020d3..666c1277 100644
--- a/src/simd_runtime/mod.rs
+++ b/src/simd_runtime/mod.rs
@@ -84,6 +84,7 @@ compile_error!(
 
 pub mod add_mul;
 pub mod casts;
+pub mod cpu_ops;
 pub mod matmul;
 pub mod vnni_dot;
 
@@ -91,5 +92,6 @@ pub mod vnni_dot;
 // consumers can `use crate::simd_runtime::*` and get every op flat.
 pub use add_mul::{add_mul_f32, add_mul_f64};
 pub use casts::{bf16_to_f32_batch, cast_f16_to_f32_batch, cast_f32_to_f16_batch, f32_to_bf16_batch_rne};
+pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, CpuOps};
 pub use matmul::{gemm_u8_i8, matmul_bf16_to_f32, matmul_f32, matmul_i8_to_i32};
 pub use vnni_dot::vnni_dot_u8_i8;
diff --git a/src/simd_runtime/vnni_dot.rs b/src/simd_runtime/vnni_dot.rs
index 4abafede..4e9ba980 100644
--- a/src/simd_runtime/vnni_dot.rs
+++ b/src/simd_runtime/vnni_dot.rs
@@ -125,6 +125,28 @@ unsafe fn vnni_dot_u8_i8_scalar_safe_wrapper(a: &[u8], b: &[i8]) -> i32 {
     crate::simd_amx::vnni_dot_u8_i8_scalar(a, b)
 }
 
+// ────────────────────────────────────────────────────────────────────────
+// CpuOps DTO entry points — forwarders to the kernels above, exposed
+// pub(super) so cpu_ops.rs can reference them by name in static decls.
+// ────────────────────────────────────────────────────────────────────────
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn vnni_dot_u8_i8_avx512_with_tail_safe(a: &[u8], b: &[i8]) -> i32 {
+    // SAFETY: caller contract — avx512f + avx512vnni must be present.
+    // Reachable unchecked via cpu_ops_for_tier("amx_int8" | "avx512vnni").
+    vnni_dot_u8_i8_avx512_with_tail(a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn vnni2_dot_u8_i8_safe(a: &[u8], b: &[i8]) -> i32 {
+    // SAFETY: caller contract — avx2 + avxvnniint8 (cpu_ops() checks both).
+    vnni2_dot_u8_i8_safe_wrapper(a, b)
+}
+
+pub(super) unsafe fn vnni_dot_u8_i8_scalar_wrapper(a: &[u8], b: &[i8]) -> i32 {
+    vnni_dot_u8_i8_scalar_safe_wrapper(a, b)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;