diff --git a/.claude/knowledge/agnostic-surface-cpu-matrix.md b/.claude/knowledge/agnostic-surface-cpu-matrix.md index 93e97b07..7ab9bca2 100644 --- a/.claude/knowledge/agnostic-surface-cpu-matrix.md +++ b/.claude/knowledge/agnostic-surface-cpu-matrix.md @@ -23,7 +23,7 @@ Same set as `td-simd-cpu-dispatch-matrix.md` § "Master matrix — x86_64" and | Z5 | `znver5` / `Zen4Avx512` (same dispatch) | AMD 2024 | same as Z4 + minor uarch | | ARL | `arrowlake` / `ArrowLake` | Intel 2024 | AVX2+FMA + AVX-VNNI+VNNI-INT8 | | HSW | `x86-64-v3` / `HaswellAvx2` | Intel 2013→2021 | AVX2+FMA (no VNNI/AVX-512) | -| A76 | `cortex-a76` / `A76DotProd` | ARMv8.2 (Pi 5, M1) | NEON+dotprod+bf16+fp16 | +| A76 | `cortex-a76` / `A76DotProd` | ARMv8.2 (Pi 5) | NEON+dotprod+fp16 (no bf16 / i8mm — those are V8.6+, see § M) | | A72 | `cortex-a72` / `A72Fast` | ARMv8.0 (Pi 4) | NEON only (no dotprod) | | A53 | `cortex-a53` / `A53Baseline` | ARMv8.0 (Pi 3/Z2W) | NEON, lower IPC | | SCA | scalar fallback | wasm32/riscv/i686 | no SIMD | @@ -530,6 +530,76 @@ verifies that no per-CPU regression has crept in vs the historical baseline: `crate::simd::*`, this table must grow a row. Reviewers should reject PRs that add a public symbol without a corresponding matrix entry. +## M. AArch64 ground-truth core enumeration (GCC source) + +The matrix above uses three aarch64 columns (A53 / A72 / A76) that +each cover a *dispatch tier* — multiple physical cores share the same +SIMD primitive set. 
The authoritative per-core feature membership is +in GCC's `gcc/config/aarch64/aarch64-cores.def`, scraped 2026-05-21: + +| Core | GCC arch | Explicit feature flags | +|---|---|---| +| **A53/A72/A76 tier** (baseline NEON, optional dotprod+fp16, NO bf16) | | | +| `cortex-a53` | V8-A | `(CRC)` | +| `cortex-a72` | V8-A | `(CRC)` | +| `cortex-a76` | V8.2-A | `F16, RCPC, DOTPROD` | +| `cortex-a78` | V8.2-A | `F16, RCPC, DOTPROD, SSBS, PROFILE` | +| `cortex-x1` | V8.2-A | `F16, RCPC, DOTPROD, SSBS, PROFILE` | +| `neoverse-n1`| V8.2-A | `F16, RCPC, DOTPROD, PROFILE` | +| `apple-m1` | V8.5-A | `()` — V8.5 baseline includes F16+dotprod, NO bf16/i8mm | +| **V8.6-A tier** (BF16 + I8MM via baseline) | | | +| `apple-m2` | V8.6-A | `()` — V8.6 baseline → bf16, i8mm (SVE/SVE2 are NOT part of the V8.6-A baseline) | +| `apple-m3` | V8.6-A | same | +| `oryon-1` | V8.6-A | `CRYPTO, SM4, SHA3, F16` (Snapdragon X Elite/Plus) | +| `ampere1` | V8.6-A | `F16, RNG, AES, SHA3` | +| `ampere1a` | V8.6-A | `F16, RNG, AES, SHA3, SM4, MEMTAG` | +| **V8.7-A tier** (baseline + LS64 + MOPS) | | | +| `apple-m4` | V8.7-A | `()` | +| `ampere1b` | V8.7-A | `F16, RNG, AES, SHA3, SM4, MEMTAG, CSSC` | +| **V9.0-A tier** (SVE2 baseline + explicit bf16/i8mm) | | | +| `cortex-a510`| V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` | +| `cortex-a710`| V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` | +| `cortex-a715`| V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` | +| `cortex-x2` | V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` | +| `cortex-x3` | V9-A | `SVE2_BITPERM, MEMTAG, I8MM, BF16` | +| `neoverse-n2`| V9-A | `I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE` | +| `neoverse-v2`| V9-A | `I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE` (Graviton 4) | +| `grace` | V9-A | `I8MM, BF16, SVE2_BITPERM, SVE2_AES, SVE2_SHA3, SVE2_SM4, PROFILE` | +| **V8.4-A SVE tier** (Graviton 3's odd one) | | | +| `neoverse-v1`| V8.4-A | `SVE, I8MM, BF16, PROFILE, SSBS, RNG` | +| **V9.2-A tier** (V9 + V8.7 features) | | | +| `cortex-a520`| V9.2-A | `SVE2_BITPERM, MEMTAG` | +| 
`cortex-a720`| V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` | +| `cortex-a725`| V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` | +| `cortex-x4` | V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` | +| `cortex-x925`| V9.2-A | `SVE2_BITPERM, MEMTAG, PROFILE` | +| `neoverse-n3`| V9.2-A | `SVE2_BITPERM, RNG, MEMTAG, PROFILE` | +| `neoverse-v3`| V9.2-A | `SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE` | + +**Dispatch tier mapping (which matrix column each core lands in):** + +| Tier (matrix col.) | Cores | +|---|---| +| A53 | `cortex-a53`, older V8.0-A | +| A72 | `cortex-a72`, V8.0-A + CRC | +| A76 (V8.2 with dotprod+fp16, NO bf16/i8mm) | `cortex-a76`, `cortex-a78`, `cortex-x1`, `neoverse-n1`, `apple-m1` | +| **(new tier — V8.6+/V9 with bf16+i8mm)** | `apple-m2`+, `oryon-1` (Snapdragon X), `cortex-a510`+, `neoverse-n2`/`v2`/`grace`, `ampere1`+ | +| **(new tier — V8.4-A + SVE + bf16+i8mm)** | `neoverse-v1` (Graviton 3 — only V8.4-A core with explicit SVE+bf16+i8mm) | + +The matrix's three aarch64 columns cover the bottom of the dispatch +ladder. The bf16/i8mm tier (which would carry NEON BFMMLA / BFDOT / +USDOT / FMLA.8h) needs its own column in a future revision — when the +NEON BF16 asm-byte arm lands (Phase 3b in § J), every V8.6+ core +listed above gets covered by the same dispatch arm. + +**Source provenance:** scraped from +`https://raw.githubusercontent.com/gcc-mirror/gcc/master/gcc/config/aarch64/aarch64-cores.def` +(GCC trunk, 2026-05-21). The `AARCH64_CORE(...)` macro emits the +canonical name → arch → feature-string mapping; GCC's +`(define_insn ...)` patterns in `aarch64-simd.md` give the bit +encodings for the asm-byte rule (`.inst 0xXXXXXXXX`) that Phase 3b +will use for BFMMLA / BFDOT / FMLA.8h / USDOT. + ## L. Provenance - CPU feature presence: sourced from `td-simd-cpu-dispatch-matrix.md`. 
diff --git a/src/simd_runtime/add_mul.rs b/src/simd_runtime/add_mul.rs
index 799f65a5..ff7d83fd 100644
--- a/src/simd_runtime/add_mul.rs
+++ b/src/simd_runtime/add_mul.rs
@@ -228,6 +228,51 @@ unsafe fn add_mul_f64_scalar(acc: &mut [f64], a: &[f64], b: &[f64]) {
     }
 }
 
+// ────────────────────────────────────────────────────────────────────────
+// CpuOps DTO entry points — pub(super) forwarders so cpu_ops.rs can name
+// the tier-specific kernels in its per-tier `static CpuOps` tables.
+// SAFETY contract (all wrappers): the caller must be on a CPU supporting the
+// wrapped kernel; cpu_ops() selects by CPUID, cpu_ops_for_tier() is unchecked.
+// ────────────────────────────────────────────────────────────────────────
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f32_avx512_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_avx512(acc, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f64_avx512_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_avx512(acc, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f32_avx2_fma_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_avx2_fma(acc, a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn add_mul_f64_avx2_fma_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_avx2_fma(acc, a, b)
+}
+
+#[cfg(target_arch = "aarch64")]
+pub(super) unsafe fn add_mul_f32_neon_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_neon(acc, a, b)
+}
+
+#[cfg(target_arch = "aarch64")]
+pub(super) unsafe fn add_mul_f64_neon_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_neon(acc, a, b)
+}
+
+pub(super) unsafe fn add_mul_f32_scalar_safe(acc: &mut [f32], a: &[f32], b: &[f32]) {
+    add_mul_f32_scalar(acc, a, b)
+}
+
+pub(super) unsafe fn add_mul_f64_scalar_safe(acc: &mut [f64], a: &[f64], b: &[f64]) {
+    add_mul_f64_scalar(acc, a, b)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/simd_runtime/cpu_ops.rs b/src/simd_runtime/cpu_ops.rs
new file mode 100644
index 00000000..c1fe43d7
--- /dev/null
+++ b/src/simd_runtime/cpu_ops.rs
@@ -0,0 +1,379 @@
+//! Per-CPU operations DTO — the *third* dispatch pattern.
+//!
+//! **Pattern 1** (`crate::simd::*`): compile-time `#[cfg(target_feature)]`
+//! cascade. Direct monomorphized call. No runtime branch.
+//!
+//! **Pattern 2** (`crate::simd_runtime::vnni_dot::vnni_dot_u8_i8`,
+//! `crate::simd_runtime::add_mul::*`): per-op `LazyLock`
+//! trampoline. One atomic-load + CPUID per op the first time it's
+//! called.
+//!
+//! **Pattern 3 (this module)**: per-CPU [`CpuOps`] DTO selected once
+//! at first access. Every op is a field/method on the struct. Consumers
+//! that touch N different SIMD ops pay ONE LazyLock load total, not
+//! N. The OpenBLAS / MKL dispatch model — also what GCC's own libgcc
+//! uses for its multi-versioned function tables.
+//!
+//! # Why three patterns?
+//!
+//! - **Pattern 1** wins on bench / fixed-target builds — direct call
+//!   sites, no indirection, full inlining.
+//! - **Pattern 2** wins for sparse-op consumers — pay LazyLock cost
+//!   only for ops you actually call.
+//! - **Pattern 3** wins for dense-op consumers (linear-algebra
+//!   pipelines that touch every BLAS-1 + BLAS-2 + GEMM kernel) — the
+//!   single LazyLock load amortizes across all calls; cache-resident
+//!   `&'static CpuOps` keeps every op's fn-ptr in L1.
+//!
+//! All three coexist. Consumers pick by import path:
+//!
+//! ```ignore
+//! // Pattern 1: same call works on every target_feature config
+//! crate::simd_ops::add_mul_f32(acc, a, b);
+//!
+//! // Pattern 2: one runtime CPUID per op, first call
+//! crate::simd_runtime::add_mul_f32(acc, a, b);
+//!
+//! // Pattern 3: one runtime CPUID total, then per-op fn-ptr deref
+//! let ops = crate::simd_runtime::cpu_ops();
+//! unsafe { (ops.add_mul_f32)(acc, a, b); }
+//! ```
+//!
+//! # The naughty bit — data-driven lookup
+//!
+//! [`cpu_ops`] selects from runtime CPUID caps; [`cpu_ops_for_cpu`] maps
+//! CPU names scraped from GCC's `aarch64-cores.def` and `i386.h` (per the
+//! matrix doc § M). The per-tier static `CpuOps` instances are baked in
+//! below; [`cpu_ops_for_tier`] maps a tier name → `&'static CpuOps` for
+//! debugging and explicit-tier-pinning ("I want to force the AVX2 arm
+//! even though AMX is available, to measure overhead").
+//!
+//! Future work: code-gen the table from a build.rs that fetches GCC's
+//! latest core list. For now the table is hand-rolled from the scrape
+//! recorded in `.claude/knowledge/agnostic-surface-cpu-matrix.md § M`.
+
+use std::sync::LazyLock;
+
+/// Per-CPU operations DTO — fn-pointer table for the slice-level ops
+/// `vnni_dot_u8_i8`, `add_mul_f32` and `add_mul_f64`.
+///
+/// `Copy + 'static`; consumers hold a `&'static CpuOps` to one of the
+/// per-tier `static` tables below. Every op is an `unsafe fn` pointer:
+/// the caller must be on a CPU that supports the table's tier.
+#[derive(Copy, Clone)]
+pub struct CpuOps {
+    /// Human-readable tier name (`"amx_int8"`, `"avx512vnni"`,
+    /// `"avx512f"`, `"avxvnni"`, `"avx2_fma"`, `"neon"`, `"scalar"`;
+    /// `"neon_bf16"` / `"neon_dotprod"` are planned, not yet defined).
+    pub tier: &'static str,
+
+    /// Architecture string (`"x86_64"`, `"aarch64"`, `"scalar"`).
+    pub arch: &'static str,
+
+    /// `u8 × i8 → i32` dot product.
+    pub vnni_dot_u8_i8: unsafe fn(&[u8], &[i8]) -> i32,
+
+    /// `acc[i] += a[i] * b[i]` (f32, single-rounded FMA).
+    pub add_mul_f32: unsafe fn(&mut [f32], &[f32], &[f32]),
+
+    /// `acc[i] += a[i] * b[i]` (f64, single-rounded FMA).
+    pub add_mul_f64: unsafe fn(&mut [f64], &[f64], &[f64]),
+}
+
+// ────────────────────────────────────────────────────────────────────────
+// Per-tier static instances
+// ────────────────────────────────────────────────────────────────────────
+//
+// Each `CPU_OPS_*` is a `static CpuOps` baked at compile time. The
+// fn ptrs reference the `pub(super)` `*_safe` / `*_wrapper` entry points in
+// `crate::simd_runtime::{vnni_dot, add_mul}`. We don't duplicate kernel
+// code here; this module is pure dispatch glue.
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AMX_INT8: CpuOps = CpuOps {
+    tier: "amx_int8",
+    arch: "x86_64",
+    // VNNI dot prefers AVX-512 VNNI on AMX hosts (matmul uses TDPBUSD;
+    // for slice-level dot the AVX-512 path is still the right primitive
+    // since TDPBUSD operates on tile registers, not single rows).
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_avx512_with_tail_safe,
+    add_mul_f32: super::add_mul::add_mul_f32_avx512_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx512_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVX512_VNNI: CpuOps = CpuOps {
+    tier: "avx512vnni",
+    arch: "x86_64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_avx512_with_tail_safe,
+    add_mul_f32: super::add_mul::add_mul_f32_avx512_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx512_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVX512F: CpuOps = CpuOps {
+    tier: "avx512f",
+    arch: "x86_64",
+    // No VNNI on this tier — falls back to scalar wrapper.
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_avx512_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx512_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVXVNNI: CpuOps = CpuOps {
+    tier: "avxvnni",
+    arch: "x86_64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni2_dot_u8_i8_safe,
+    add_mul_f32: super::add_mul::add_mul_f32_avx2_fma_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx2_fma_safe,
+};
+
+#[cfg(target_arch = "x86_64")]
+static CPU_OPS_AVX2_FMA: CpuOps = CpuOps {
+    tier: "avx2_fma",
+    arch: "x86_64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_avx2_fma_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_avx2_fma_safe,
+};
+
+#[cfg(target_arch = "aarch64")]
+static CPU_OPS_NEON: CpuOps = CpuOps {
+    tier: "neon",
+    arch: "aarch64",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_neon_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_neon_safe,
+};
+
+/// Universal scalar fallback. Always available on every target.
+static CPU_OPS_SCALAR: CpuOps = CpuOps {
+    tier: "scalar",
+    arch: "scalar",
+    vnni_dot_u8_i8: super::vnni_dot::vnni_dot_u8_i8_scalar_wrapper,
+    add_mul_f32: super::add_mul::add_mul_f32_scalar_safe,
+    add_mul_f64: super::add_mul::add_mul_f64_scalar_safe,
+};
+
+// ────────────────────────────────────────────────────────────────────────
+// Selection
+// ────────────────────────────────────────────────────────────────────────
+
+/// Lazily-selected per-CPU ops table for the current host.
+///
+/// First call: ~1µs (LazyLock initialization + CPUID via
+/// [`simd_caps`]). Subsequent calls: one atomic-acquire load that's
+/// cache-resident; ~1-2 ns. Every op-method on the returned
+/// `&'static CpuOps` is then ONE indirect call (no further LazyLock).
+///
+/// The AMX tier is taken only when AVX-512F + AVX-512 VNNI are also
+/// reported (its slice kernels are the AVX-512 ones); the AVX-VNNI tier
+/// only when FMA is (it installs the `avx2_fma` add_mul kernels). Versus
+/// the per-op LazyLocks in [`crate::simd_runtime::vnni_dot::vnni_dot_u8_i8`]
+/// etc., one load is amortized across all ops instead of one per op.
+///
+/// [`simd_caps`]: crate::hpc::simd_caps::simd_caps
+pub fn cpu_ops() -> &'static CpuOps {
+    static SELECTED: LazyLock<&'static CpuOps> = LazyLock::new(|| {
+        let _caps = crate::hpc::simd_caps::simd_caps();
+
+        #[cfg(target_arch = "x86_64")]
+        {
+            if _caps.amx_int8 && _caps.avx512f && _caps.avx512vnni {
+                return &CPU_OPS_AMX_INT8;
+            }
+            if _caps.avx512f && _caps.avx512vnni {
+                return &CPU_OPS_AVX512_VNNI;
+            }
+            if _caps.avx512f {
+                return &CPU_OPS_AVX512F;
+            }
+            if _caps.avx2 && _caps.fma && _caps.avxvnniint8 {
+                return &CPU_OPS_AVXVNNI;
+            }
+            if _caps.avx2 && _caps.fma {
+                return &CPU_OPS_AVX2_FMA;
+            }
+        }
+
+        #[cfg(target_arch = "aarch64")]
+        {
+            if _caps.neon {
+                return &CPU_OPS_NEON;
+            }
+        }
+
+        &CPU_OPS_SCALAR
+    });
+    *SELECTED
+}
+
+/// Lookup `&'static CpuOps` by tier name string. Used for explicit-
+/// tier-pinning (forcing AVX2 when AMX is available, e.g. for
+/// benchmarking) and for debug reporting of which tier the
+/// auto-detection landed on.
+///
+/// Returns `None` for unknown names and for tiers not compiled into
+/// this target arch. No CPUID check is made: the ops are `unsafe fn`
+/// and the caller must ensure the host supports the tier. Known tiers:
+///
+///   x86_64: `"amx_int8"`, `"avx512vnni"`, `"avx512f"`, `"avxvnni"`,
+///   `"avx2_fma"`
+///   aarch64: `"neon"`
+///   universal: `"scalar"`
+///
+/// Future expansion: NEON BF16 / dotprod tiers will land here once
+/// the asm-byte arms ship (Phase 3b in the matrix doc).
+pub fn cpu_ops_for_tier(name: &str) -> Option<&'static CpuOps> {
+    match name {
+        #[cfg(target_arch = "x86_64")]
+        "amx_int8" => Some(&CPU_OPS_AMX_INT8),
+        #[cfg(target_arch = "x86_64")]
+        "avx512vnni" => Some(&CPU_OPS_AVX512_VNNI),
+        #[cfg(target_arch = "x86_64")]
+        "avx512f" => Some(&CPU_OPS_AVX512F),
+        #[cfg(target_arch = "x86_64")]
+        "avxvnni" => Some(&CPU_OPS_AVXVNNI),
+        #[cfg(target_arch = "x86_64")]
+        "avx2_fma" => Some(&CPU_OPS_AVX2_FMA),
+        #[cfg(target_arch = "aarch64")]
+        "neon" => Some(&CPU_OPS_NEON),
+        "scalar" => Some(&CPU_OPS_SCALAR),
+        _ => None,
+    }
+}
+
+/// Lookup by GCC CPU codename (e.g. `"sapphirerapids"`,
+/// `"neoverse-v2"`, `"apple-m2"`). Maps the canonical GCC name to the
+/// dispatch tier the CPU lands in, sourced from the scrape recorded
+/// in the matrix doc § M.
+///
+/// Used for "what would this CPU pick?" introspection without
+/// touching CPUID on the running host — e.g. cross-compilation
+/// reports, deployment-planning tools, integration tests that want
+/// to assert tier selection for a named target without running on
+/// that silicon.
+///
+/// Returns `None` for unknown CPU names, and for names whose tier is
+/// not compiled into this build (e.g. `"apple-m2"` → `"neon"` on an
+/// x86_64 build). Like [`cpu_ops_for_tier`], no CPUID check is made;
+/// the caller must not invoke the ops on a host lacking the tier.
+pub fn cpu_ops_for_cpu(name: &str) -> Option<&'static CpuOps> {
+    cpu_ops_for_tier(cpu_to_tier(name)?)
+}
+
+/// Maps a GCC CPU codename to the [`CpuOps`] tier it lands in. Data from
+/// `.claude/knowledge/agnostic-surface-cpu-matrix.md` § M (aarch64) plus GCC's i386 cpu definitions.
+/// NOTE(review): `cpu_ops()` gates "avxvnni" on `avxvnniint8` — confirm the alderlake-class names.
+fn cpu_to_tier(cpu: &str) -> Option<&'static str> {
+    Some(match cpu {
+        // x86_64 — AMX-INT8 hosts
+        "sapphirerapids" | "graniterapids" | "graniterapids-d" | "emeraldrapids" => "amx_int8",
+
+        // x86_64 — AVX-512 + VNNI (no AMX)
+        "cascadelake" | "cooperlake" | "icelake-client" | "icelake-server" | "tigerlake" | "rocketlake" | "znver4"
+        | "znver5" => "avx512vnni",
+
+        // x86_64 — AVX-512F only (no VNNI)
+        "skylake-avx512" => "avx512f",
+
+        // x86_64 — AVX-VNNI (no AVX-512)
+        "alderlake" | "raptorlake" | "meteorlake" | "arrowlake" | "arrowlake-s" | "lunarlake" | "pantherlake"
+        | "sierraforest" | "grandridge" => "avxvnni",
+
+        // x86_64 — plain AVX2 + FMA (Haswell baseline, Zen 1-3)
+        "haswell" | "broadwell" | "skylake" | "kaby-lake" | "comet-lake" | "znver1" | "znver2" | "znver3" => "avx2_fma",
+
+        // aarch64 — V8.6+/V9 (would map to neon_bf16 once that tier
+        // lands; today both V8.6+ and the V8.2 baseline land at the
+        // generic NEON tier).
+        "apple-m1" | "apple-m2" | "apple-m3" | "apple-m4" | "oryon-1" | "ampere1" | "ampere1a" | "ampere1b"
+        | "cortex-a76" | "cortex-a78" | "cortex-a510" | "cortex-a520" | "cortex-a710" | "cortex-a715"
+        | "cortex-a720" | "cortex-a725" | "cortex-x1" | "cortex-x2" | "cortex-x3" | "cortex-x4" | "cortex-x925"
+        | "neoverse-n1" | "neoverse-n2" | "neoverse-n3" | "neoverse-v1" | "neoverse-v2" | "neoverse-v3" | "grace"
+        | "cortex-a72" | "cortex-a53" => "neon",
+
+        // Unknown CPU
+        _ => return None,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn cpu_ops_resolves_on_this_host() {
+        let ops = cpu_ops();
+        // Tier name is non-empty.
+        assert!(!ops.tier.is_empty(), "tier name is empty");
+        // Arch is one of the known classes.
+        assert!(matches!(ops.arch, "x86_64" | "aarch64" | "scalar"));
+        eprintln!("cpu_ops() resolved tier={} arch={}", ops.tier, ops.arch);
+    }
+
+    #[test]
+    fn cpu_ops_stable_across_calls() {
+        let a = cpu_ops() as *const _ as usize;
+        let b = cpu_ops() as *const _ as usize;
+        assert_eq!(a, b, "cpu_ops() returned different pointers across calls");
+    }
+
+    #[test]
+    fn cpu_ops_for_tier_known_names() {
+        assert!(cpu_ops_for_tier("scalar").is_some());
+        #[cfg(target_arch = "x86_64")]
+        {
+            assert!(cpu_ops_for_tier("avx2_fma").is_some());
+            assert!(cpu_ops_for_tier("amx_int8").is_some());
+            assert!(cpu_ops_for_tier("avx512vnni").is_some());
+        }
+        #[cfg(target_arch = "aarch64")]
+        {
+            assert!(cpu_ops_for_tier("neon").is_some());
+        }
+        assert!(cpu_ops_for_tier("nonsense_tier").is_none());
+    }
+
+    #[test]
+    fn cpu_ops_for_cpu_data_driven_lookup() {
+        // Spot-check the GCC-scraped mapping (matrix doc § M).
+        assert_eq!(cpu_to_tier("sapphirerapids"), Some("amx_int8"));
+        assert_eq!(cpu_to_tier("graniterapids"), Some("amx_int8"));
+        assert_eq!(cpu_to_tier("cascadelake"), Some("avx512vnni"));
+        assert_eq!(cpu_to_tier("znver4"), Some("avx512vnni"));
+        assert_eq!(cpu_to_tier("znver5"), Some("avx512vnni"));
+        assert_eq!(cpu_to_tier("alderlake"), Some("avxvnni"));
+        assert_eq!(cpu_to_tier("arrowlake"), Some("avxvnni"));
+        assert_eq!(cpu_to_tier("haswell"), Some("avx2_fma"));
+        assert_eq!(cpu_to_tier("znver3"), Some("avx2_fma"));
+
+        assert_eq!(cpu_to_tier("apple-m2"), Some("neon"));
+        assert_eq!(cpu_to_tier("neoverse-v2"), Some("neon"));
+        assert_eq!(cpu_to_tier("oryon-1"), Some("neon"));
+        assert_eq!(cpu_to_tier("grace"), Some("neon"));
+
+        assert_eq!(cpu_to_tier("totally-fake-cpu"), None);
+    }
+
+    #[test]
+    fn cpu_ops_call_through_dto() {
+        // The whole point: call ops through the DTO and verify it
+        // produces correct results. Exercises the indirect-call
+        // through the fn ptr without going through the per-op LazyLock.
+        let ops = cpu_ops();
+        let a: Vec<u8> = (0..100).map(|i| (i % 256) as u8).collect();
+        let b: Vec<i8> = (0..100).map(|i| ((i * 3) % 256) as u8 as i8).collect();
+        let got = unsafe { (ops.vnni_dot_u8_i8)(&a, &b) };
+        let expected: i32 = (0..100).map(|i| a[i] as i32 * b[i] as i32).sum();
+        assert_eq!(got, expected, "vnni_dot via CpuOps DTO mismatch");
+
+        let mut acc = vec![1.0f32; 32];
+        let xa = vec![2.0f32; 32];
+        let xb = vec![3.0f32; 32];
+        unsafe { (ops.add_mul_f32)(&mut acc, &xa, &xb) };
+        for &v in &acc {
+            assert!((v - 7.0).abs() < 1e-6, "add_mul_f32 via DTO: got {v}");
+        }
+    }
+}
diff --git a/src/simd_runtime/mod.rs b/src/simd_runtime/mod.rs
index 428020d3..666c1277 100644
--- a/src/simd_runtime/mod.rs
+++ b/src/simd_runtime/mod.rs
@@ -84,6 +84,7 @@ compile_error!(
 
 pub mod add_mul;
 pub mod casts;
+pub mod cpu_ops;
 pub mod matmul;
 pub mod vnni_dot;
 
@@ -91,5 +92,6 @@ pub mod vnni_dot;
 // consumers can `use crate::simd_runtime::*` and get every op flat.
 pub use add_mul::{add_mul_f32, add_mul_f64};
 pub use casts::{bf16_to_f32_batch, cast_f16_to_f32_batch, cast_f32_to_f16_batch, f32_to_bf16_batch_rne};
+pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, CpuOps};
 pub use matmul::{gemm_u8_i8, matmul_bf16_to_f32, matmul_f32, matmul_i8_to_i32};
 pub use vnni_dot::vnni_dot_u8_i8;
diff --git a/src/simd_runtime/vnni_dot.rs b/src/simd_runtime/vnni_dot.rs
index 4abafede..4e9ba980 100644
--- a/src/simd_runtime/vnni_dot.rs
+++ b/src/simd_runtime/vnni_dot.rs
@@ -125,6 +125,28 @@ unsafe fn vnni_dot_u8_i8_scalar_safe_wrapper(a: &[u8], b: &[i8]) -> i32 {
     crate::simd_amx::vnni_dot_u8_i8_scalar(a, b)
 }
 
+// ────────────────────────────────────────────────────────────────────────
+// CpuOps DTO entry points — same fn ptrs as above but exposed
+// pub(super) so cpu_ops.rs can reference them by name in static decls.
+// ────────────────────────────────────────────────────────────────────────
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn vnni_dot_u8_i8_avx512_with_tail_safe(a: &[u8], b: &[i8]) -> i32 {
+    // SAFETY: forwarded to the caller — the host CPU must support avx512f +
+    // avx512vnni; tables from cpu_ops_for_tier() are handed out unchecked.
+    vnni_dot_u8_i8_avx512_with_tail(a, b)
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(super) unsafe fn vnni2_dot_u8_i8_safe(a: &[u8], b: &[i8]) -> i32 {
+    // SAFETY: forwarded to the caller — the host CPU must support avx2 + avxvnniint8.
+    vnni2_dot_u8_i8_safe_wrapper(a, b)
+}
+
+pub(super) unsafe fn vnni_dot_u8_i8_scalar_wrapper(a: &[u8], b: &[i8]) -> i32 {
+    vnni_dot_u8_i8_scalar_safe_wrapper(a, b)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;