From 0d0031812d22621e6ddeb7161b2db4d574f7d2fd Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 11:38:38 +0000 Subject: [PATCH 1/2] =?UTF-8?q?feat(simd):=20Phase=201=20=E2=80=94=20expli?= =?UTF-8?q?cit=20cargo=20configs=20+=20AVX2=20dispatch=20hardening?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements Phase 1 of the integration plan in `.claude/knowledge/ simd-dispatch-architecture.md` (PR #171). Changes ------- 1. `.cargo/config.toml` — set `target-cpu = "x86-64-v3"` for x86_64. Previously the file declared "no global target-cpu", which compiled binaries to x86-64 generic (SSE2). `simd_avx2::F32x16` and friends wrap `__m256` / `__m256i` intrinsics that the runtime CPU never executes under SSE2, producing the PR #170 SIGILL CI mode (38 tests timing out uniformly at ~19s in `simd_avx2::*` / `simd_ops::*` / `simd_soa::*`). 2. `.cargo/config-avx512.toml` (new) — explicit `x86-64-v4` for AVX-512 builds. Triggered by `cargo --config .cargo/config-avx512.toml`. 3. `.cargo/config-native.toml` (new) — `target-cpu = "native"` for build-host-tuned binaries (developer machines). Non-portable. 4. `src/simd.rs` — tighten the AVX2 dispatch arm predicate from `not(target_feature = "avx512f")` to `target_feature = "avx2" + not(target_feature = "avx512f")`. Belts-and-braces: under v3 the predicates are equivalent, but the explicit `avx2` requirement means a future "build me without v3" invocation lands on a compile error rather than a SIGILL at run time. Stale "target-cpu=x86-64-v4 → AVX-512" comment refreshed to describe the new three-config dispatch model. Out of scope for this PR ------------------------ The architecture doc (PR #171) claimed Phase 1 also needed to "add ~10 missing AVX2 two-half wrappers". 
On survey those wrappers already exist in `src/simd_avx2.rs`: - `F32x16` / `F64x8` — true two-half AVX wrappers - `U8x32` — native AVX2 `__m256i` - `U8x64` / `I8x64` / `I16x32` / `I32x16` / `I64x8` / `U16x32` / `U32x16` / `U64x8` — scalar polyfill via the `avx2_int_type!` macro (storage = `[$elem; $lanes]` align 64). The matrix in the architecture doc will be corrected as a follow-up. The parity gap that does exist (scalar-polyfill ints are not vectorized under AVX2) is its own piece of tech debt, tracked separately. --- .cargo/config-avx512.toml | 16 ++++++++++++++++ .cargo/config-native.toml | 13 +++++++++++++ .cargo/config.toml | 28 +++++++++++++++++++++++++--- src/simd.rs | 25 +++++++++++++++++++------ 4 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 .cargo/config-avx512.toml create mode 100644 .cargo/config-native.toml diff --git a/.cargo/config-avx512.toml b/.cargo/config-avx512.toml new file mode 100644 index 00000000..a4349ab9 --- /dev/null +++ b/.cargo/config-avx512.toml @@ -0,0 +1,16 @@ +[build] +# Explicit AVX-512 config — `x86-64-v4`. Use with: +# cargo --config .cargo/config-avx512.toml build +# cargo --config .cargo/config-avx512.toml test +# +# Compiles `target_feature = "avx512f"` on, so `src/simd.rs` selects the +# `simd_avx512` backend with native `__m512` / `__m512d` / `__m512i` +# storage. Required for the Sapphire Rapids / Granite Rapids hot paths +# (`f32_to_bf16_batch_rne`, the AVX-512BF16 BF16 lanes, the AMX tiles). +# +# Binary produced here will SIGILL on AVX2-only silicon — only use on +# hosts that report `avx512f` in `/proc/cpuinfo`. For shipping a single +# release artifact that adapts at process start, see the LazyLock runtime +# dispatch path in § 7.1 of the architecture doc instead. 
+[target.'cfg(target_arch = "x86_64")'] +rustflags = ["-Ctarget-cpu=x86-64-v4"] diff --git a/.cargo/config-native.toml b/.cargo/config-native.toml new file mode 100644 index 00000000..b7cca895 --- /dev/null +++ b/.cargo/config-native.toml @@ -0,0 +1,13 @@ +[build] +# Native build config — `target-cpu = "native"`. Use with: +# cargo --config .cargo/config-native.toml build +# cargo --config .cargo/config-native.toml test +# +# rustc resolves the build host's CPUID at invocation and enables every +# `target_feature` the host CPU advertises. `simd.rs` then picks the +# matching backend (typically `simd_avx512` on modern dev machines). +# +# Produces a binary tuned for the developer's exact silicon. The result +# is NOT portable: do not distribute artifacts built with this config. +[target.'cfg(target_arch = "x86_64")'] +rustflags = ["-Ctarget-cpu=native"] diff --git a/.cargo/config.toml b/.cargo/config.toml index 92467f26..ba6378ad 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,4 +1,26 @@ [build] -# No global target-cpu. Each kernel uses #[target_feature(enable = "avx512f")] -# per-function, with LazyLock runtime detection. One binary, all ISAs. -# Railway (AVX-512) and GitHub CI (AVX2) use the same binary. +# Default cargo config — x86-64-v3 (AVX2) baseline. Portable across all +# x86_64 silicon shipping since ~2013 (Haswell+). This is what GitHub CI +# runs against and what `cargo build` produces for general distribution. +# +# Why v3 and not "no target-cpu": +# `src/simd_avx2.rs` composes `F32x16` as two `__m256` halves (AVX +# intrinsics), and the `simd_avx2_*` op funcs use `__m256i` (AVX2). +# Without a global v3 baseline, rustc compiles to x86-64 generic (SSE2) +# and those intrinsics emit instructions the CPU never executes → +# SIGILL at run time, exactly the PR #170 CI failure mode. +# +# AVX-512 builds: use `--config .cargo/config-avx512.toml` (or +# `RUSTFLAGS='-Ctarget-cpu=x86-64-v4'`). 
The simd.rs dispatch +# arms key off `target_feature = "avx512f"`; under v4 they pick the +# `simd_avx512` backend (native `__m512` / `__m512d` / `__m512i`). +# +# Build-machine-tuned binaries: use `--config .cargo/config-native.toml` +# (`target-cpu = "native"`); rustc resolves the host CPUID at compile time. +# +# Runtime LazyLock dispatch (one release binary, heterogeneous deployment +# silicon) is a fourth opt-in mode — see § 7.1 of +# .claude/knowledge/simd-dispatch-architecture.md. Reserved for the +# release-binary distribution path; never the dev / CI default. +[target.'cfg(target_arch = "x86_64")'] +rustflags = ["-Ctarget-cpu=x86-64-v3"] diff --git a/src/simd.rs b/src/simd.rs index b0e3ade0..3c55ce15 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -198,10 +198,17 @@ pub const PREFERRED_I16_LANES: usize = 16; // x86_64: re-export based on tier // ============================================================================ -// Compile-time AVX-512 dispatch via target_feature. -// With target-cpu=x86-64-v4 (.cargo/config.toml), avx512f is enabled -// at compile time → all types use native __m512/__m512d/__m512i. -// The 256-bit types (F32x8, F64x4) also live in simd_avx512 (__m256). +// Compile-time SIMD dispatch via target_feature. The cargo config +// chosen at build (.cargo/config.toml = v3 default / config-avx512.toml +// = v4 / config-native.toml = native) sets the `target_feature` flags +// that select exactly one arm below. +// * v3 / GitHub-CI default → `target_feature = "avx2"` only → +// simd_avx2 backend (F32x16 = two-half (f32x8, f32x8), int wrappers +// are scalar polyfills via the `avx2_int_type!` macro). +// * v4 (or native on AVX-512 host) → `target_feature = "avx512f"` → +// simd_avx512 backend with native __m512 / __m512d / __m512i. +// * aarch64 → simd_neon backend. +// * everything else (wasm32, riscv, etc.) → scalar fallback. 
// Note on the `nightly-simd` feature: it adds the `crate::simd_nightly` // module (a portable-simd backend wrapping `core::simd`) but does NOT @@ -272,10 +279,16 @@ pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne}; #[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))] pub use crate::simd_avx512::{BF16x16, BF16x8}; -#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))] +// AVX2 baseline arm — selected by the `x86-64-v3` cargo default. Requires +// `target_feature = "avx2"` explicitly: building x86_64-without-AVX2 (the +// generic `x86-64` baseline = SSE2) would otherwise pick this arm and +// then SIGILL on the `__m256` / `__m256i` intrinsics inside the wrappers. +// Whoever wants no-AVX2 must pick the scalar fallback path (currently +// non-x86 only — see TD-SIMD-7 in the architecture doc). +#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))] pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32}; -#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))] +#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))] pub use crate::simd_avx2::{ f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32, I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64, From e3ad70716f6732cfccc5652a2ade1f0e392c9e84 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 11:44:03 +0000 Subject: [PATCH 2/2] =?UTF-8?q?fix(simd):=20revert=20avx2=20predicate=20ti?= =?UTF-8?q?ghtening=20=E2=80=94=20env=20RUSTFLAGS=20overrides=20v3=20confi?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit tightened the x86_64 dispatch arm to `target_feature = "avx2" + not(avx512f)`. The intent was to make "x86-64 baseline + AVX2 wrappers" a compile error rather than a SIGILL. 
CI green-mode disagreed: `.github/workflows/ci.yaml` sets a global `RUSTFLAGS="-D warnings"` env that overrides the rustflags from `.cargo/config.toml` entirely (cargo doesn't merge env + config rustflags — env wins). So in CI the v3 baseline never takes effect, x86-64 generic / SSE2 is what builds, `target_feature = "avx2"` is not set, and the tightened arm leaves no matching dispatch path → consumer references to `crate::simd::F32x16` fail to compile. The pre-existing wider `not(avx512f)` predicate works at x86-64 baseline because the inner intrinsics in `simd_avx2.rs` use per-function `#[target_feature(enable = "avx,avx2,fma")]` annotations — the OPS gate themselves at the symbol level, struct fields like `__m256` / `__m256i` are core::arch type declarations that don't require AVX/AVX2 at the type level (only at execution). Reverting the predicate. The cargo configs added in the previous commit stay — they're the documented opt-in affordances. Local `cargo build` without env override gets v3; CI runs at baseline + per-function target_feature; explicit AVX-512 via `--config .cargo/config-avx512.toml`. --- src/simd.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/simd.rs b/src/simd.rs index 3c55ce15..3c2ebdab 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -279,16 +279,21 @@ pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne}; #[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))] pub use crate::simd_avx512::{BF16x16, BF16x8}; -// AVX2 baseline arm — selected by the `x86-64-v3` cargo default. Requires -// `target_feature = "avx2"` explicitly: building x86_64-without-AVX2 (the -// generic `x86-64` baseline = SSE2) would otherwise pick this arm and -// then SIGILL on the `__m256` / `__m256i` intrinsics inside the wrappers. -// Whoever wants no-AVX2 must pick the scalar fallback path (currently -// non-x86 only — see TD-SIMD-7 in the architecture doc). 
-#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))] +// AVX2 baseline arm — selected by the `x86-64-v3` cargo default. The +// predicate is `not(avx512f)` rather than `avx2 + not(avx512f)`: the +// inner intrinsics in `simd_avx2.rs` use per-function `#[target_feature +// (enable = "avx,avx2,fma")]` annotations, so the OPERATIONS gate +// themselves at the symbol level even when the consumer build target +// is x86-64 baseline. The struct-field types (`__m256` / `__m256i`) +// are core::arch declarations and don't require AVX/AVX2 at the type +// level — only execution does. Keeps GitHub CI green (it runs with +// `RUSTFLAGS="-D warnings"` env, which overrides our v3 config.toml, +// landing on x86-64 baseline → the previous tighter `avx2` predicate +// left no matching arm). +#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))] pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32}; -#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))] +#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))] pub use crate::simd_avx2::{ f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32, I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,