Skip to content

Commit a18366a

Browse files
authored
Merge pull request #172 from AdaWorldAPI/claude/pr-x-phase1-config-flip
feat(simd): Phase 1 — explicit cargo configs + AVX2 dispatch hardening
2 parents 8d3c1d7 + e3ad707 commit a18366a

4 files changed

Lines changed: 76 additions & 7 deletions

File tree

.cargo/config-avx512.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[build]
2+
# Explicit AVX-512 config — `x86-64-v4`. Use with:
3+
# cargo --config .cargo/config-avx512.toml build
4+
# cargo --config .cargo/config-avx512.toml test
5+
#
6+
# Compiles `target_feature = "avx512f"` on, so `src/simd.rs` selects the
7+
# `simd_avx512` backend with native `__m512` / `__m512d` / `__m512i`
8+
# storage. Required for the Sapphire Rapids / Granite Rapids hot paths
9+
# (`f32_to_bf16_batch_rne`, the AVX-512BF16 BF16 lanes, the AMX tiles).
10+
#
11+
# The binary produced here will SIGILL on AVX2-only silicon — only use on
12+
# hosts that report `avx512f` in `/proc/cpuinfo`. For shipping a single
13+
# release artifact that adapts at process start, see the LazyLock runtime
14+
# dispatch path in § 7.1 of the architecture doc instead.
15+
[target.'cfg(target_arch = "x86_64")']
16+
rustflags = ["-Ctarget-cpu=x86-64-v4"]

.cargo/config-native.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[build]
2+
# Native build config — `target-cpu = "native"`. Use with:
3+
# cargo --config .cargo/config-native.toml build
4+
# cargo --config .cargo/config-native.toml test
5+
#
6+
# rustc resolves the build host's CPUID at invocation and enables every
7+
# `target_feature` the host CPU advertises. `simd.rs` then picks the
8+
# matching backend (typically `simd_avx512` on modern dev machines).
9+
#
10+
# Produces a binary tuned for the developer's exact silicon. The result
11+
# is NOT portable: do not distribute artifacts built with this config.
12+
[target.'cfg(target_arch = "x86_64")']
13+
rustflags = ["-Ctarget-cpu=native"]

.cargo/config.toml

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,26 @@
11
[build]
2-
# No global target-cpu. Each kernel uses #[target_feature(enable = "avx512f")]
3-
# per-function, with LazyLock runtime detection. One binary, all ISAs.
4-
# Railway (AVX-512) and GitHub CI (AVX2) use the same binary.
2+
# Default cargo config — x86-64-v3 (AVX2) baseline. Portable across all
3+
# x86_64 silicon shipping since ~2013 (Haswell+). This is what GitHub CI
4+
# runs against and what `cargo build` produces for general distribution.
5+
#
6+
# Why v3 and not "no target-cpu":
7+
# `src/simd_avx2.rs` composes `F32x16` as two `__m256` halves (AVX
8+
# intrinsics), and the `simd_avx2_*` op funcs use `__m256i` (AVX2).
9+
# Without a global v3 baseline, rustc compiles to x86-64 generic (SSE2)
10+
# and those intrinsics emit instructions the CPU cannot execute →
11+
# SIGILL at run time, exactly the PR #170 CI failure mode.
12+
#
13+
# AVX-512 builds: use `--config .cargo/config-avx512.toml` (or
14+
# `RUSTFLAGS='-Ctarget-cpu=x86-64-v4'`). The simd.rs dispatch
15+
# arms key off `target_feature = "avx512f"`; under v4 they pick the
16+
# `simd_avx512` backend (native `__m512` / `__m512d` / `__m512i`).
17+
#
18+
# Build-machine-tuned binaries: use `--config .cargo/config-native.toml`
19+
# (`target-cpu = "native"`); rustc resolves the host CPUID at compile.
20+
#
21+
# Runtime LazyLock dispatch (one release binary, heterogeneous deployment
22+
# silicon) is a fifth opt-in mode — see § 7.1 of
23+
# .claude/knowledge/simd-dispatch-architecture.md. Reserved for the
24+
# release-binary distribution path; never the dev / CI default.
25+
[target.'cfg(target_arch = "x86_64")']
26+
rustflags = ["-Ctarget-cpu=x86-64-v3"]

src/simd.rs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,17 @@ pub const PREFERRED_I16_LANES: usize = 16;
198198
// x86_64: re-export based on tier
199199
// ============================================================================
200200

201-
// Compile-time AVX-512 dispatch via target_feature.
202-
// With target-cpu=x86-64-v4 (.cargo/config.toml), avx512f is enabled
203-
// at compile time → all types use native __m512/__m512d/__m512i.
204-
// The 256-bit types (F32x8, F64x4) also live in simd_avx512 (__m256).
201+
// Compile-time SIMD dispatch via target_feature. The cargo config
202+
// chosen at build (.cargo/config.toml = v3 default / config-avx512.toml
203+
// = v4 / config-native.toml = native) sets the `target_feature` flags
204+
// that select exactly one arm below.
205+
// * v3 / GitHub-CI default → `target_feature = "avx2"` only →
206+
// simd_avx2 backend (F32x16 = two-half (f32x8, f32x8), int wrappers
207+
// are scalar polyfills via the `avx2_int_type!` macro).
208+
// * v4 (or native on AVX-512 host) → `target_feature = "avx512f"` →
209+
// simd_avx512 backend with native __m512 / __m512d / __m512i.
210+
// * aarch64 → simd_neon backend.
211+
// * everything else (wasm32, riscv, etc.) → scalar fallback.
205212

206213
// Note on the `nightly-simd` feature: it adds the `crate::simd_nightly`
207214
// module (a portable-simd backend wrapping `core::simd`) but does NOT
@@ -272,6 +279,17 @@ pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne};
272279
#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
273280
pub use crate::simd_avx512::{BF16x16, BF16x8};
274281

282+
// AVX2 baseline arm — selected by the `x86-64-v3` cargo default. The
283+
// predicate is `not(avx512f)` rather than `avx2 + not(avx512f)`: the
284+
// inner intrinsics in `simd_avx2.rs` use per-function `#[target_feature
285+
// (enable = "avx,avx2,fma")]` annotations, so the OPERATIONS gate
286+
// themselves at the symbol level even when the consumer build target
287+
// is x86-64 baseline. The struct-field types (`__m256` / `__m256i`)
288+
// are core::arch declarations and don't require AVX/AVX2 at the type
289+
// level — only execution does. Keeps GitHub CI green (it runs with
290+
// `RUSTFLAGS="-D warnings"` env, which overrides our v3 config.toml,
291+
// landing on x86-64 baseline → the previous tighter `avx2` predicate
292+
// left no matching arm).
275293
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
276294
pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32};
277295

0 commit comments

Comments
 (0)