Skip to content

Commit 7620898

Browse files
committed
feat(simd_caps): CPUID 7,1 + AMX-FP16/AVX512-FP16/VP2INTERSECT bits + AMX OS-gate in cpu_ops
Salvages the detection-only subset of closed PR #190 — three real gaps in the substrate runtime dispatch without inheriting any of PR #190's consumer-facing additions (no SimdProfile enum, no public dispatch-identity API, no cpu-* features). What lands here: 1) CPUID leaf 7,1 read for AMX-FP16 (CPUID.07H.1H:EAX bit 21). Lives on a different subleaf than the existing AMX bits; GraniteRapids is the only silicon advertising it today. Guarded by leaf 7,0 EAX >= 1 so older CPUs that don't expose subleaf 1 stay correct. 2) Three new SimdCaps fields (additive, all default false on non-x86): - avx512fp16 — CPUID.07H.0H:EDX bit 23 — `__m512h` math. Discriminates SPR-class from CascadeLake/ IceLakeSp/SkylakeX for any future FP16 kernel. - avx512vp2intersect — CPUID.07H.0H:EDX bit 8 — TigerLake mobile only; absent from Ice Lake-SP and every later server part. Exposed for completeness. - amx_fp16 — CPUID.07H.1H:EAX bit 21 — Granite Rapids. Plus convenience methods has_avx512_fp16() and has_amx_fp16() (the latter defense-in-depths the amx_tile bit). 3) AMX OS-state gate in cpu_ops() selection. The CPU-reports-AMX path now AND-gates on `simd_amx::amx_available()` which runs the full four-step check (CPUID + OSXSAVE + XCR0[17,18] + arch_prctl(XCOMP_PERM, 18) on Linux 5.19+). This closes the SIGILL hole when a hypervisor masks XCR0 or the OS hasn't honoured the prctl: previously cpu_ops() would route to CPU_OPS_AMX_INT8 and AMX instructions would SIGILL despite the CPUID bit. Now it demotes to CPU_OPS_AVX512_VNNI cleanly. What's deliberately NOT here (rejected from PR #190): - No `SimdProfile` enum — would expose dispatch identity to consumer code and invite `match profile { ... }` arms that defeat the polyfill contract. - No `cpu-*` cargo features — build-time silicon pinning that defeats polyfill at an earlier binding time. - No `simd_profile_probe` example — diagnostic-only, rebuilds the SimdProfile surface this PR doesn't bring. - No public dispatch-identity API at any layer. The new bits are internal substrate detection; consumers continue to use `crate::simd::*` polyfilled types and `crate::simd_runtime::*` per-op trampolines. The new fields slot into existing `cpu_ops()` selection by extension (e.g. a future AMX-FP16 tier would AND-gate on `caps.amx_fp16 && simd_amx::amx_available()` between the AMX-INT8 and AVX-512-VNNI arms). No selection logic uses them yet — they're laying the runway, not consuming it. Tests: - 4 new simd_caps tests: cpuid_extended_bits_smoke, has_amx_fp16_requires_amx_tile, x86_extended_bits_are_false_on_non_x86, plus extended determinism coverage. - All 6 existing cpu_ops tests still pass; the AMX OS-gate change passes through transparently on hosts where amx_available() agrees with CPUID (the typical case). - fmt + clippy clean on `--features runtime-dispatch`.
1 parent c10e1e0 commit 7620898

2 files changed

Lines changed: 133 additions & 1 deletion

File tree

src/hpc/simd_caps.rs

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,24 @@ pub struct SimdCaps {
7171
/// (`is_x86_feature_detected!("avxvnniint8")`).
7272
/// Present on Arrow Lake, Lunar Lake, NUC 14 (Meteor Lake-H).
7373
pub avxvnniint8: bool,
74+
/// AVX-512 FP16 arithmetic (CPUID.07H.0H:EDX bit 23). Native
75+
/// `__m512h` operations (`_mm512_*_ph`). Present on Sapphire Rapids,
76+
/// Granite Rapids, Zen 4+. Bit is exposed for downstream substrate
77+
/// kernels and dispatch ladders; no consumer-facing dispatch axis
78+
/// is built on top of it.
79+
pub avx512fp16: bool,
80+
/// AVX-512 VP2INTERSECT (CPUID.07H.0H:EDX bit 8). Present only on
81+
/// Tiger Lake mobile silicon; absent from Ice Lake-SP and every
82+
/// later server part. Useful for future intersection-heavy
83+
/// primitives (set ops on bitmaps); exposed for completeness.
84+
pub avx512vp2intersect: bool,
85+
/// AMX-FP16 (CPUID.07H.1H:EAX bit 21). `TDPFP16PS` FP16 tile dot
86+
/// product, present on Granite Rapids only. Lives at CPUID leaf
87+
/// 7,1 (subleaf 1), not leaf 7,0 — separate `__cpuid_count(7, 1)`
88+
/// call required. The leaf 7,1 read is gated on leaf 7,0's EAX
89+
/// max-subleaf field being ≥ 1; on older silicon that field is 0
90+
/// and we never query leaf 7,1.
91+
pub amx_fp16: bool,
7492

7593
// ── aarch64 (ARM) ──
7694
/// NEON 128-bit SIMD (mandatory on aarch64, always true).
@@ -124,6 +142,9 @@ impl SimdCaps {
124142
amx_bf16: false,
125143
avx512bf16: false,
126144
avxvnniint8: false,
145+
avx512fp16: false,
146+
avx512vp2intersect: false,
147+
amx_fp16: false,
127148
neon: false,
128149
asimd_dotprod: false,
129150
fp16: false,
@@ -143,6 +164,18 @@ impl SimdCaps {
143164
let amx_tile = (cpuid7.edx >> 24) & 1 == 1;
144165
let amx_int8 = (cpuid7.edx >> 25) & 1 == 1;
145166
let amx_bf16 = (cpuid7.edx >> 22) & 1 == 1;
167+
let avx512fp16 = (cpuid7.edx >> 23) & 1 == 1;
168+
let avx512vp2intersect = (cpuid7.edx >> 8) & 1 == 1;
169+
170+
// Leaf 7,1 EAX bit 21 = AMX-FP16. Leaf 7,1 only exists when
171+
// leaf 7,0 EAX (max-subleaf) is at least 1; on older silicon
172+
// this returns 0 and the answer is correctly false.
173+
let amx_fp16 = if cpuid7.eax >= 1 {
174+
let cpuid7_1 = core::arch::x86_64::__cpuid_count(7, 1);
175+
(cpuid7_1.eax >> 21) & 1 == 1
176+
} else {
177+
false
178+
};
146179

147180
Self {
148181
avx2: is_x86_feature_detected!("avx2"),
@@ -160,6 +193,9 @@ impl SimdCaps {
160193
amx_bf16,
161194
avx512bf16: is_x86_feature_detected!("avx512bf16"),
162195
avxvnniint8: is_x86_feature_detected!("avxvnniint8"),
196+
avx512fp16,
197+
avx512vp2intersect,
198+
amx_fp16,
163199
// ARM fields: all false on x86
164200
neon: false,
165201
asimd_dotprod: false,
@@ -192,6 +228,9 @@ impl SimdCaps {
192228
amx_bf16: false,
193229
avx512bf16: false,
194230
avxvnniint8: false,
231+
avx512fp16: false,
232+
avx512vp2intersect: false,
233+
amx_fp16: false,
195234
// ARM fields: runtime detection
196235
neon: true, // mandatory on aarch64
197236
asimd_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"),
@@ -221,6 +260,9 @@ impl SimdCaps {
221260
amx_bf16: false,
222261
avx512bf16: false,
223262
avxvnniint8: false,
263+
avx512fp16: false,
264+
avx512vp2intersect: false,
265+
amx_fp16: false,
224266
neon: false,
225267
asimd_dotprod: false,
226268
fp16: false,
@@ -275,6 +317,23 @@ impl SimdCaps {
275317
self.avxvnniint8
276318
}
277319

320+
/// True if AVX-512 FP16 (`__m512h`) is available. Distinguishes
321+
/// SapphireRapids-class silicon (and Zen 4+) from the CascadeLake /
322+
/// IceLakeSp / SkylakeX baseline that lacks native `__m512h` math.
323+
#[inline(always)]
324+
pub fn has_avx512_fp16(self) -> bool {
325+
self.avx512fp16
326+
}
327+
328+
/// True if AMX-FP16 (`TDPFP16PS`) is available. Only Granite Rapids
329+
/// advertises this bit. Requires both the CPUID 7,1 bit AND
330+
/// AMX-TILE (defense-in-depth: a CPU advertising AMX-FP16 without
331+
/// AMX-TILE is contradictory but the check stays cheap).
332+
#[inline(always)]
333+
pub fn has_amx_fp16(self) -> bool {
334+
self.amx_fp16 && self.amx_tile
335+
}
336+
278337
// ── ARM convenience methods ──
279338

280339
/// True if running on aarch64 with NEON (always true on aarch64).
@@ -511,4 +570,69 @@ mod tests {
511570
#[cfg(target_arch = "aarch64")]
512571
assert_ne!(profile, ArmProfile::NotArm);
513572
}
573+
574+
/// New CPUID 7,0 EDX bits and the CPUID 7,1 leaf read must surface
575+
/// without crashing on every host. Field values are host-dependent;
576+
/// we just exercise the readers and the convenience methods.
577+
#[test]
578+
fn cpuid_extended_bits_smoke() {
579+
let caps = simd_caps();
580+
let _ = caps.avx512fp16;
581+
let _ = caps.avx512vp2intersect;
582+
let _ = caps.amx_fp16;
583+
let _ = caps.has_avx512_fp16();
584+
let _ = caps.has_amx_fp16();
585+
}
586+
587+
/// `has_amx_fp16()` defense-in-depth: even if `amx_fp16` were
588+
/// spuriously true without `amx_tile`, the convenience method must
589+
/// require both. Matches the pattern used by `has_amx_bf16` in
590+
/// `simd_amx::amx_available()`.
591+
#[test]
592+
fn has_amx_fp16_requires_amx_tile() {
593+
let synthetic = SimdCaps {
594+
avx2: false,
595+
avx512f: false,
596+
avx512bw: false,
597+
avx512vl: false,
598+
avx512vpopcntdq: false,
599+
sse41: false,
600+
sse2: false,
601+
fma: false,
602+
avx512vnni: false,
603+
avx512vbmi: false,
604+
amx_tile: false,
605+
amx_int8: false,
606+
amx_bf16: false,
607+
avx512bf16: false,
608+
avxvnniint8: false,
609+
avx512fp16: false,
610+
avx512vp2intersect: false,
611+
amx_fp16: true,
612+
neon: false,
613+
asimd_dotprod: false,
614+
fp16: false,
615+
aes: false,
616+
sha2: false,
617+
crc32: false,
618+
};
619+
assert!(
620+
!synthetic.has_amx_fp16(),
621+
"amx_fp16 without amx_tile must report false"
622+
);
623+
}
624+
625+
/// On non-x86 builds the x86 capability bits MUST all read false —
626+
/// the platform-specific zero-defaults must not regress when new
627+
/// fields are added to `SimdCaps`.
628+
#[cfg(not(target_arch = "x86_64"))]
629+
#[test]
630+
fn x86_extended_bits_are_false_on_non_x86() {
631+
let caps = simd_caps();
632+
assert!(!caps.avx512fp16);
633+
assert!(!caps.avx512vp2intersect);
634+
assert!(!caps.amx_fp16);
635+
assert!(!caps.has_avx512_fp16());
636+
assert!(!caps.has_amx_fp16());
637+
}
514638
}

src/simd_runtime/cpu_ops.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,15 @@ pub fn cpu_ops() -> &'static CpuOps {
180180

181181
#[cfg(target_arch = "x86_64")]
182182
{
183-
if _caps.amx_int8 {
183+
// AMX tier selection: CPUID-reports-AMX is necessary but
184+
// not sufficient. A hypervisor may mask XCR0 bits 17/18
185+
// (the tile XSAVE state) or the OS may not have honoured
186+
// `arch_prctl(XCOMP_PERM, 18)` on Linux 5.19+. In either
187+
// case AMX instructions SIGILL despite the CPUID bit
188+
// being set. `simd_amx::amx_available()` runs the full
189+
// four-step gate (CPUID + OSXSAVE + XCR0 + arch_prctl);
190+
// demote to the AVX-512 path when the OS-check fails.
191+
if _caps.amx_int8 && crate::simd_amx::amx_available() {
184192
return &CPU_OPS_AMX_INT8;
185193
}
186194
if _caps.avx512f && _caps.avx512vnni {

0 commit comments

Comments
 (0)