From 2ef97c03977fdd35f4c494871e13145af46dc3f2 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 20 May 2026 16:55:23 +0000 Subject: [PATCH] =?UTF-8?q?feat(simd):=20missing-lanes=20sweep=20=E2=80=94?= =?UTF-8?q?=20U16x16/U32x8/U64x4/I32x8/I64x4=20across=20all=20backends?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #178's matrix audit surfaced five 256-bit int lane types that were either entirely missing or stranded in `simd_nightly` only. Adds them across every backend so `crate::simd::{U16x16, U32x8, U64x4, I32x8, I64x4}` resolves uniformly on v3 / v4 / native / nightly / scalar / aarch64 paths. `src/simd_avx2.rs` + 5× `avx2_int_type!` instantiations producing scalar-storage `[$elem; $lanes]` polyfills (align 64). Same macro pattern as the existing 512-bit polyfills (U8x64, U16x32, …). Native AVX2 `__m256i` upgrades are TD-SIMD-3. + 5× lowercase aliases (`u16x16 = U16x16`, etc.) matching the std::simd convention used by every other lane type in the file. `src/simd_scalar.rs` + 5× `impl_int_type!` instantiations mirroring the AVX2 polyfills above. Consumers on non-x86/non-aarch64 (wasm32, riscv, thumb) reach the same type names through `crate::simd::*`. + Lowercase aliases. `src/simd_avx512.rs` + Re-export of the new types from `simd_avx2` so the v4 dispatch arm in `simd.rs` can surface them without forking the macro into this file. Both files are already gated on `target_arch = "x86_64"`, so the re-export is cheap. Native `__m256i` upgrades here are TD-SIMD-3 (same story as the v3 polyfills). `src/simd_nightly/u_word_types.rs` + `U16x16` wrapper backed by `core::simd::u16x16`. Same API surface as the existing 32-/16-/8-lane wrappers — splat, from_slice, from_array, to_array, copy_to_slice, reduce_{sum,min,max}, simd_min/max, cmpeq_mask, cmpgt_mask, Default. `src/simd_nightly/i_word_types.rs` + `I32x8` and `I64x4` wrappers backed by `core::simd::{i32x8, i64x4}`. Same API surface as siblings; PartialEq via array compare. `src/simd_nightly/mod.rs` + Re-exports for the three new types + lowercase aliases. `src/simd.rs` + All 5 dispatch arms (nightly, v4, v3, aarch64, scalar fallback) updated to surface the new types through `crate::simd::*`. `.claude/knowledge/simd-dispatch-architecture.md` + Parity matrix updated — the five rows previously marked ❌ across most backends now show 🟠 polyfill (v3, v4-via-v3, scalar) / 🔵 (nightly via `core::simd`). Verified: `cargo check` clean under default v3 features and under `-Ctarget-cpu=x86-64-v4` (via `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS` + explicit `--target` so build scripts don't SIGILL on non-AVX-512 runners — same pattern as the tier4-avx512-check job). --- .../knowledge/simd-dispatch-architecture.md | 10 +- src/simd.rs | 32 +++- src/simd_avx2.rs | 26 +++ src/simd_avx512.rs | 8 + src/simd_nightly/i_word_types.rs | 176 +++++++++++++++++- src/simd_nightly/mod.rs | 11 +- src/simd_nightly/u_word_types.rs | 86 ++++++++- src/simd_scalar.rs | 22 +++ 8 files changed, 354 insertions(+), 17 deletions(-) diff --git a/.claude/knowledge/simd-dispatch-architecture.md b/.claude/knowledge/simd-dispatch-architecture.md index 6181932f..df381c84 100644 --- a/.claude/knowledge/simd-dispatch-architecture.md +++ b/.claude/knowledge/simd-dispatch-architecture.md @@ -150,19 +150,19 @@ tracked as TD-SIMD-3.) | `U8x64` | ✅ `__m512i` | 🟠 `[u8; 64]` polyfill | ❌ | 🔵 | ✅ | | `U8x32` | ✅ `__m256i` | ✅ `__m256i` | ❌ | 🔵 | ✅ | | `U16x32` | ✅ `__m512i` | 🟠 `[u16; 32]` polyfill | ❌ | 🔵 | ✅ | -| `U16x16` | ❌ | ❌ | ❌ | ❌ | ❌ | +| `U16x16` | 🟠 (via `simd_avx2`) | 🟠 `[u16; 16]` polyfill | ❌ | 🔵 `core::simd::u16x16` | 🟠 | | `U32x16` | ✅ `__m512i` | 🟠 `[u32; 16]` polyfill | ❌ | 🔵 | ✅ | -| `U32x8` | ❌ | ❌ | ❌ | 🔵 `core::simd::u32x8` | ❌ | +| `U32x8` | 🟠 (via `simd_avx2`) | 🟠 `[u32; 8]` polyfill | ❌ | 🔵 `core::simd::u32x8` | 🟠 | | `U64x8` | ✅ `__m512i` | 🟠 `[u64; 8]` polyfill | ❌ | 🔵 | ✅ | -| `U64x4` | ❌ | ❌ | ❌ | 🔵 `core::simd::u64x4` | ❌ | +| `U64x4` | 🟠 (via `simd_avx2`) | 🟠 `[u64; 4]` polyfill | ❌ | 🔵 `core::simd::u64x4` | 🟠 | | `I8x32` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ | | `I8x64` | ✅ `__m512i` | 🟠 `[i8; 64]` polyfill | ❌ | 🔵 | ✅ | | `I16x16` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ | | `I16x32` | ✅ `__m512i` | 🟠 `[i16; 32]` polyfill | ❌ | 🔵 | ✅ | | `I32x16` | ✅ `__m512i` | 🟠 `[i32; 16]` polyfill | ❌ | 🔵 | ✅ | -| `I32x8` | ❌ | ❌ | ❌ | ❌ | ❌ | +| `I32x8` | 🟠 (via `simd_avx2`) | 🟠 `[i32; 8]` polyfill | ❌ | 🔵 `core::simd::i32x8` | 🟠 | | `I64x8` | ✅ `__m512i` | 🟠 `[i64; 8]` polyfill | ❌ | 🔵 | ✅ | -| `I64x4` | ❌ | ❌ | ❌ | ❌ | ❌ | +| `I64x4` | 🟠 (via `simd_avx2`) | 🟠 `[i64; 4]` polyfill | ❌ | 🔵 `core::simd::i64x4` | 🟠 | | `BF16x8` | ✅ `__m128bh` | ❌ | ❌ | 🔵 | ✅ | | `BF16x16` | ✅ `__m256bh` | ❌ | ❌ | 🔵 | ✅ | | `F16x16` | ❌ | 🟠 `[u16; 16]` polyfill (`simd_half`) | 🟠 `[u16; 16]` polyfill (`simd_half`) | 🔵 | ✅ | diff --git a/src/simd.rs b/src/simd.rs index 673177c5..f96ea9d3 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -220,9 +220,10 @@ pub const PREFERRED_I16_LANES: usize = 16; // as soon as `nightly-simd` is on. #[cfg(feature = "nightly-simd")] pub use crate::simd_nightly::{ - f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u16x32, u32x16, u32x8, u64x4, u64x8, - u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8, - I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64, + f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u16x32, u32x16, + u32x8, u64x4, u64x8, u8x32, u8x64, BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, + F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8, I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, + U64x8, U8x32, U8x64, }; #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))] @@ -234,10 +235,15 @@ pub use crate::simd_avx512::{ i16x16, i16x32, i32x16, + i32x8, + i64x4, i64x8, i8x32, i8x64, + u16x16, u32x16, + u32x8, + u64x4, u64x8, u8x64, F32Mask16, @@ -251,11 +257,18 @@ pub use crate::simd_avx512::{ I16x16, I16x32, I32x16, + // 256-bit int polyfills surfaced 2026-05-20 (re-exported from + // `simd_avx2` via `simd_avx512`'s re-export at line ~2260). + I32x8, + I64x4, I64x8, I8x32, I8x64, + U16x16, U16x32, U32x16, + U32x8, + U64x4, U64x8, U8x64, }; @@ -302,8 +315,9 @@ pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, not(feature = "nightly-simd") ))] pub use crate::simd_avx2::{ - f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32, - I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64, + f32x16, f64x8, i16x32, i32x16, i32x8, i64x4, i64x8, i8x64, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32Mask16, + F32x16, F64Mask8, F64x8, I16x32, I32x16, I32x8, I64x4, I64x8, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, + U8x64, }; // U8x32 — native AVX2 byte width (one __m256i = 32 bytes). Available on @@ -335,7 +349,8 @@ pub(crate) mod scalar; pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8}; #[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))] pub use scalar::{ - f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64, + f32x8, f64x4, i32x16, i32x8, i64x4, i64x8, u16x16, u32x16, u32x8, u64x4, u64x8, u8x64, F32x8, F64x4, I32x16, I32x8, + I64x4, I64x8, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64, }; // Other non-x86 targets (wasm, riscv, etc.): full scalar fallback. @@ -345,8 +360,9 @@ pub use scalar::{ not(feature = "nightly-simd") ))] pub use scalar::{ - f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, - F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64, + f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i32x8, i64x4, i64x8, i8x32, i8x64, u16x16, u32x16, u32x8, + u64x4, u64x8, u8x64, F32Mask16, F32x16, F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I32x8, I64x4, I64x8, + I8x32, I8x64, U16x16, U16x32, U32x16, U32x8, U64x4, U64x8, U8x64, }; // Scalar BF16 conversion — always available on all platforms diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs index 2a3f9ed3..be3e369f 100644 --- a/src/simd_avx2.rs +++ b/src/simd_avx2.rs @@ -1542,6 +1542,19 @@ avx2_int_type!(U16x32, u16, 32, 0u16); avx2_int_type!(U32x16, u32, 16, 0u32); avx2_int_type!(U64x8, u64, 8, 0u64); +// 256-bit int lanes — scalar polyfills filling the gap surfaced by the +// 2026-05-20 matrix audit. None of these had wrappers anywhere except +// for `U32x8` / `U64x4` in `simd_nightly`. Adding `U16x16`, `U32x8`, +// `U64x4`, `I32x8`, `I64x4` here mirrors the existing 512-bit polyfill +// pattern (`[$elem; $lanes]` storage, align 64). Native AVX2 `__m256i` +// upgrades for these are TD-SIMD-3 (the same fold-into-real-SIMD task +// already tracked for the 512-bit polyfills above). +avx2_int_type!(U16x16, u16, 16, 0u16); +avx2_int_type!(U32x8, u32, 8, 0u32); +avx2_int_type!(U64x4, u64, 4, 0u64); +avx2_int_type!(I32x8, i32, 8, 0i32); +avx2_int_type!(I64x4, i64, 4, 0i64); + // Extra methods for U16x32 (widen/narrow, shift, multiply) — AVX2 scalar fallback. impl U16x32 { #[inline(always)] @@ -2266,6 +2279,19 @@ pub type i8x64 = I8x64; #[allow(non_camel_case_types)] pub type i16x32 = I16x32; +// Lowercase aliases for the 256-bit polyfills added in the 2026-05-20 +// missing-lanes sweep. +#[allow(non_camel_case_types)] +pub type u16x16 = U16x16; +#[allow(non_camel_case_types)] +pub type u32x8 = U32x8; +#[allow(non_camel_case_types)] +pub type u64x4 = U64x4; +#[allow(non_camel_case_types)] +pub type i32x8 = I32x8; +#[allow(non_camel_case_types)] +pub type i64x4 = I64x4; + #[cfg(test)] mod tests { use super::*; diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs index 82cb8534..64dcd4d0 100644 --- a/src/simd_avx512.rs +++ b/src/simd_avx512.rs @@ -2256,6 +2256,14 @@ pub type i16x32 = I16x32; #[allow(non_camel_case_types)] pub type i16x16 = I16x16; +// 256-bit int lanes — added 2026-05-20 missing-lanes sweep. These types +// don't have native `__m256i` wrappers in this module yet; re-exported +// from `simd_avx2.rs` (where they live as scalar-storage polyfills via +// the `avx2_int_type!` macro) so the v4 dispatch arm in `simd.rs` can +// surface them through `crate::simd::*` with the same names the v3 arm +// uses. Native AVX2 `__m256i` upgrades for these are TD-SIMD-3. +pub use crate::simd_avx2::{i32x8, i64x4, u16x16, u32x8, u64x4, I32x8, I64x4, U16x16, U32x8, U64x4}; + // ============================================================================ // BF16 conversion wrappers — AVX-512 BF16 hardware instructions // ============================================================================ diff --git a/src/simd_nightly/i_word_types.rs b/src/simd_nightly/i_word_types.rs index d4e1c7ff..ad3694ba 100644 --- a/src/simd_nightly/i_word_types.rs +++ b/src/simd_nightly/i_word_types.rs @@ -3,7 +3,7 @@ use core::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd}; use core::simd::num::SimdInt; -use core::simd::{i16x16, i16x32, i32x16, i64x8}; +use core::simd::{i16x16, i16x32, i32x16, i32x8, i64x4, i64x8}; // ════════════════════════════════════════════════════════════════════ // I16x16 — 16-lane signed 16-bit integer @@ -428,3 +428,177 @@ impl core::fmt::Display for I64x8 { write!(f, "I64x8({:?})", &self.to_array()[..]) } } + +// ════════════════════════════════════════════════════════════════════ +// I32x8 — 8-lane i32 (256-bit, added 2026-05-20 missing-lanes sweep) +// ════════════════════════════════════════════════════════════════════ + +/// 8-lane `i32` SIMD vector backed by `core::simd::i32x8`. +/// +/// API mirrors `simd_avx512::I32x16` at half-width. Miri-executable. +#[derive(Copy, Clone, Debug)] +#[repr(transparent)] +pub struct I32x8(pub i32x8); + +impl I32x8 { + pub const LANES: usize = 8; + + #[inline(always)] + pub fn splat(v: i32) -> Self { + Self(i32x8::splat(v)) + } + + #[inline(always)] + pub fn from_slice(s: &[i32]) -> Self { + assert!(s.len() >= 8, "I32x8::from_slice needs >=8 elements"); + Self(i32x8::from_slice(s)) + } + + #[inline(always)] + pub fn from_array(arr: [i32; 8]) -> Self { + Self(i32x8::from_array(arr)) + } + + #[inline(always)] + pub fn to_array(self) -> [i32; 8] { + self.0.to_array() + } + + #[inline(always)] + pub fn copy_to_slice(self, s: &mut [i32]) { + assert!(s.len() >= 8, "I32x8::copy_to_slice needs >=8 elements"); + self.0.copy_to_slice(s); + } + + #[inline(always)] + pub fn reduce_sum(self) -> i32 { + self.0.reduce_sum() + } + #[inline(always)] + pub fn reduce_min(self) -> i32 { + self.0.reduce_min() + } + #[inline(always)] + pub fn reduce_max(self) -> i32 { + self.0.reduce_max() + } + + #[inline(always)] + pub fn simd_min(self, other: Self) -> Self { + Self(self.0.simd_min(other.0)) + } + #[inline(always)] + pub fn simd_max(self, other: Self) -> Self { + Self(self.0.simd_max(other.0)) + } + + #[inline(always)] + pub fn cmpeq_mask(self, other: Self) -> u8 { + self.0.simd_eq(other.0).to_bitmask() as u8 + } + #[inline(always)] + pub fn cmpgt_mask(self, other: Self) -> u8 { + self.0.simd_gt(other.0).to_bitmask() as u8 + } +} + +impl Default for I32x8 { + #[inline(always)] + fn default() -> Self { + Self::splat(0) + } +} + +impl PartialEq for I32x8 { + #[inline(always)] + fn eq(&self, other: &Self) -> bool { + self.to_array() == other.to_array() + } +} + +// ════════════════════════════════════════════════════════════════════ +// I64x4 — 4-lane i64 (256-bit, added 2026-05-20 missing-lanes sweep) +// ════════════════════════════════════════════════════════════════════ + +/// 4-lane `i64` SIMD vector backed by `core::simd::i64x4`. +/// +/// API mirrors `simd_avx512::I64x8` at half-width. Miri-executable. +#[derive(Copy, Clone, Debug)] +#[repr(transparent)] +pub struct I64x4(pub i64x4); + +impl I64x4 { + pub const LANES: usize = 4; + + #[inline(always)] + pub fn splat(v: i64) -> Self { + Self(i64x4::splat(v)) + } + + #[inline(always)] + pub fn from_slice(s: &[i64]) -> Self { + assert!(s.len() >= 4, "I64x4::from_slice needs >=4 elements"); + Self(i64x4::from_slice(s)) + } + + #[inline(always)] + pub fn from_array(arr: [i64; 4]) -> Self { + Self(i64x4::from_array(arr)) + } + + #[inline(always)] + pub fn to_array(self) -> [i64; 4] { + self.0.to_array() + } + + #[inline(always)] + pub fn copy_to_slice(self, s: &mut [i64]) { + assert!(s.len() >= 4, "I64x4::copy_to_slice needs >=4 elements"); + self.0.copy_to_slice(s); + } + + #[inline(always)] + pub fn reduce_sum(self) -> i64 { + self.0.reduce_sum() + } + #[inline(always)] + pub fn reduce_min(self) -> i64 { + self.0.reduce_min() + } + #[inline(always)] + pub fn reduce_max(self) -> i64 { + self.0.reduce_max() + } + + #[inline(always)] + pub fn simd_min(self, other: Self) -> Self { + Self(self.0.simd_min(other.0)) + } + #[inline(always)] + pub fn simd_max(self, other: Self) -> Self { + Self(self.0.simd_max(other.0)) + } + + #[inline(always)] + pub fn cmpeq_mask(self, other: Self) -> u8 { + self.0.simd_eq(other.0).to_bitmask() as u8 + } + #[inline(always)] + pub fn cmpgt_mask(self, other: Self) -> u8 { + self.0.simd_gt(other.0).to_bitmask() as u8 + } +} + +impl Default for I64x4 { + #[inline(always)] + fn default() -> Self { + Self::splat(0) + } +} + +impl PartialEq for I64x4 { + #[inline(always)] + fn eq(&self, other: &Self) -> bool { + self.to_array() == other.to_array() + } +} diff --git a/src/simd_nightly/mod.rs b/src/simd_nightly/mod.rs index 99636acc..3a281acf 100644 --- a/src/simd_nightly/mod.rs +++ b/src/simd_nightly/mod.rs @@ -39,10 +39,10 @@ pub use f16_types::F16x16; pub use f32_types::{F32x16, F32x8}; pub use f64_types::{F64x4, F64x8}; pub use i8_types::{I8x32, I8x64}; -pub use i_word_types::{I16x16, I16x32, I32x16, I64x8}; +pub use i_word_types::{I16x16, I16x32, I32x16, I32x8, I64x4, I64x8}; pub use masks::{F32Mask16, F32Mask8, F64Mask4, F64Mask8}; pub use u8_types::{U8x32, U8x64}; -pub use u_word_types::{U16x32, U32x16, U32x8, U64x4, U64x8}; +pub use u_word_types::{U16x16, U16x32, U32x16, U32x8, U64x4, U64x8}; // Lowercase aliases — match the std::simd convention used by // `simd_avx2.rs`, `simd_avx512.rs`, and the scalar fallback in @@ -83,3 +83,10 @@ pub type i16x16 = I16x16; pub type i32x16 = I32x16; #[allow(non_camel_case_types)] pub type i64x8 = I64x8; +// 256-bit aliases for the missing-lanes sweep (2026-05-20). +#[allow(non_camel_case_types)] +pub type u16x16 = U16x16; +#[allow(non_camel_case_types)] +pub type i32x8 = I32x8; +#[allow(non_camel_case_types)] +pub type i64x4 = I64x4; diff --git a/src/simd_nightly/u_word_types.rs b/src/simd_nightly/u_word_types.rs index a20bc3ce..bd34c01b 100644 --- a/src/simd_nightly/u_word_types.rs +++ b/src/simd_nightly/u_word_types.rs @@ -3,7 +3,7 @@ use core::simd::cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd}; use core::simd::num::SimdUint; -use core::simd::{u16x32, u32x16, u32x8, u64x4, u64x8}; +use core::simd::{u16x16, u16x32, u32x16, u32x8, u64x4, u64x8}; // ════════════════════════════════════════════════════════════════════ // U64x8 — 8-lane u64 @@ -478,6 +478,90 @@ impl Default for U16x32 { } } +// ════════════════════════════════════════════════════════════════════ +// U16x16 — 16-lane u16 (256-bit, added 2026-05-20 missing-lanes sweep) +// ════════════════════════════════════════════════════════════════════ + +/// 16-lane `u16` SIMD vector backed by `core::simd::u16x16`. +/// +/// API mirrors `simd_avx512::U16x32` at half-width. Miri-executable. +#[derive(Copy, Clone, Debug, PartialEq)] +#[repr(transparent)] +pub struct U16x16(pub u16x16); + +impl U16x16 { + pub const LANES: usize = 16; + + #[inline(always)] + pub fn splat(v: u16) -> Self { + Self(u16x16::splat(v)) + } + + #[inline(always)] + pub fn from_slice(s: &[u16]) -> Self { + assert!(s.len() >= 16, "U16x16::from_slice needs >=16 elements"); + Self(u16x16::from_slice(s)) + } + + #[inline(always)] + pub fn from_array(arr: [u16; 16]) -> Self { + Self(u16x16::from_array(arr)) + } + + #[inline(always)] + pub fn to_array(self) -> [u16; 16] { + self.0.to_array() + } + + #[inline(always)] + pub fn copy_to_slice(self, s: &mut [u16]) { + assert!(s.len() >= 16, "U16x16::copy_to_slice needs >=16 elements"); + self.0.copy_to_slice(s); + } + + #[inline(always)] + pub fn reduce_sum(self) -> u16 { + self.0.reduce_sum() + } + + #[inline(always)] + pub fn reduce_min(self) -> u16 { + self.0.reduce_min() + } + + #[inline(always)] + pub fn reduce_max(self) -> u16 { + self.0.reduce_max() + } + + #[inline(always)] + pub fn simd_min(self, other: Self) -> Self { + Self(self.0.simd_min(other.0)) + } + + #[inline(always)] + pub fn simd_max(self, other: Self) -> Self { + Self(self.0.simd_max(other.0)) + } + + #[inline(always)] + pub fn cmpeq_mask(self, other: Self) -> u16 { + self.0.simd_eq(other.0).to_bitmask() as u16 + } + + #[inline(always)] + pub fn cmpgt_mask(self, other: Self) -> u16 { + self.0.simd_gt(other.0).to_bitmask() as u16 + } +} + +impl Default for U16x16 { + #[inline(always)] + fn default() -> Self { + Self::splat(0) + } +} + // ════════════════════════════════════════════════════════════════════ // Tests // ════════════════════════════════════════════════════════════════════ diff --git a/src/simd_scalar.rs b/src/simd_scalar.rs index 8c539d49..77b0b421 100644 --- a/src/simd_scalar.rs +++ b/src/simd_scalar.rs @@ -532,6 +532,16 @@ impl_int_type!(I8x32, i8, 32, 0i8); impl_int_type!(I16x32, i16, 32, 0i16); impl_int_type!(I16x16, i16, 16, 0i16); +// 256-bit int lanes — scalar polyfills filling the gap surfaced by the +// 2026-05-20 matrix audit. Mirror the additions in `src/simd_avx2.rs` +// (via the `avx2_int_type!` macro) so consumers on every backend reach +// the same type names through `crate::simd::*`. +impl_int_type!(U16x16, u16, 16, 0u16); +impl_int_type!(U32x8, u32, 8, 0u32); +impl_int_type!(U64x4, u64, 4, 0u64); +impl_int_type!(I32x8, i32, 8, 0i32); +impl_int_type!(I64x4, i64, 4, 0i64); + // I8x64 / I8x32 / I16x32 / I16x16 — AVX-512BW-style methods (scalar shape) impl I8x64 { #[inline(always)] @@ -1284,3 +1294,15 @@ pub type i8x32 = I8x32; pub type i16x32 = I16x32; #[allow(non_camel_case_types)] pub type i16x16 = I16x16; +// Lowercase aliases for the 256-bit polyfills added in the 2026-05-20 +// missing-lanes sweep. +#[allow(non_camel_case_types)] +pub type u16x16 = U16x16; +#[allow(non_camel_case_types)] +pub type u32x8 = U32x8; +#[allow(non_camel_case_types)] +pub type u64x4 = U64x4; +#[allow(non_camel_case_types)] +pub type i32x8 = I32x8; +#[allow(non_camel_case_types)] +pub type i64x4 = I64x4;