diff --git a/scripts/miri-tests.sh b/scripts/miri-tests.sh index 1f291bad..0f3a7d9d 100755 --- a/scripts/miri-tests.sh +++ b/scripts/miri-tests.sh @@ -1,18 +1,87 @@ #!/bin/sh +# +# Miri test runner — ephemeral nightly, scoped to this script ONLY. +# +# Rules of the road (do not violate): +# * The repo's default toolchain is stable (see rust-toolchain.toml). +# `cargo build`, `cargo test`, `cargo clippy`, CI's clippy / tests jobs +# all use stable. Nothing else opts into nightly. +# * Miri requires nightly because `src/simd_nightly/` is gated on +# `#![feature(portable_simd)]` (unstable issue #86656), and Miri itself +# ships only on nightly. This script invokes nightly via `+nightly`, +# which is an ephemeral, per-invocation switch — it does NOT change +# the default toolchain. +# * The `nightly-simd` cargo feature is enabled here ONLY. It adds the +# `crate::simd_nightly` module (the `core::simd` polyfill), which Miri +# can execute. It does NOT reroute `crate::simd::*`, which still calls +# the `_mm*_*` intrinsics (hence the test filter below). Production +# builds (and CI's clippy / tests on stable) keep using intrinsics. +# * `blas` is excluded because Miri cannot FFI into `cblas_gemm`. +# +# If Miri stays clean, the matching CI job at `.github/workflows/ci.yaml` +# § miri promotes this from optional → required. set -x set -e -# We rely on layout-dependent casts, which should be covered with #[repr(transparent)] -# This should catch if we missed that -RUSTFLAGS="-Zrandomize-layout" +# Idempotent install of the miri component on nightly. No-op when already +# present (rustup short-circuits). Safe in CI fresh checkouts. +rustup component add miri --toolchain nightly >/dev/null 2>&1 || \ + rustup +nightly component add miri -# Miri reports a stacked borrow violation deep within rayon, in a crate called crossbeam-epoch -# The crate has a PR to fix this: https://github.com/crossbeam-rs/crossbeam/pull/871 -# but using Miri's tree borrow mode may resolve it for now. 
-# Disabled until we can figure out a different rayon issue: https://github.com/rust-lang/miri/issues/1371 -# MIRIFLAGS="-Zmiri-tree-borrows" +# Layout randomisation — catches missing `#[repr(transparent)]` and similar +# layout-dependent UB. Cheap; always on. +export RUSTFLAGS="-Zrandomize-layout" -# General tests -# Note that we exclude blas feature because Miri can't do cblas_gemm -cargo miri nextest run -v -p ndarray -p ndarray-rand --features approx,serde +# Miri reports a stacked borrow violation deep within rayon's +# crossbeam-epoch. Upstream fix: crossbeam PR #871. +# Tree-borrow mode resolves it but trips a different rayon issue +# (rust-lang/miri#1371). Left disabled until both upstream stories close. +# export MIRIFLAGS="-Zmiri-tree-borrows" + +# Architectural limit on the Miri sweep: +# +# `crate::simd::*` (the production dispatch in `src/simd.rs`) re-exports +# from `simd_avx512` / `simd_avx2` / `simd_neon`, which call `_mm*_*` / +# `vget*` intrinsics directly. Miri rejects those with "calling a +# function that requires unavailable target features: avx" because the +# Miri target doesn't enable AVX/AVX2/AVX-512/NEON target features. +# +# The `nightly-simd` feature ships a parallel module `crate::simd_nightly` +# (the 24-type `core::simd` polyfill, at full parity with the 24 types +# defined across `simd_avx2.rs` + `simd_avx512.rs` — landed in PR #146) +# which IS Miri-checkable. But the default `crate::simd::*` dispatch is +# NOT routed through it; consumer modules that import `crate::simd::F32x16` +# (most of `hpc::*` + the `simd::tests::*` suite) go through intrinsics. +# The polyfill is no longer the bottleneck — the missing piece is a +# `cfg(miri)` switch in `src/simd.rs` that re-exports from `simd_nightly` +# instead of `simd_avx*` under Miri. 
+cargo +nightly miri nextest run -v \ + --no-fail-fast \ + -p ndarray -p ndarray-rand \ + --features approx,serde,nightly-simd \ + -E '!( + test(/^hpc::/) - test(/^hpc::byte_scan/) + ) and !test(/^simd::tests::/) + and !test(/^hpc::framebuffer::pyramid_tests::/) + ' +# +# Filter rationale (3-clause AND): +# +# 1. `!(test(/^hpc::/) - test(/^hpc::byte_scan/))` +# Skip everything in `hpc::*` EXCEPT `hpc::byte_scan` (the scalar-fallback +# path validated against the `cfg(miri)` SimdCaps bypass). +# +# 2. `!test(/^simd::tests::/)` +# Skip the `simd::tests::*` suite. These exercise `crate::simd::F32x16` +# etc. directly — types that re-export AVX/AVX2/AVX-512 intrinsics. Miri +# rejects every one with "calling a function that requires unavailable +# target features: avx". Same architectural class as `hpc::*`. Will +# become miri-runnable when `crate::simd::*` gains a cfg(miri) dispatch +# through `simd_nightly`. +# +# 3. `!test(/^hpc::framebuffer::pyramid_tests::/)` +# The 3 pyramid tests take 19+ minutes EACH under Miri (large 2D scan +# loops over SIMD-shaped data). Not a UB signal — pure runtime cost. +# Re-enable once the test fixtures are sized down or the loops are +# cfg(miri)-shortened. diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index cec5b4d4..ae063575 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -236,6 +236,11 @@ pub mod framebuffer; /// Transcoded from Opus CELT for the HHTL cascade → waveform pipeline. pub mod audio; +/// Vertical streaming structs for the EdgeColumn SoA (D-CSV-11b, sprint-12). +/// Per cognitive-substrate-convergence-v1.md §5 L-20. +#[allow(missing_docs)] +pub mod stream; + #[cfg(all(test, feature = "hpc-extras"))] mod e2e_tests { //! 
End-to-end pipeline test: Fingerprint → Node → Seal → Cascade → CLAM → Causality → BNN diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs index 2789ba88..a35823b5 100644 --- a/src/hpc/simd_caps.rs +++ b/src/hpc/simd_caps.rs @@ -100,8 +100,41 @@ pub fn simd_caps() -> SimdCaps { } impl SimdCaps { + /// Miri-only: CPUID inline asm is unsupported by Miri (it can't simulate + /// CPU feature detection). Return an all-scalar capability set so any + /// test reaching this LazyLock under Miri exercises the scalar fallback + /// paths instead of aborting on the `__cpuid_count` call. Scoped to + /// `cfg(miri)` — production builds and stable CI use the real detection + /// below. + #[cfg(miri)] + fn detect() -> Self { + Self { + avx2: false, + avx512f: false, + avx512bw: false, + avx512vl: false, + avx512vpopcntdq: false, + sse41: false, + sse2: false, + fma: false, + avx512vnni: false, + avx512vbmi: false, + amx_tile: false, + amx_int8: false, + amx_bf16: false, + avx512bf16: false, + avxvnniint8: false, + neon: false, + asimd_dotprod: false, + fp16: false, + aes: false, + sha2: false, + crc32: false, + } + } + /// Detect CPU capabilities at runtime. - #[cfg(target_arch = "x86_64")] + #[cfg(all(target_arch = "x86_64", not(miri)))] fn detect() -> Self { // `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always // available on x86_64 (guaranteed by the ABI) and has no side effects @@ -140,7 +173,7 @@ impl SimdCaps { /// AArch64: detect NEON sub-features via `is_aarch64_feature_detected!`. /// NEON itself is mandatory (always true). The sub-features distinguish /// Pi Zero 2 W / Pi 3 (A53) from Pi 4 (A72) from Pi 5 (A76). - #[cfg(target_arch = "aarch64")] + #[cfg(all(target_arch = "aarch64", not(miri)))] fn detect() -> Self { Self { // x86 fields: all false on ARM diff --git a/src/hpc/stream/inference.rs b/src/hpc/stream/inference.rs new file mode 100644 index 00000000..503a5545 --- /dev/null +++ b/src/hpc/stream/inference.rs @@ -0,0 +1,219 @@ +//! 
InferenceStream — forward-iterator over a borrowed `&[InferenceRow]` slice. +//! Per cognitive-substrate-convergence-v1.md §5 L-20: vertical streaming +//! over the inference-mantissa lane of the EdgeColumn SoA. Used by the +//! integer-SIMD MUL evaluation hot path (D-CSV-8 sprint-12 SIMD vec). +//! +//! Pure iterator scaffold; `par_inference_stream` rayon variant is sprint-13+. + +// Local mirror of CausalEdge64 shape (bit-compatible with causal_edge::CausalEdge64). +// No cross-crate import: ndarray is the producer; causal-edge is the consumer. + +/// A single row of the EdgeColumn SoA, bit-compatible with +/// `causal_edge::CausalEdge64` v2 layout. +/// +/// Fields of interest for the inference-mantissa lane: +/// - bits 46-49: signed 4-bit inference mantissa (−8..=+7) +/// - bits 53-58: W-slot corpus root handle (0..=63) +#[repr(C, align(8))] +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +pub struct InferenceRow(pub u64); + +impl InferenceRow { + /// Read the 4-bit signed mantissa at bits 46-49 (matches causal-edge v2 + /// `inference_mantissa()` exactly — see `causal-edge/src/layout.rs`). + /// + /// Sign-extension: extract 4-bit unsigned value, then sign-extend to i8 + /// via left shift then arithmetic right shift: `(raw << 4) >> 4`. + #[inline] + pub fn inference_mantissa(&self) -> i8 { + let raw = ((self.0 >> 46) & 0xF) as i8; + (raw << 4) >> 4 // sign-extend 4 → 8 bits + } + + /// Read the W-slot at bits 53-58 (6 bits, 0..=63). + /// + /// The W-slot is the witness corpus root handle per CausalEdge64 v2 L-6. + /// Returns 0 for zero-initialized rows. + #[inline] + pub fn w_slot(&self) -> u8 { + ((self.0 >> 53) & 0x3F) as u8 + } +} + +/// Forward-iterator over a borrowed slice of [`InferenceRow`] values. +/// +/// Provides vertical streaming access to the inference-mantissa lane of the +/// EdgeColumn SoA. Yields `(index, &InferenceRow)` tuples so callers can +/// correlate back to the originating row without maintaining external counters. 
+/// +/// # Example +/// ```rust +/// use ndarray::hpc::stream::inference::{InferenceRow, InferenceStream}; +/// +/// let rows = vec![InferenceRow(0), InferenceRow(1 << 46)]; +/// let mut stream = InferenceStream::new(&rows); +/// assert_eq!(stream.len(), 2); +/// let (idx, row) = stream.next().unwrap(); +/// assert_eq!(idx, 0); +/// ``` +pub struct InferenceStream<'a> { + rows: &'a [InferenceRow], + cursor: usize, +} + +impl<'a> InferenceStream<'a> { + /// Construct a new stream over the given slice. The cursor starts at 0. + pub fn new(rows: &'a [InferenceRow]) -> Self { + Self { rows, cursor: 0 } + } + + /// Total number of rows in the underlying slice (not remaining). + pub fn len(&self) -> usize { + self.rows.len() + } + + /// Returns `true` if the underlying slice is empty. + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Number of rows not yet yielded by the iterator. + pub fn remaining(&self) -> usize { + self.rows.len().saturating_sub(self.cursor) + } + + /// Reset the cursor to the beginning so the stream can be iterated again. 
+ pub fn reset(&mut self) { + self.cursor = 0; + } +} + +impl<'a> Iterator for InferenceStream<'a> { + type Item = (usize, &'a InferenceRow); + + fn next(&mut self) -> Option<Self::Item> { + if self.cursor < self.rows.len() { + let i = self.cursor; + self.cursor += 1; + Some((i, &self.rows[i])) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let rem = self.remaining(); + (rem, Some(rem)) + } +} + +impl<'a> ExactSizeIterator for InferenceStream<'a> { + fn len(&self) -> usize { + self.remaining() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_inference_stream_empty() { + let rows: &[InferenceRow] = &[]; + let mut stream = InferenceStream::new(rows); + assert!(stream.is_empty()); + assert_eq!(stream.len(), 0); + assert_eq!(stream.remaining(), 0); + assert!(stream.next().is_none()); + } + + #[test] + fn test_inference_stream_yields_all() { + let rows = vec![InferenceRow(0), InferenceRow(1), InferenceRow(2)]; + let stream = InferenceStream::new(&rows); + let collected: Vec<_> = stream.collect(); + assert_eq!(collected.len(), 3); + assert_eq!(collected[0].0, 0); + assert_eq!(collected[1].0, 1); + assert_eq!(collected[2].0, 2); + assert_eq!(collected[0].1 as *const _, &rows[0] as *const _); + assert_eq!(collected[2].1 as *const _, &rows[2] as *const _); + } + + #[test] + fn test_mantissa_signed_extraction() { + // Pack bits 46-49 = 0b1111 = 15 (raw), which is -1 in 4-bit two's complement. + let raw_bits: u64 = 0b1111u64 << 46; + let row = InferenceRow(raw_bits); + assert_eq!(row.inference_mantissa(), -1); + + // Pack bits 46-49 = 0b0111 = 7 (raw), positive maximum. + let row_pos = InferenceRow(0b0111u64 << 46); + assert_eq!(row_pos.inference_mantissa(), 7); + + // Pack bits 46-49 = 0b1000 = 8 (raw), which is -8 in 4-bit two's complement. + let row_min = InferenceRow(0b1000u64 << 46); + assert_eq!(row_min.inference_mantissa(), -8); + + // Zero mantissa. 
+ let row_zero = InferenceRow(0); + assert_eq!(row_zero.inference_mantissa(), 0); + } + + #[test] + fn test_w_slot_extraction() { + // Pack bits 53-58 = 0b111111 = 63 (maximum W-slot value). + let raw_bits: u64 = 0b111111u64 << 53; + let row = InferenceRow(raw_bits); + assert_eq!(row.w_slot(), 63); + + // W-slot = 0 (zero row). + let row_zero = InferenceRow(0); + assert_eq!(row_zero.w_slot(), 0); + + // W-slot = 1. + let row_one = InferenceRow(1u64 << 53); + assert_eq!(row_one.w_slot(), 1); + + // W-slot = 32 (bit 58 set, bit 53 clear). + let row_32 = InferenceRow(32u64 << 53); + assert_eq!(row_32.w_slot(), 32); + } + + #[test] + fn test_remaining_decrements() { + let rows = vec![InferenceRow(0); 4]; + let mut stream = InferenceStream::new(&rows); + assert_eq!(stream.remaining(), 4); + stream.next(); + assert_eq!(stream.remaining(), 3); + stream.next(); + assert_eq!(stream.remaining(), 2); + stream.next(); + assert_eq!(stream.remaining(), 1); + stream.next(); + assert_eq!(stream.remaining(), 0); + // Exhausted: remaining stays 0. + stream.next(); + assert_eq!(stream.remaining(), 0); + } + + #[test] + fn test_reset_restarts() { + let rows = vec![InferenceRow(10), InferenceRow(20)]; + let mut stream = InferenceStream::new(&rows); + + // Exhaust the stream. + assert!(stream.next().is_some()); + assert!(stream.next().is_some()); + assert!(stream.next().is_none()); + assert_eq!(stream.remaining(), 0); + + // After reset, the stream yields from the beginning again. + stream.reset(); + assert_eq!(stream.remaining(), 2); + let first = stream.next().unwrap(); + assert_eq!(first.0, 0); + assert_eq!(first.1 .0, 10); + } +} diff --git a/src/hpc/stream/mod.rs b/src/hpc/stream/mod.rs new file mode 100644 index 00000000..55f3a551 --- /dev/null +++ b/src/hpc/stream/mod.rs @@ -0,0 +1,15 @@ +//! Vertical streaming structs for the SoA columns. +//! Per cognitive-substrate-convergence-v1.md §5 L-20. +//! +//! Sprint-12 scope (W-F4/5/6): `QualiaStream` + `InferenceStream` + +//! 
`SplatFieldStream` forward-iterator scaffolds. Sprint-13+: +//! `par_*` rayon variants once rayon is wired into the ndarray +//! feature gate. + +pub mod inference; +pub mod qualia; +pub mod splat_field; + +pub use inference::{InferenceRow, InferenceStream}; +pub use qualia::{QualiaI4Row, QualiaStream}; +pub use splat_field::{SplatField, SplatFieldStream}; diff --git a/src/hpc/stream/qualia.rs b/src/hpc/stream/qualia.rs new file mode 100644 index 00000000..e31ce79b --- /dev/null +++ b/src/hpc/stream/qualia.rs @@ -0,0 +1,201 @@ +//! QualiaStream — forward-iterator over a borrowed `&[QualiaI4Row]` slice. +//! Per cognitive-substrate-convergence-v1.md §5 L-20: vertical streaming +//! structs at the ndarray hardware-acceleration layer for sweep over the +//! QualiaColumn SoA layout introduced by D-CSV-5b. +//! +//! Yields `(row_index, &QualiaI4Row)` tuples. Pure iterator scaffold; the +//! `par_qualia_stream` rayon-parallel variant is sprint-13+ once rayon is +//! wired into the ndarray feature gate. + +// NOTE: do NOT import lance-graph-contract here (would create circular dep +// since contract is *consumer* of ndarray). Define a minimal local mirror +// of the QualiaI4_16D shape — 8 bytes, `Copy`, hashable. Real coupling at +// the consumer boundary, not at the producer. + +/// Local mirror of `lance_graph_contract::qualia::QualiaI4_16D`. +/// Bit-compatible: `repr(C, align(8))`, 8 bytes, 16 × 4-bit signed lanes. +/// Defined here to avoid a circular dependency with the contract crate. +#[repr(C, align(8))] +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)] +pub struct QualiaI4Row(pub u64); + +/// Forward-iterator over a borrowed `&[QualiaI4Row]` slice. +/// +/// Yields `(row_index, &QualiaI4Row)` tuples in ascending index order. 
+/// +/// # Example +/// +/// ``` +/// use ndarray::hpc::stream::qualia::{QualiaI4Row, QualiaStream}; +/// +/// let rows = vec![QualiaI4Row(1), QualiaI4Row(2), QualiaI4Row(3)]; +/// let mut stream = QualiaStream::new(&rows); +/// assert_eq!(stream.next(), Some((0, &QualiaI4Row(1)))); +/// assert_eq!(stream.next(), Some((1, &QualiaI4Row(2)))); +/// assert_eq!(stream.next(), Some((2, &QualiaI4Row(3)))); +/// assert_eq!(stream.next(), None); +/// ``` +pub struct QualiaStream<'a> { + rows: &'a [QualiaI4Row], + cursor: usize, +} + +impl<'a> QualiaStream<'a> { + /// Construct a new `QualiaStream` over `rows`. + /// The cursor starts at index 0. + #[inline] + pub fn new(rows: &'a [QualiaI4Row]) -> Self { + Self { rows, cursor: 0 } + } + + /// Total number of rows in the backing slice (unchanged by iteration). + #[inline] + pub fn len(&self) -> usize { + self.rows.len() + } + + /// `true` if the backing slice is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Number of rows not yet yielded (decrements with each `next()` call). + #[inline] + pub fn remaining(&self) -> usize { + self.rows.len().saturating_sub(self.cursor) + } + + /// Reset the cursor to 0, allowing the stream to be re-iterated from the start. + #[inline] + pub fn reset(&mut self) { + self.cursor = 0; + } + + /// Current cursor position (0-based index of the NEXT row to be yielded). + #[inline] + pub fn cursor(&self) -> usize { + self.cursor + } +} + +impl<'a> Iterator for QualiaStream<'a> { + type Item = (usize, &'a QualiaI4Row); + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + if self.cursor < self.rows.len() { + let i = self.cursor; + self.cursor += 1; + Some((i, &self.rows[i])) + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + let rem = self.remaining(); + (rem, Some(rem)) + } +} + +impl<'a> ExactSizeIterator for QualiaStream<'a> { + /// Returns the number of rows not yet yielded. 
+ #[inline] + fn len(&self) -> usize { + self.remaining() + } +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::{QualiaI4Row, QualiaStream}; + + /// Empty slice → stream yields nothing immediately. + #[test] + fn test_empty_stream() { + let rows: Vec<QualiaI4Row> = vec![]; + let mut stream = QualiaStream::new(&rows); + assert!(stream.is_empty()); + assert_eq!(stream.len(), 0); + assert_eq!(stream.remaining(), 0); + assert_eq!(stream.next(), None); + } + + /// Stream over N rows must yield exactly N items. + #[test] + fn test_stream_yields_all_rows() { + let rows = vec![QualiaI4Row(0xAA), QualiaI4Row(0xBB), QualiaI4Row(0xCC), QualiaI4Row(0xDD)]; + let stream = QualiaStream::new(&rows); + let collected: Vec<(usize, &QualiaI4Row)> = stream.collect(); + assert_eq!(collected.len(), rows.len()); + for (i, row) in collected.iter() { + assert_eq!(row.0, rows[*i].0); + } + } + + /// Each yielded index must equal the row's position in the slice. + #[test] + fn test_stream_indices_match() { + let rows: Vec<QualiaI4Row> = (0u64..8).map(QualiaI4Row).collect(); + let mut stream = QualiaStream::new(&rows); + let mut expected_idx = 0usize; + while let Some((idx, row)) = stream.next() { + assert_eq!(idx, expected_idx, "index mismatch at position {}", expected_idx); + assert_eq!(row.0, expected_idx as u64, "row value mismatch at index {}", expected_idx); + expected_idx += 1; + } + assert_eq!(expected_idx, rows.len()); + } + + /// `remaining()` must decrement by 1 with each `next()` call. 
+ #[test] + fn test_remaining_decrements() { + let rows: Vec<QualiaI4Row> = (0u64..5).map(QualiaI4Row).collect(); + let mut stream = QualiaStream::new(&rows); + assert_eq!(stream.remaining(), 5); + let _ = stream.next(); + assert_eq!(stream.remaining(), 4); + let _ = stream.next(); + assert_eq!(stream.remaining(), 3); + // Exhaust + while stream.next().is_some() {} + assert_eq!(stream.remaining(), 0); + } + + /// After `reset()`, the stream must replay all rows from index 0. + #[test] + fn test_reset_restarts() { + let rows = vec![QualiaI4Row(10), QualiaI4Row(20), QualiaI4Row(30)]; + let mut stream = QualiaStream::new(&rows); + // Consume all + while stream.next().is_some() {} + assert_eq!(stream.remaining(), 0); + // Reset and re-collect + stream.reset(); + assert_eq!(stream.remaining(), 3); + let first = stream.next(); + assert_eq!(first, Some((0, &QualiaI4Row(10)))); + } + + /// `ExactSizeIterator::len()` must equal `remaining()` at every step. + #[test] + fn test_exact_size_iterator() { + let rows: Vec<QualiaI4Row> = (0u64..6).map(QualiaI4Row).collect(); + let mut stream = QualiaStream::new(&rows); + // Before iteration + assert_eq!(ExactSizeIterator::len(&stream), 6); + assert_eq!(ExactSizeIterator::len(&stream), stream.remaining()); + // After each next() + for expected_remaining in (0..6usize).rev() { + let _ = stream.next(); + assert_eq!(ExactSizeIterator::len(&stream), expected_remaining); + assert_eq!(ExactSizeIterator::len(&stream), stream.remaining()); + } + assert_eq!(stream.next(), None); + assert_eq!(ExactSizeIterator::len(&stream), 0); + } +} diff --git a/src/hpc/stream/splat_field.rs b/src/hpc/stream/splat_field.rs new file mode 100644 index 00000000..d0702e2f --- /dev/null +++ b/src/hpc/stream/splat_field.rs @@ -0,0 +1,226 @@ +//! SplatFieldStream — forward-iterator over Gaussian-splat field samples. +//! Per cognitive-substrate-convergence-v1.md §5 L-20 + .claude/knowledge/ +//! splat-shader-rayon-struct-method-vision.md: vertical streaming over +//! 
the splat field for the D-CSV-12 splat op fleet. +//! +//! Each row = one Gaussian splat (mean, σ², energy). Pure iterator +//! scaffold; `par_splat_stream` rayon variant is sprint-13+. + +// NOTE: SplatField is defined locally here — do NOT import lance-graph-contract +// (would create a circular dep; ndarray is a producer, contract is a consumer). + +/// One Gaussian splat row: mean position, variance (σ²), accumulated energy, +/// and a generation/cycle stamp. +/// +/// Layout: `repr(C, align(16))` — 4 × 4-byte fields = exactly 16 bytes. +/// `align(16)` matches the SSE/NEON minimum and is verified by +/// `test_splat_field_size_16b`. +#[repr(C, align(16))] +#[derive(Clone, Copy, PartialEq, Debug, Default)] +pub struct SplatField { + /// Mean position in the field space (could be index, palette ID, or BindSpace row). + pub mean: u32, + /// σ² (variance) — controls splat spread. + pub variance: f32, + /// Accumulated energy at this splat. + pub energy: f32, + /// Generation/cycle stamp for the splat. + pub generation: u32, +} + +/// Forward-iterator over a borrowed `&[SplatField]` slice. +/// +/// Yields `(row_index, &SplatField)` tuples in ascending index order. +/// +/// # Example +/// +/// ``` +/// use ndarray::hpc::stream::splat_field::{SplatField, SplatFieldStream}; +/// +/// let rows = vec![ +/// SplatField { mean: 0, variance: 1.0, energy: 0.5, generation: 1 }, +/// SplatField { mean: 1, variance: 2.0, energy: 1.5, generation: 2 }, +/// ]; +/// let mut stream = SplatFieldStream::new(&rows); +/// let (idx, splat) = stream.next().unwrap(); +/// assert_eq!(idx, 0); +/// assert_eq!(splat.mean, 0); +/// ``` +pub struct SplatFieldStream<'a> { + rows: &'a [SplatField], + cursor: usize, +} + +impl<'a> SplatFieldStream<'a> { + /// Construct a new `SplatFieldStream` over `rows`. + /// The cursor starts at index 0. 
+ #[inline] + pub fn new(rows: &'a [SplatField]) -> Self { + Self { rows, cursor: 0 } + } + + /// Total number of rows in the backing slice (unchanged by iteration). + #[inline] + pub fn len(&self) -> usize { + self.rows.len() + } + + /// `true` if the backing slice is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Number of rows not yet yielded (decrements with each `next()` call). + #[inline] + pub fn remaining(&self) -> usize { + self.rows.len().saturating_sub(self.cursor) + } + + /// Reset the cursor to 0, allowing the stream to be re-iterated from the start. + #[inline] + pub fn reset(&mut self) { + self.cursor = 0; + } + + /// Filter to only splats whose `energy` field is strictly above `threshold`. + /// + /// Consumes `self` (the `SplatFieldStream` is itself an `Iterator`) and + /// returns a lazy `impl Iterator` — no allocation. + pub fn filter_energy_above(self, threshold: f32) -> impl Iterator<Item = (usize, &'a SplatField)> { + self.filter(move |(_, s)| s.energy > threshold) + } +} + +impl<'a> Iterator for SplatFieldStream<'a> { + type Item = (usize, &'a SplatField); + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + if self.cursor < self.rows.len() { + let i = self.cursor; + self.cursor += 1; + Some((i, &self.rows[i])) + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + let rem = self.remaining(); + (rem, Some(rem)) + } +} + +impl<'a> ExactSizeIterator for SplatFieldStream<'a> { + /// Returns the number of rows not yet yielded. + #[inline] + fn len(&self) -> usize { + self.remaining() + } +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::{SplatField, SplatFieldStream}; + use std::mem; + + fn make_splat(mean: u32, variance: f32, energy: f32, generation: u32) -> SplatField { + SplatField { + mean, + variance, + energy, + generation, + } + } + + /// Empty slice → stream yields nothing immediately. 
+ #[test] + fn test_splat_stream_empty() { + let rows: Vec<SplatField> = vec![]; + let mut stream = SplatFieldStream::new(&rows); + assert!(stream.is_empty()); + assert_eq!(stream.len(), 0); + assert_eq!(stream.remaining(), 0); + assert_eq!(stream.next(), None); + } + + /// Stream over N rows must yield exactly N items with matching indices. + #[test] + fn test_splat_stream_yields_all() { + let rows = vec![make_splat(0, 1.0, 0.1, 1), make_splat(1, 2.0, 0.5, 2), make_splat(2, 0.5, 2.0, 3)]; + let stream = SplatFieldStream::new(&rows); + let collected: Vec<(usize, &SplatField)> = stream.collect(); + assert_eq!(collected.len(), 3); + for (idx, splat) in &collected { + assert_eq!(splat.mean, *idx as u32); + } + } + + /// `filter_energy_above` must retain only splats strictly above the threshold. + #[test] + fn test_filter_energy_above() { + let rows = vec![ + make_splat(0, 1.0, 0.1, 1), + make_splat(1, 1.0, 0.5, 2), + make_splat(2, 1.0, 1.0, 3), + make_splat(3, 1.0, 2.0, 4), + ]; + let stream = SplatFieldStream::new(&rows); + let above: Vec<(usize, &SplatField)> = stream.filter_energy_above(0.5).collect(); + // Only rows with energy > 0.5: indices 2 (1.0) and 3 (2.0). + assert_eq!(above.len(), 2); + assert_eq!(above[0].0, 2); + assert_eq!(above[1].0, 3); + } + + /// `size_of::<SplatField>()` must be exactly 16 bytes — verifies `align(16)` + /// and field packing (4 × 4-byte fields with no hidden padding). + #[test] + fn test_splat_field_size_16b() { + assert_eq!(mem::size_of::<SplatField>(), 16, "SplatField must be exactly 16 bytes (4 × 4B fields, align(16))"); + assert_eq!(mem::align_of::<SplatField>(), 16, "SplatField alignment must be 16"); + } + + /// `remaining()` must decrement by 1 with each `next()` call. 
+ #[test] + fn test_remaining_decrements() { + let rows = vec![ + make_splat(0, 1.0, 1.0, 0), + make_splat(1, 1.0, 1.0, 1), + make_splat(2, 1.0, 1.0, 2), + make_splat(3, 1.0, 1.0, 3), + ]; + let mut stream = SplatFieldStream::new(&rows); + assert_eq!(stream.remaining(), 4); + let _ = stream.next(); + assert_eq!(stream.remaining(), 3); + let _ = stream.next(); + assert_eq!(stream.remaining(), 2); + // Exhaust remaining + while stream.next().is_some() {} + assert_eq!(stream.remaining(), 0); + assert_eq!(stream.next(), None); + } + + /// After `reset()`, the stream replays all rows from index 0. + #[test] + fn test_reset_restarts() { + let rows = vec![make_splat(10, 1.0, 0.3, 1), make_splat(20, 2.0, 0.6, 2), make_splat(30, 3.0, 0.9, 3)]; + let mut stream = SplatFieldStream::new(&rows); + // Consume everything + while stream.next().is_some() {} + assert_eq!(stream.remaining(), 0); + // Reset and verify replay + stream.reset(); + assert_eq!(stream.remaining(), 3); + let first = stream.next(); + assert!(first.is_some()); + let (idx, splat) = first.unwrap(); + assert_eq!(idx, 0); + assert_eq!(splat.mean, 10); + } +} diff --git a/src/simd.rs b/src/simd.rs index ed3e0dea..b3c00f11 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -205,12 +205,19 @@ pub const PREFERRED_I16_LANES: usize = 16; // Note on the `nightly-simd` feature: it adds the `crate::simd_nightly` // module (a portable-simd backend wrapping `core::simd`) but does NOT -// replace the intrinsics dispatch below. Full type-parity coverage -// would require the nightly module to define ~30 types; the current -// draft covers 5 (F32x16, F64x8, U8x64, U32x16, F32Mask16). Consumers -// who want miri-runnable SIMD code import from `simd_nightly` -// explicitly (e.g. `use ndarray::simd_nightly::F32x16`). The main -// polyfill via `crate::simd::F32x16` continues to use intrinsics. +// replace the intrinsics dispatch below. 
The polyfill ships full +// type-parity with production (PR #146): 24 types covering F32x8/16, +// F64x4/8, BF16x8/16, F16x16, I8x32/64, I16x16/32, I32x16, I64x8, +// U8x32/64, U16x32, U32x8/16, U64x4/8, plus the F32/F64 mask types — +// matches the 24 types defined in `simd_avx2.rs` + `simd_avx512.rs`. +// Consumers who want miri-runnable SIMD code import from `simd_nightly` +// explicitly today (e.g. `use ndarray::simd_nightly::F32x16`). +// +// The remaining work for Miri-clean coverage of `hpc::*` is wiring this +// file's `pub use crate::simd_{avx512,avx2,neon}::*` re-exports to +// route through `simd_nightly` under `cfg(miri)`. Once that lands, +// every `use crate::simd::F32x16` call site becomes miri-checkable +// without source changes. The polyfill itself is no longer the bottleneck. #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] pub use crate::simd_avx512::{