diff --git a/scripts/miri-tests.sh b/scripts/miri-tests.sh
index 1f291bad..0f3a7d9d 100755
--- a/scripts/miri-tests.sh
+++ b/scripts/miri-tests.sh
@@ -1,18 +1,87 @@
 #!/bin/sh
+#
+# Miri test runner — ephemeral nightly, scoped to this script ONLY.
+#
+# Rules of the road (do not violate):
+#   * The repo's default toolchain is stable (see rust-toolchain.toml).
+#     `cargo build`, `cargo test`, `cargo clippy`, CI's clippy / tests jobs
+#     all use stable. Nothing else opts into nightly.
+#   * Miri requires nightly because `src/simd_nightly/` is gated on
+#     `#![feature(portable_simd)]` (unstable issue #86656), and Miri itself
+#     ships only on nightly. This script invokes nightly via `+nightly`,
+#     which is an ephemeral, per-invocation switch — it does NOT change
+#     the default toolchain.
+#   * The `nightly-simd` cargo feature is enabled here ONLY. It adds the
+#     `crate::simd_nightly` module (a `core::simd` polyfill Miri can run);
+#     it does NOT yet reroute `crate::simd::*`, which still dispatches to
+#     the `_mm*_*` intrinsics (see the filter rationale below). Production
+#     builds and stable CI (clippy / tests) keep using the intrinsics backend.
+#   * `blas` is excluded because Miri cannot FFI into `cblas_gemm`.
+#
+# If Miri stays clean, the matching CI job at `.github/workflows/ci.yaml`
+# § miri promotes this from optional → required.
 
 set -x
 set -e
 
-# We rely on layout-dependent casts, which should be covered with #[repr(transparent)]
-# This should catch if we missed that
-RUSTFLAGS="-Zrandomize-layout"
+# Idempotent install of the miri component on nightly; a no-op when already
+# present, so safe in fresh CI checkouts. Quiet first try, verbose retry on failure.
+rustup component add miri --toolchain nightly >/dev/null 2>&1 || \
+    rustup +nightly component add miri
 
-# Miri reports a stacked borrow violation deep within rayon, in a crate called crossbeam-epoch
-# The crate has a PR to fix this: https://github.com/crossbeam-rs/crossbeam/pull/871
-# but using Miri's tree borrow mode may resolve it for now.
-# Disabled until we can figure out a different rayon issue: https://github.com/rust-lang/miri/issues/1371
-# MIRIFLAGS="-Zmiri-tree-borrows"
+# Layout randomisation — catches missing `#[repr(transparent)]` and similar
+# layout-dependent UB. Cheap; always on.
+export RUSTFLAGS="-Zrandomize-layout"
 
-# General tests
-# Note that we exclude blas feature because Miri can't do cblas_gemm
-cargo miri nextest run -v -p ndarray -p ndarray-rand --features approx,serde
+# Miri reports a stacked borrow violation deep within rayon's
+# crossbeam-epoch. Upstream fix: crossbeam PR #871.
+# Tree-borrow mode resolves it but trips a different rayon issue
+# (rust-lang/miri#1371). Left disabled until both upstream stories close.
+# export MIRIFLAGS="-Zmiri-tree-borrows"
+
+# Architectural limit on the Miri sweep:
+#
+# `crate::simd::*` (the production dispatch in `src/simd.rs`) re-exports
+# from `simd_avx512` / `simd_avx2` / `simd_neon`, which call `_mm*_*` /
+# `vget*` intrinsics directly. Miri rejects those with "calling a
+# function that requires unavailable target features: avx" because the
+# Miri target doesn't enable AVX/AVX2/AVX-512/NEON target features.
+#
+# The `nightly-simd` feature ships a parallel module `crate::simd_nightly`
+# (the 24-type `core::simd` polyfill, at full parity with the 24 types
+# defined across `simd_avx2.rs` + `simd_avx512.rs` — landed in PR #146)
+# which IS Miri-checkable. But the default `crate::simd::*` dispatch is
+# NOT routed through it; consumer modules that import `crate::simd::F32x16`
+# (most of `hpc::*` + the `simd::tests::*` suite) go through intrinsics.
+# The polyfill is no longer the bottleneck — the missing piece is a
+# `cfg(miri)` switch in `src/simd.rs` that re-exports from `simd_nightly`
+# instead of `simd_avx*` under Miri.
+cargo +nightly miri nextest run -v \
+    --no-fail-fast \
+    -p ndarray -p ndarray-rand \
+    --features approx,serde,nightly-simd \
+    -E '!(
+            test(/^hpc::/) - test(/^hpc::byte_scan/)
+        ) and !test(/^simd::tests::/)
+          and !test(/^hpc::framebuffer::pyramid_tests::/)
+       '
+#
+# Filter rationale (3-clause AND):
+#
+# 1. `!(test(/^hpc::/) - test(/^hpc::byte_scan/))`
+#    Skip everything in `hpc::*` EXCEPT `hpc::byte_scan` (the scalar-fallback
+#    path validated against the `cfg(miri)` SimdCaps bypass).
+#
+# 2. `!test(/^simd::tests::/)`
+#    Skip the `simd::tests::*` suite. These exercise `crate::simd::F32x16`
+#    etc. directly — types that re-export AVX/AVX2/AVX-512 intrinsics. Miri
+#    rejects every one with "calling a function that requires unavailable
+#    target features: avx". Same architectural class as `hpc::*`. Will
+#    become miri-runnable when `crate::simd::*` gains a cfg(miri) dispatch
+#    through `simd_nightly`.
+#
+# 3. `!test(/^hpc::framebuffer::pyramid_tests::/)`
+#    The 3 pyramid tests take 19+ minutes EACH under Miri (large 2D scan
+#    loops over SIMD-shaped data) — runtime cost, not a UB signal. Clause 1
+#    already skips them today; this clause matters once clause 1 is narrowed.
+#    Re-enable once fixtures are sized down or loops are cfg(miri)-shortened.
diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs
index cec5b4d4..ae063575 100644
--- a/src/hpc/mod.rs
+++ b/src/hpc/mod.rs
@@ -236,6 +236,11 @@ pub mod framebuffer;
 /// Transcoded from Opus CELT for the HHTL cascade → waveform pipeline.
 pub mod audio;
 
+/// Vertical streaming structs for the EdgeColumn SoA (D-CSV-11b, sprint-12).
+/// Per cognitive-substrate-convergence-v1.md §5 L-20.
+#[allow(missing_docs)]
+pub mod stream;
+
 #[cfg(all(test, feature = "hpc-extras"))]
 mod e2e_tests {
     //! End-to-end pipeline test: Fingerprint → Node → Seal → Cascade → CLAM → Causality → BNN
diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs
index 2789ba88..a35823b5 100644
--- a/src/hpc/simd_caps.rs
+++ b/src/hpc/simd_caps.rs
@@ -100,8 +100,41 @@ pub fn simd_caps() -> SimdCaps {
 }
 
 impl SimdCaps {
+    /// Miri-only stand-in for runtime detection: Miri cannot execute the
+    /// `__cpuid_count` call used by the real x86_64 `detect`, so report every
+    /// capability flag as `false`. Any test reaching this LazyLock under Miri
+    /// then exercises the scalar fallback paths instead of aborting. Applies
+    /// on every target under `cfg(miri)` (so `neon` is `false` on aarch64 too);
+    /// the arch-specific `detect` bodies below are compiled only with `not(miri)`.
+    #[cfg(miri)]
+    fn detect() -> Self {
+        Self {
+            avx2: false,
+            avx512f: false,
+            avx512bw: false,
+            avx512vl: false,
+            avx512vpopcntdq: false,
+            sse41: false,
+            sse2: false,
+            fma: false,
+            avx512vnni: false,
+            avx512vbmi: false,
+            amx_tile: false,
+            amx_int8: false,
+            amx_bf16: false,
+            avx512bf16: false,
+            avxvnniint8: false,
+            neon: false,
+            asimd_dotprod: false,
+            fp16: false,
+            aes: false,
+            sha2: false,
+            crc32: false,
+        }
+    }
+
     /// Detect CPU capabilities at runtime.
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(all(target_arch = "x86_64", not(miri)))]
     fn detect() -> Self {
         // `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always
         // available on x86_64 (guaranteed by the ABI) and has no side effects
@@ -140,7 +173,7 @@ impl SimdCaps {
     /// AArch64: detect NEON sub-features via `is_aarch64_feature_detected!`.
     /// NEON itself is mandatory (always true). The sub-features distinguish
     /// Pi Zero 2 W / Pi 3 (A53) from Pi 4 (A72) from Pi 5 (A76).
-    #[cfg(target_arch = "aarch64")]
+    #[cfg(all(target_arch = "aarch64", not(miri)))]
     fn detect() -> Self {
         Self {
             // x86 fields: all false on ARM
diff --git a/src/hpc/stream/inference.rs b/src/hpc/stream/inference.rs
new file mode 100644
index 00000000..503a5545
--- /dev/null
+++ b/src/hpc/stream/inference.rs
@@ -0,0 +1,219 @@
+//! InferenceStream — forward-iterator over a borrowed `&[InferenceRow]` slice.
+//! Per cognitive-substrate-convergence-v1.md §5 L-20: vertical streaming
+//! over the inference-mantissa lane of the EdgeColumn SoA. Used by the
+//! integer-SIMD MUL evaluation hot path (D-CSV-8 sprint-12 SIMD vec).
+//!
+//! Pure iterator scaffold; `par_inference_stream` rayon variant is sprint-13+.
+
+// Local mirror of CausalEdge64 shape (bit-compatible with causal_edge::CausalEdge64).
+// No cross-crate import: ndarray is the producer; causal-edge is the consumer.
+
+/// A single row of the EdgeColumn SoA, bit-compatible with
+/// `causal_edge::CausalEdge64` v2 layout.
+///
+/// Fields of interest for the inference-mantissa lane:
+/// - bits 46-49: signed 4-bit inference mantissa (−8..+7)
+/// - bits 53-58: W-slot corpus root handle (0..=63)
+#[repr(C, align(8))]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)]
+pub struct InferenceRow(pub u64);
+
+impl InferenceRow {
+    /// Read the 4-bit signed mantissa at bits 46-49 (matches causal-edge v2
+    /// `inference_mantissa()` exactly — see `causal-edge/src/layout.rs`).
+    ///
+    /// Sign-extension: the 4-bit field is moved into the high nibble of an `i8`
+    /// and shifted back with an arithmetic right shift: `(raw << 4) >> 4`.
+    #[inline]
+    pub fn inference_mantissa(&self) -> i8 {
+        let raw = ((self.0 >> 46) & 0xF) as i8;
+        (raw << 4) >> 4 // sign-extend 4 → 8 bits
+    }
+
+    /// Read the W-slot at bits 53-58 (6 bits, 0..=63).
+    ///
+    /// The W-slot is the witness corpus root handle per CausalEdge64 v2 L-6.
+    /// Returns 0 for zero-initialized rows.
+    #[inline]
+    pub fn w_slot(&self) -> u8 {
+        ((self.0 >> 53) & 0x3F) as u8
+    }
+}
+
+/// Forward-iterator over a borrowed slice of [`InferenceRow`] values.
+///
+/// Provides vertical streaming access to the inference-mantissa lane of the
+/// EdgeColumn SoA. Yields `(index, &InferenceRow)` tuples so callers can
+/// correlate back to the originating row without maintaining external counters.
+///
+/// # Example
+/// ```rust
+/// use ndarray::hpc::stream::inference::{InferenceRow, InferenceStream};
+///
+/// let rows = vec![InferenceRow(0), InferenceRow(1 << 46)];
+/// let mut stream = InferenceStream::new(&rows);
+/// assert_eq!(stream.len(), 2);
+/// let (idx, row) = stream.next().unwrap();
+/// assert_eq!(idx, 0);
+/// ```
+pub struct InferenceStream<'a> {
+    rows: &'a [InferenceRow],
+    cursor: usize,
+}
+
+impl<'a> InferenceStream<'a> {
+    /// Construct a new stream over the given slice. The cursor starts at 0.
+    pub fn new(rows: &'a [InferenceRow]) -> Self {
+        Self { rows, cursor: 0 }
+    }
+
+    /// Total rows in the underlying slice; unlike `ExactSizeIterator::len`, ignores the cursor.
+    pub fn len(&self) -> usize {
+        self.rows.len()
+    }
+
+    /// Returns `true` if the underlying slice is empty (regardless of the cursor position).
+    pub fn is_empty(&self) -> bool {
+        self.rows.is_empty()
+    }
+
+    /// Number of rows not yet yielded by the iterator.
+    pub fn remaining(&self) -> usize {
+        self.rows.len().saturating_sub(self.cursor)
+    }
+
+    /// Reset the cursor to the beginning so the stream can be iterated again.
+    pub fn reset(&mut self) {
+        self.cursor = 0;
+    }
+}
+
+impl<'a> Iterator for InferenceStream<'a> {
+    type Item = (usize, &'a InferenceRow);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cursor < self.rows.len() {
+            let i = self.cursor;
+            self.cursor += 1;
+            Some((i, &self.rows[i])) // `i < len` was checked above, so this index is in bounds
+        } else {
+            None // exhausted; the cursor is left untouched, so `remaining()` stays 0
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.remaining();
+        (rem, Some(rem)) // exact bounds, as `ExactSizeIterator` requires
+    }
+}
+
+impl<'a> ExactSizeIterator for InferenceStream<'a> {
+    fn len(&self) -> usize {
+        self.remaining() // rows not yet yielded — differs from the inherent `len()` (total rows)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_inference_stream_empty() {
+        let rows: &[InferenceRow] = &[];
+        let mut stream = InferenceStream::new(rows);
+        assert!(stream.is_empty());
+        assert_eq!(stream.len(), 0);
+        assert_eq!(stream.remaining(), 0);
+        assert!(stream.next().is_none());
+    }
+
+    #[test]
+    fn test_inference_stream_yields_all() {
+        let rows = vec![InferenceRow(0), InferenceRow(1), InferenceRow(2)];
+        let stream = InferenceStream::new(&rows);
+        let collected: Vec<_> = stream.collect();
+        assert_eq!(collected.len(), 3);
+        assert_eq!(collected[0].0, 0);
+        assert_eq!(collected[1].0, 1);
+        assert_eq!(collected[2].0, 2);
+        assert_eq!(collected[0].1 as *const _, &rows[0] as *const _);
+        assert_eq!(collected[2].1 as *const _, &rows[2] as *const _);
+    }
+
+    #[test]
+    fn test_mantissa_signed_extraction() {
+        // Pack bits 46-49 = 0b1111 = 15 (raw), which is -1 in 4-bit two's complement.
+        let raw_bits: u64 = 0b1111u64 << 46;
+        let row = InferenceRow(raw_bits);
+        assert_eq!(row.inference_mantissa(), -1);
+
+        // Pack bits 46-49 = 0b0111 = 7 (raw), positive maximum.
+        let row_pos = InferenceRow(0b0111u64 << 46);
+        assert_eq!(row_pos.inference_mantissa(), 7);
+
+        // Pack bits 46-49 = 0b1000 = 8 (raw), which is -8 in 4-bit two's complement.
+        let row_min = InferenceRow(0b1000u64 << 46);
+        assert_eq!(row_min.inference_mantissa(), -8);
+
+        // Zero mantissa.
+        let row_zero = InferenceRow(0);
+        assert_eq!(row_zero.inference_mantissa(), 0);
+    }
+
+    #[test]
+    fn test_w_slot_extraction() {
+        // Pack bits 53-58 = 0b111111 = 63 (maximum W-slot value).
+        let raw_bits: u64 = 0b111111u64 << 53;
+        let row = InferenceRow(raw_bits);
+        assert_eq!(row.w_slot(), 63);
+
+        // W-slot = 0 (zero row).
+        let row_zero = InferenceRow(0);
+        assert_eq!(row_zero.w_slot(), 0);
+
+        // W-slot = 1.
+        let row_one = InferenceRow(1u64 << 53);
+        assert_eq!(row_one.w_slot(), 1);
+
+        // W-slot = 32 (bit 58 set, bit 53 clear).
+        let row_32 = InferenceRow(32u64 << 53);
+        assert_eq!(row_32.w_slot(), 32);
+    }
+
+    #[test]
+    fn test_remaining_decrements() {
+        let rows = vec![InferenceRow(0); 4];
+        let mut stream = InferenceStream::new(&rows);
+        assert_eq!(stream.remaining(), 4);
+        stream.next();
+        assert_eq!(stream.remaining(), 3);
+        stream.next();
+        assert_eq!(stream.remaining(), 2);
+        stream.next();
+        assert_eq!(stream.remaining(), 1);
+        stream.next();
+        assert_eq!(stream.remaining(), 0);
+        // Exhausted: remaining stays 0.
+        stream.next();
+        assert_eq!(stream.remaining(), 0);
+    }
+
+    #[test]
+    fn test_reset_restarts() {
+        let rows = vec![InferenceRow(10), InferenceRow(20)];
+        let mut stream = InferenceStream::new(&rows);
+
+        // Exhaust the stream.
+        assert!(stream.next().is_some());
+        assert!(stream.next().is_some());
+        assert!(stream.next().is_none());
+        assert_eq!(stream.remaining(), 0);
+
+        // After reset, the stream yields from the beginning again.
+        stream.reset();
+        assert_eq!(stream.remaining(), 2);
+        let first = stream.next().unwrap();
+        assert_eq!(first.0, 0);
+        assert_eq!(first.1 .0, 10);
+    }
+}
diff --git a/src/hpc/stream/mod.rs b/src/hpc/stream/mod.rs
new file mode 100644
index 00000000..55f3a551
--- /dev/null
+++ b/src/hpc/stream/mod.rs
@@ -0,0 +1,15 @@
+//! Vertical streaming structs for the SoA columns.
+//! Per cognitive-substrate-convergence-v1.md §5 L-20.
+//!
+//! Sprint-12 scope (W-F4/5/6): `QualiaStream` + `InferenceStream` +
+//! `SplatFieldStream` forward-iterator scaffolds. Sprint-13+:
+//! `par_*` rayon variants once rayon is wired into the ndarray
+//! feature gate.
+
+pub mod inference;
+pub mod qualia;
+pub mod splat_field;
+
+pub use inference::{InferenceRow, InferenceStream};
+pub use qualia::{QualiaI4Row, QualiaStream};
+pub use splat_field::{SplatField, SplatFieldStream};
diff --git a/src/hpc/stream/qualia.rs b/src/hpc/stream/qualia.rs
new file mode 100644
index 00000000..e31ce79b
--- /dev/null
+++ b/src/hpc/stream/qualia.rs
@@ -0,0 +1,201 @@
+//! QualiaStream — forward-iterator over a borrowed `&[QualiaI4Row]` slice.
+//! Per cognitive-substrate-convergence-v1.md §5 L-20: vertical streaming
+//! structs at the ndarray hardware-acceleration layer for sweep over the
+//! QualiaColumn SoA layout introduced by D-CSV-5b.
+//!
+//! Yields `(row_index, &QualiaI4Row)` tuples. Pure iterator scaffold; the
+//! `par_qualia_stream` rayon-parallel variant is sprint-13+ once rayon is
+//! wired into the ndarray feature gate.
+
+// NOTE: do NOT import lance-graph-contract here (would create circular dep
+// since contract is *consumer* of ndarray). Define a minimal local mirror
+// of the QualiaI4_16D shape — 8 bytes, `Copy`, hashable. Real coupling at
+// the consumer boundary, not at the producer.
+
+/// Local mirror of `lance_graph_contract::qualia::QualiaI4_16D`.
+/// Bit-compatible: `repr(C, align(8))`, 8 bytes, 16 × 4-bit signed lanes.
+/// Defined here to avoid a circular dependency with the contract crate.
+#[repr(C, align(8))]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)]
+pub struct QualiaI4Row(pub u64);
+
+/// Forward-iterator over a borrowed `&[QualiaI4Row]` slice.
+///
+/// Yields `(row_index, &QualiaI4Row)` tuples in ascending index order.
+///
+/// # Example
+///
+/// ```
+/// use ndarray::hpc::stream::qualia::{QualiaI4Row, QualiaStream};
+///
+/// let rows = vec![QualiaI4Row(1), QualiaI4Row(2), QualiaI4Row(3)];
+/// let mut stream = QualiaStream::new(&rows);
+/// assert_eq!(stream.next(), Some((0, &QualiaI4Row(1))));
+/// assert_eq!(stream.next(), Some((1, &QualiaI4Row(2))));
+/// assert_eq!(stream.next(), Some((2, &QualiaI4Row(3))));
+/// assert_eq!(stream.next(), None);
+/// ```
+pub struct QualiaStream<'a> {
+    rows: &'a [QualiaI4Row],
+    cursor: usize,
+}
+
+impl<'a> QualiaStream<'a> {
+    /// Construct a new `QualiaStream` over `rows`.
+    /// The cursor starts at index 0.
+    #[inline]
+    pub fn new(rows: &'a [QualiaI4Row]) -> Self {
+        Self { rows, cursor: 0 }
+    }
+
+    /// Total rows in the backing slice (unchanged by iteration; cf. `ExactSizeIterator::len`).
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.rows.len()
+    }
+
+    /// `true` if the backing slice is empty (regardless of the cursor position).
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.rows.is_empty()
+    }
+
+    /// Number of rows not yet yielded (decrements each time `next()` yields a row).
+    #[inline]
+    pub fn remaining(&self) -> usize {
+        self.rows.len().saturating_sub(self.cursor)
+    }
+
+    /// Reset the cursor to 0, allowing the stream to be re-iterated from the start.
+    #[inline]
+    pub fn reset(&mut self) {
+        self.cursor = 0;
+    }
+
+    /// Current cursor position (index of the NEXT row to yield; equals `len()` once exhausted).
+    #[inline]
+    pub fn cursor(&self) -> usize {
+        self.cursor
+    }
+}
+
+impl<'a> Iterator for QualiaStream<'a> {
+    type Item = (usize, &'a QualiaI4Row);
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cursor < self.rows.len() {
+            let i = self.cursor;
+            self.cursor += 1;
+            Some((i, &self.rows[i])) // `i < len` was checked above, so this index is in bounds
+        } else {
+            None // exhausted; the cursor is left untouched, so `remaining()` stays 0
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.remaining();
+        (rem, Some(rem)) // exact bounds, as `ExactSizeIterator` requires
+    }
+}
+
+impl<'a> ExactSizeIterator for QualiaStream<'a> {
+    /// Returns the number of rows not yet yielded (the inherent `len()` reports the total).
+    #[inline]
+    fn len(&self) -> usize {
+        self.remaining()
+    }
+}
+
+// ─── Tests ────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::{QualiaI4Row, QualiaStream};
+
+    /// Empty slice → stream yields nothing immediately.
+    #[test]
+    fn test_empty_stream() {
+        let rows: Vec<QualiaI4Row> = vec![];
+        let mut stream = QualiaStream::new(&rows);
+        assert!(stream.is_empty());
+        assert_eq!(stream.len(), 0);
+        assert_eq!(stream.remaining(), 0);
+        assert_eq!(stream.next(), None);
+    }
+
+    /// Stream over N rows must yield exactly N items.
+    #[test]
+    fn test_stream_yields_all_rows() {
+        let rows = vec![QualiaI4Row(0xAA), QualiaI4Row(0xBB), QualiaI4Row(0xCC), QualiaI4Row(0xDD)];
+        let stream = QualiaStream::new(&rows);
+        let collected: Vec<(usize, &QualiaI4Row)> = stream.collect();
+        assert_eq!(collected.len(), rows.len());
+        for (i, row) in collected.iter() {
+            assert_eq!(row.0, rows[*i].0);
+        }
+    }
+
+    /// Each yielded index must equal the row's position in the slice.
+    #[test]
+    fn test_stream_indices_match() {
+        let rows: Vec<QualiaI4Row> = (0u64..8).map(QualiaI4Row).collect();
+        let mut stream = QualiaStream::new(&rows);
+        let mut expected_idx = 0usize;
+        while let Some((idx, row)) = stream.next() {
+            assert_eq!(idx, expected_idx, "index mismatch at position {}", expected_idx);
+            assert_eq!(row.0, expected_idx as u64, "row value mismatch at index {}", expected_idx);
+            expected_idx += 1;
+        }
+        assert_eq!(expected_idx, rows.len());
+    }
+
+    /// `remaining()` must decrement by 1 with each `next()` call.
+    #[test]
+    fn test_remaining_decrements() {
+        let rows: Vec<QualiaI4Row> = (0u64..5).map(QualiaI4Row).collect();
+        let mut stream = QualiaStream::new(&rows);
+        assert_eq!(stream.remaining(), 5);
+        let _ = stream.next();
+        assert_eq!(stream.remaining(), 4);
+        let _ = stream.next();
+        assert_eq!(stream.remaining(), 3);
+        // Exhaust
+        while stream.next().is_some() {}
+        assert_eq!(stream.remaining(), 0);
+    }
+
+    /// After `reset()`, the stream must replay all rows from index 0.
+    #[test]
+    fn test_reset_restarts() {
+        let rows = vec![QualiaI4Row(10), QualiaI4Row(20), QualiaI4Row(30)];
+        let mut stream = QualiaStream::new(&rows);
+        // Consume all
+        while stream.next().is_some() {}
+        assert_eq!(stream.remaining(), 0);
+        // Reset and re-collect
+        stream.reset();
+        assert_eq!(stream.remaining(), 3);
+        let first = stream.next();
+        assert_eq!(first, Some((0, &QualiaI4Row(10))));
+    }
+
+    /// `ExactSizeIterator::len()` must equal `remaining()` at every step.
+    #[test]
+    fn test_exact_size_iterator() {
+        let rows: Vec<QualiaI4Row> = (0u64..6).map(QualiaI4Row).collect();
+        let mut stream = QualiaStream::new(&rows);
+        // Before iteration
+        assert_eq!(ExactSizeIterator::len(&stream), 6);
+        assert_eq!(ExactSizeIterator::len(&stream), stream.remaining());
+        // After each next()
+        for expected_remaining in (0..6usize).rev() {
+            let _ = stream.next();
+            assert_eq!(ExactSizeIterator::len(&stream), expected_remaining);
+            assert_eq!(ExactSizeIterator::len(&stream), stream.remaining());
+        }
+        assert_eq!(stream.next(), None);
+        assert_eq!(ExactSizeIterator::len(&stream), 0);
+    }
+}
diff --git a/src/hpc/stream/splat_field.rs b/src/hpc/stream/splat_field.rs
new file mode 100644
index 00000000..d0702e2f
--- /dev/null
+++ b/src/hpc/stream/splat_field.rs
@@ -0,0 +1,226 @@
+//! SplatFieldStream — forward-iterator over Gaussian-splat field samples.
+//! Per cognitive-substrate-convergence-v1.md §5 L-20 + .claude/knowledge/
+//! splat-shader-rayon-struct-method-vision.md: vertical streaming over
+//! the splat field for the D-CSV-12 splat op fleet.
+//!
+//! Each row = one Gaussian splat (mean, σ², energy). Pure iterator
+//! scaffold; `par_splat_stream` rayon variant is sprint-13+.
+
+// NOTE: SplatField is defined locally here — do NOT import lance-graph-contract
+// (would create a circular dep; ndarray is a producer, contract is a consumer).
+
+/// One Gaussian splat row: mean position, variance (σ²), accumulated energy,
+/// and a generation/cycle stamp.
+///
+/// Layout: `repr(C, align(16))` — 4 × 4-byte fields = exactly 16 bytes.
+/// `align(16)` matches the SSE/NEON minimum and is verified by
+/// `test_splat_field_size_16b`.
+#[repr(C, align(16))]
+#[derive(Clone, Copy, PartialEq, Debug, Default)]
+pub struct SplatField {
+    /// Mean position in the field space (could be index, palette ID, or BindSpace row).
+    pub mean: u32,
+    /// σ² (variance) — controls splat spread.
+    pub variance: f32,
+    /// Accumulated energy at this splat.
+    pub energy: f32,
+    /// Generation/cycle stamp for the splat.
+    pub generation: u32,
+}
+
+/// Forward-iterator over a borrowed `&[SplatField]` slice.
+///
+/// Yields `(row_index, &SplatField)` tuples in ascending index order.
+///
+/// # Example
+///
+/// ```
+/// use ndarray::hpc::stream::splat_field::{SplatField, SplatFieldStream};
+///
+/// let rows = vec![
+///     SplatField { mean: 0, variance: 1.0, energy: 0.5, generation: 1 },
+///     SplatField { mean: 1, variance: 2.0, energy: 1.5, generation: 2 },
+/// ];
+/// let mut stream = SplatFieldStream::new(&rows);
+/// let (idx, splat) = stream.next().unwrap();
+/// assert_eq!(idx, 0);
+/// assert_eq!(splat.mean, 0);
+/// ```
+pub struct SplatFieldStream<'a> {
+    rows: &'a [SplatField],
+    cursor: usize,
+}
+
+impl<'a> SplatFieldStream<'a> {
+    /// Construct a new `SplatFieldStream` over `rows`.
+    /// The cursor starts at index 0.
+    #[inline]
+    pub fn new(rows: &'a [SplatField]) -> Self {
+        Self { rows, cursor: 0 }
+    }
+
+    /// Total rows in the backing slice (unchanged by iteration; cf. `ExactSizeIterator::len`).
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.rows.len()
+    }
+
+    /// `true` if the backing slice is empty (regardless of the cursor position).
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.rows.is_empty()
+    }
+
+    /// Number of rows not yet yielded (decrements each time `next()` yields a row).
+    #[inline]
+    pub fn remaining(&self) -> usize {
+        self.rows.len().saturating_sub(self.cursor)
+    }
+
+    /// Reset the cursor to 0, allowing the stream to be re-iterated from the start.
+    #[inline]
+    pub fn reset(&mut self) {
+        self.cursor = 0;
+    }
+
+    /// Filter to only splats whose `energy` field is strictly above `threshold`.
+    ///
+    /// Consumes `self` and returns a lazy, allocation-free `impl Iterator` over the
+    /// rows not yet yielded. A NaN `energy` or `threshold` never passes the `>` test.
+    pub fn filter_energy_above(self, threshold: f32) -> impl Iterator<Item = (usize, &'a SplatField)> {
+        self.filter(move |(_, s)| s.energy > threshold)
+    }
+}
+
+impl<'a> Iterator for SplatFieldStream<'a> {
+    type Item = (usize, &'a SplatField);
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cursor < self.rows.len() {
+            let i = self.cursor;
+            self.cursor += 1;
+            Some((i, &self.rows[i])) // `i < len` was checked above, so this index is in bounds
+        } else {
+            None // exhausted; the cursor is left untouched, so `remaining()` stays 0
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.remaining();
+        (rem, Some(rem)) // exact bounds, as `ExactSizeIterator` requires
+    }
+}
+
+impl<'a> ExactSizeIterator for SplatFieldStream<'a> {
+    /// Returns the number of rows not yet yielded (the inherent `len()` reports the total).
+    #[inline]
+    fn len(&self) -> usize {
+        self.remaining()
+    }
+}
+
+// ─── Tests ────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::{SplatField, SplatFieldStream};
+    use std::mem;
+
+    fn make_splat(mean: u32, variance: f32, energy: f32, generation: u32) -> SplatField {
+        SplatField {
+            mean,
+            variance,
+            energy,
+            generation,
+        }
+    }
+
+    /// Empty slice → stream yields nothing immediately.
+    #[test]
+    fn test_splat_stream_empty() {
+        let rows: Vec<SplatField> = vec![];
+        let mut stream = SplatFieldStream::new(&rows);
+        assert!(stream.is_empty());
+        assert_eq!(stream.len(), 0);
+        assert_eq!(stream.remaining(), 0);
+        assert_eq!(stream.next(), None);
+    }
+
+    /// Stream over N rows must yield exactly N items with matching indices.
+    #[test]
+    fn test_splat_stream_yields_all() {
+        let rows = vec![make_splat(0, 1.0, 0.1, 1), make_splat(1, 2.0, 0.5, 2), make_splat(2, 0.5, 2.0, 3)];
+        let stream = SplatFieldStream::new(&rows);
+        let collected: Vec<(usize, &SplatField)> = stream.collect();
+        assert_eq!(collected.len(), 3);
+        for (idx, splat) in &collected {
+            assert_eq!(splat.mean, *idx as u32);
+        }
+    }
+
+    /// `filter_energy_above` must retain only splats strictly above the threshold.
+    #[test]
+    fn test_filter_energy_above() {
+        let rows = vec![
+            make_splat(0, 1.0, 0.1, 1),
+            make_splat(1, 1.0, 0.5, 2),
+            make_splat(2, 1.0, 1.0, 3),
+            make_splat(3, 1.0, 2.0, 4),
+        ];
+        let stream = SplatFieldStream::new(&rows);
+        let above: Vec<(usize, &SplatField)> = stream.filter_energy_above(0.5).collect();
+        // Only rows with energy > 0.5: indices 2 (1.0) and 3 (2.0).
+        assert_eq!(above.len(), 2);
+        assert_eq!(above[0].0, 2);
+        assert_eq!(above[1].0, 3);
+    }
+
+    /// `size_of::<SplatField>()` must be exactly 16 bytes — verifies `align(16)`
+    /// and field packing (4 × 4-byte fields with no hidden padding).
+    #[test]
+    fn test_splat_field_size_16b() {
+        assert_eq!(mem::size_of::<SplatField>(), 16, "SplatField must be exactly 16 bytes (4 × 4B fields, align(16))");
+        assert_eq!(mem::align_of::<SplatField>(), 16, "SplatField alignment must be 16");
+    }
+
+    /// `remaining()` must decrement by 1 with each `next()` call.
+    #[test]
+    fn test_remaining_decrements() {
+        let rows = vec![
+            make_splat(0, 1.0, 1.0, 0),
+            make_splat(1, 1.0, 1.0, 1),
+            make_splat(2, 1.0, 1.0, 2),
+            make_splat(3, 1.0, 1.0, 3),
+        ];
+        let mut stream = SplatFieldStream::new(&rows);
+        assert_eq!(stream.remaining(), 4);
+        let _ = stream.next();
+        assert_eq!(stream.remaining(), 3);
+        let _ = stream.next();
+        assert_eq!(stream.remaining(), 2);
+        // Exhaust remaining
+        while stream.next().is_some() {}
+        assert_eq!(stream.remaining(), 0);
+        assert_eq!(stream.next(), None);
+    }
+
+    /// After `reset()`, the stream replays all rows from index 0.
+    #[test]
+    fn test_reset_restarts() {
+        let rows = vec![make_splat(10, 1.0, 0.3, 1), make_splat(20, 2.0, 0.6, 2), make_splat(30, 3.0, 0.9, 3)];
+        let mut stream = SplatFieldStream::new(&rows);
+        // Consume everything
+        while stream.next().is_some() {}
+        assert_eq!(stream.remaining(), 0);
+        // Reset and verify replay
+        stream.reset();
+        assert_eq!(stream.remaining(), 3);
+        let first = stream.next();
+        assert!(first.is_some());
+        let (idx, splat) = first.unwrap();
+        assert_eq!(idx, 0);
+        assert_eq!(splat.mean, 10);
+    }
+}
diff --git a/src/simd.rs b/src/simd.rs
index ed3e0dea..b3c00f11 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -205,12 +205,19 @@ pub const PREFERRED_I16_LANES: usize = 16;
 
 // Note on the `nightly-simd` feature: it adds the `crate::simd_nightly`
 // module (a portable-simd backend wrapping `core::simd`) but does NOT
-// replace the intrinsics dispatch below. Full type-parity coverage
-// would require the nightly module to define ~30 types; the current
-// draft covers 5 (F32x16, F64x8, U8x64, U32x16, F32Mask16). Consumers
-// who want miri-runnable SIMD code import from `simd_nightly`
-// explicitly (e.g. `use ndarray::simd_nightly::F32x16`). The main
-// polyfill via `crate::simd::F32x16` continues to use intrinsics.
+// replace the intrinsics dispatch below. The polyfill ships full
+// type-parity with production (PR #146): 24 types covering F32x8/16,
+// F64x4/8, BF16x8/16, F16x16, I8x32/64, I16x16/32, I32x16, I64x8,
+// U8x32/64, U16x32, U32x8/16, U64x4/8, plus the F32/F64 mask types —
+// matches the 24 types defined in `simd_avx2.rs` + `simd_avx512.rs`.
+// Consumers who want miri-runnable SIMD code import from `simd_nightly`
+// explicitly today (e.g. `use ndarray::simd_nightly::F32x16`).
+//
+// The remaining work for Miri-clean coverage of `hpc::*` is wiring this
+// file's `pub use crate::simd_{avx512,avx2,neon}::*` re-exports to
+// route through `simd_nightly` under `cfg(miri)`. Once that lands,
+// every `use crate::simd::F32x16` call site becomes miri-checkable
+// without source changes. The polyfill itself is no longer the bottleneck.
 
 #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
 pub use crate::simd_avx512::{