AdaWorldAPI · AdaWorldAPI · May 16, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/scripts/miri-tests.sh b/scripts/miri-tests.sh
@@ -1,18 +1,87 @@
 #!/bin/sh
+#
+# Miri test runner — ephemeral nightly, scoped to this script ONLY.
+#
+# Rules of the road (do not violate):
+#   * The repo's default toolchain is stable (see rust-toolchain.toml).
+#     `cargo build`, `cargo test`, `cargo clippy`, CI's clippy / tests jobs
+#     all use stable. Nothing else opts into nightly.
+#   * This script needs nightly for two reasons: `src/simd_nightly/` is
+#     gated on `#![feature(portable_simd)]` (unstable issue #86656), and
+#     Miri itself ships only on nightly. Nightly is invoked via `+nightly`,
+#     which is an ephemeral, per-invocation switch — it does NOT change
+#     the default toolchain.
+#   * The `nightly-simd` cargo feature is enabled here ONLY. It compiles
+#     the `crate::simd_nightly` module (the `core::simd` polyfill), which
+#     Miri can execute; it does NOT yet reroute `crate::simd::*` away from
+#     the `_mm*_*` intrinsics (see "Architectural limit" below). Production
+#     builds (and CI's clippy / tests on stable) keep the intrinsics backend.
+#   * `blas` is excluded because Miri cannot FFI into `cblas_gemm`.
+#
+# If Miri stays clean, the matching CI job at `.github/workflows/ci.yaml`
+# § miri promotes this from optional → required.
 
 set -x
 set -e
 
-# We rely on layout-dependent casts, which should be covered with #[repr(transparent)]
-# This should catch if we missed that
-RUSTFLAGS="-Zrandomize-layout"
+# Idempotent install of the miri component on nightly. No-op when already
+# present (rustup short-circuits). Safe in CI fresh checkouts.
+rustup component add miri --toolchain nightly >/dev/null 2>&1 || \
+    rustup +nightly component add miri
 
-# Miri reports a stacked borrow violation deep within rayon, in a crate called crossbeam-epoch
-# The crate has a PR to fix this: https://github.com/crossbeam-rs/crossbeam/pull/871
-# but using Miri's tree borrow mode may resolve it for now.
-# Disabled until we can figure out a different rayon issue: https://github.com/rust-lang/miri/issues/1371
-# MIRIFLAGS="-Zmiri-tree-borrows"
+# Layout randomisation — catches missing `#[repr(transparent)]` and similar
+# layout-dependent UB. Cheap; always on.
+export RUSTFLAGS="-Zrandomize-layout"
 
-# General tests
-# Note that we exclude blas feature because Miri can't do cblas_gemm
-cargo miri nextest run -v -p ndarray -p ndarray-rand --features approx,serde
+# Miri reports a stacked borrow violation deep within rayon's
+# crossbeam-epoch. Upstream fix: crossbeam PR #871.
+# Tree-borrow mode resolves it but trips a different rayon issue
+# (rust-lang/miri#1371). Left disabled until both upstream stories close.
+# export MIRIFLAGS="-Zmiri-tree-borrows"
+
+# Architectural limit on the Miri sweep:
+#
+# `crate::simd::*` (the production dispatch in `src/simd.rs`) re-exports
+# from `simd_avx512` / `simd_avx2` / `simd_neon`, which call `_mm*_*` /
+# `vget*` intrinsics directly. Miri rejects those with "calling a
+# function that requires unavailable target features: avx" because the
+# Miri target doesn't enable AVX/AVX2/AVX-512/NEON target features.
+#
+# The `nightly-simd` feature ships a parallel module `crate::simd_nightly`
+# (the 24-type `core::simd` polyfill, at full parity with the 24 types
+# defined across `simd_avx2.rs` + `simd_avx512.rs` — landed in PR #146)
+# which IS Miri-checkable. But the default `crate::simd::*` dispatch is
+# NOT routed through it; consumer modules that import `crate::simd::F32x16`
+# (most of `hpc::*` + the `simd::tests::*` suite) go through intrinsics.
+# The polyfill is no longer the bottleneck — the missing piece is a
+# `cfg(miri)` switch in `src/simd.rs` that re-exports from `simd_nightly`
+# instead of `simd_avx*` under Miri.
+cargo +nightly miri nextest run -v \
+    --no-fail-fast \
+    -p ndarray -p ndarray-rand \
+    --features approx,serde,nightly-simd \
+    -E '!(
+            test(/^hpc::/) - test(/^hpc::byte_scan/)
+        ) and !test(/^simd::tests::/)
+          and !test(/^hpc::framebuffer::pyramid_tests::/)
+       '
+#
+# Filter rationale (3-clause AND):
+#
+# 1. `!(test(/^hpc::/) - test(/^hpc::byte_scan/))`
+#    Skip everything in `hpc::*` EXCEPT `hpc::byte_scan` (the scalar-fallback
+#    path validated against the `cfg(miri)` SimdCaps bypass).
+#
+# 2. `!test(/^simd::tests::/)`
+#    Skip the `simd::tests::*` suite. These exercise `crate::simd::F32x16`
+#    etc. directly — types that re-export AVX/AVX2/AVX-512 intrinsics. Miri
+#    rejects every one with "calling a function that requires unavailable
+#    target features: avx". Same architectural class as `hpc::*`. Will
+#    become miri-runnable when `crate::simd::*` gains a cfg(miri) dispatch
+#    through `simd_nightly`.
+#
+# 3. `!test(/^hpc::framebuffer::pyramid_tests::/)`
+#    Redundant while clause 1 skips `hpc::*` (bar `hpc::byte_scan`); matters
+#    once that is relaxed. The 3 pyramid tests take 19+ minutes EACH under
+#    Miri (large 2D scan loops over SIMD-shaped data) — runtime cost, not a UB
+#    signal. Re-enable once fixtures shrink or loops are cfg(miri)-shortened.
diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs
@@ -236,6 +236,11 @@ pub mod framebuffer;
 /// Transcoded from Opus CELT for the HHTL cascade → waveform pipeline.
 pub mod audio;
 
+/// Vertical streaming structs for the EdgeColumn SoA (D-CSV-11b, sprint-12).
+/// Per cognitive-substrate-convergence-v1.md §5 L-20.
+#[allow(missing_docs)]
+pub mod stream;
+
 #[cfg(all(test, feature = "hpc-extras"))]
 mod e2e_tests {
     //! End-to-end pipeline test: Fingerprint → Node → Seal → Cascade → CLAM → Causality → BNN

diff --git a/src/hpc/simd_caps.rs b/src/hpc/simd_caps.rs
@@ -100,8 +100,41 @@ pub fn simd_caps() -> SimdCaps {
 }
 
 impl SimdCaps {
+    /// Miri-only capability probe: reports every SIMD feature as absent.
+    /// Miri cannot simulate CPU feature detection (the real x86_64 path
+    /// calls `__cpuid_count`), so returning an all-`false` set steers any
+    /// test that reaches `simd_caps()` under Miri onto the scalar fallback
+    /// paths instead of aborting. Scoped to `cfg(miri)` — production builds
+    /// and stable CI use the real detection below.
+    #[cfg(miri)]
+    fn detect() -> Self {
+        Self {
+            avx2: false,
+            avx512f: false,
+            avx512bw: false,
+            avx512vl: false,
+            avx512vpopcntdq: false,
+            sse41: false,
+            sse2: false,
+            fma: false,
+            avx512vnni: false,
+            avx512vbmi: false,
+            amx_tile: false,
+            amx_int8: false,
+            amx_bf16: false,
+            avx512bf16: false,
+            avxvnniint8: false,
+            neon: false,
+            asimd_dotprod: false,
+            fp16: false,
+            aes: false,
+            sha2: false,
+            crc32: false,
+        }
+    }
+
     /// Detect CPU capabilities at runtime.
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(all(target_arch = "x86_64", not(miri)))]
     fn detect() -> Self {
         // `__cpuid_count` is safe on x86_64 (Rust 1.87+): CPUID is always
         // available on x86_64 (guaranteed by the ABI) and has no side effects
@@ -140,7 +173,7 @@ impl SimdCaps {
     /// AArch64: detect NEON sub-features via `is_aarch64_feature_detected!`.
     /// NEON itself is mandatory (always true). The sub-features distinguish
     /// Pi Zero 2 W / Pi 3 (A53) from Pi 4 (A72) from Pi 5 (A76).
-    #[cfg(target_arch = "aarch64")]
+    #[cfg(all(target_arch = "aarch64", not(miri)))]
     fn detect() -> Self {
         Self {
             // x86 fields: all false on ARM

diff --git a/src/hpc/stream/inference.rs b/src/hpc/stream/inference.rs
@@ -0,0 +1,219 @@
+//! InferenceStream — forward-iterator over a borrowed `&[InferenceRow]` slice.
+//! Per cognitive-substrate-convergence-v1.md §5 L-20: vertical streaming
+//! over the inference-mantissa lane of the EdgeColumn SoA. Used by the
+//! integer-SIMD MUL evaluation hot path (D-CSV-8 sprint-12 SIMD vec).
+//!
+//! Pure iterator scaffold; `par_inference_stream` rayon variant is sprint-13+.
+
+// Local mirror of CausalEdge64 shape (bit-compatible with causal_edge::CausalEdge64).
+// No cross-crate import: ndarray is the producer; causal-edge is the consumer.
+
+/// A single row of the EdgeColumn SoA, bit-compatible with
+/// `causal_edge::CausalEdge64` v2 layout.
+///
+/// Fields of interest for the inference-mantissa lane:
+/// - bits 46-49: signed 4-bit inference mantissa (−8..+7)
+/// - bits 53-58: W-slot corpus root handle (0..=63)
+#[repr(C, align(8))]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)]
+pub struct InferenceRow(pub u64);
+
+impl InferenceRow {
+    /// Read the 4-bit signed mantissa at bits 46-49 (matches causal-edge v2
+    /// `inference_mantissa()` exactly — see `causal-edge/src/layout.rs`).
+    ///
+    /// Sign-extension: extract the 4-bit value into an `i8`, shift it left so
+    /// bit 3 lands in the sign bit, then arithmetic-shift back: `(raw << 4) >> 4`.
+    #[inline]
+    pub fn inference_mantissa(&self) -> i8 {
+        let raw = ((self.0 >> 46) & 0xF) as i8;
+        (raw << 4) >> 4 // sign-extend 4 → 8 bits
+    }
+
+    /// Read the W-slot at bits 53-58 (6 bits, 0..=63).
+    ///
+    /// The W-slot is the witness corpus root handle per CausalEdge64 v2 L-6.
+    /// Returns 0 for zero-initialized rows.
+    #[inline]
+    pub fn w_slot(&self) -> u8 {
+        ((self.0 >> 53) & 0x3F) as u8
+    }
+}
+
+/// Forward-iterator over a borrowed slice of [`InferenceRow`] values.
+///
+/// Provides vertical streaming access to the inference-mantissa lane of the
+/// EdgeColumn SoA. Yields `(index, &InferenceRow)` tuples so callers can
+/// correlate back to the originating row without maintaining external counters.
+///
+/// # Examples
+/// ```rust
+/// use ndarray::hpc::stream::inference::{InferenceRow, InferenceStream};
+///
+/// let rows = vec![InferenceRow(0), InferenceRow(1 << 46)];
+/// let mut stream = InferenceStream::new(&rows);
+/// assert_eq!(stream.len(), 2);
+/// let (idx, row) = stream.next().unwrap();
+/// assert_eq!(idx, 0);
+/// ```
+pub struct InferenceStream<'a> {
+    rows: &'a [InferenceRow],
+    cursor: usize,
+}
+
+impl<'a> InferenceStream<'a> {
+    /// Construct a new stream over the given slice. The cursor starts at 0.
+    pub fn new(rows: &'a [InferenceRow]) -> Self {
+        Self { rows, cursor: 0 }
+    }
+
+    /// Total rows in the underlying slice, NOT remaining; shadows `ExactSizeIterator::len`.
+    pub fn len(&self) -> usize {
+        self.rows.len()
+    }
+
+    /// Returns `true` if the underlying slice is empty (NOT whether iteration is exhausted).
+    pub fn is_empty(&self) -> bool {
+        self.rows.is_empty()
+    }
+
+    /// Number of rows not yet yielded by the iterator.
+    pub fn remaining(&self) -> usize {
+        self.rows.len().saturating_sub(self.cursor)
+    }
+
+    /// Reset the cursor to the beginning so the stream can be iterated again.
+    pub fn reset(&mut self) {
+        self.cursor = 0;
+    }
+}
+
+impl<'a> Iterator for InferenceStream<'a> {
+    type Item = (usize, &'a InferenceRow);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cursor < self.rows.len() {
+            let i = self.cursor;
+            self.cursor += 1;
+            Some((i, &self.rows[i]))
+        } else {
+            None
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.remaining();
+        (rem, Some(rem))
+    }
+}
+
+impl<'a> ExactSizeIterator for InferenceStream<'a> {
+    fn len(&self) -> usize {
+        self.remaining()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_inference_stream_empty() {
+        let rows: &[InferenceRow] = &[];
+        let mut stream = InferenceStream::new(rows);
+        assert!(stream.is_empty());
+        assert_eq!(stream.len(), 0);
+        assert_eq!(stream.remaining(), 0);
+        assert!(stream.next().is_none());
+    }
+
+    #[test]
+    fn test_inference_stream_yields_all() {
+        let rows = vec![InferenceRow(0), InferenceRow(1), InferenceRow(2)];
+        let stream = InferenceStream::new(&rows);
+        let collected: Vec<_> = stream.collect();
+        assert_eq!(collected.len(), 3);
+        assert_eq!(collected[0].0, 0);
+        assert_eq!(collected[1].0, 1);
+        assert_eq!(collected[2].0, 2);
+        assert_eq!(collected[0].1 as *const _, &rows[0] as *const _);
+        assert_eq!(collected[2].1 as *const _, &rows[2] as *const _);
+    }
+
+    #[test]
+    fn test_mantissa_signed_extraction() {
+        // Pack bits 46-49 = 0b1111 = 15 (raw), which is -1 in 4-bit two's complement.
+        let raw_bits: u64 = 0b1111u64 << 46;
+        let row = InferenceRow(raw_bits);
+        assert_eq!(row.inference_mantissa(), -1);
+
+        // Pack bits 46-49 = 0b0111 = 7 (raw), positive maximum.
+        let row_pos = InferenceRow(0b0111u64 << 46);
+        assert_eq!(row_pos.inference_mantissa(), 7);
+
+        // Pack bits 46-49 = 0b1000 = 8 (raw), which is -8 in 4-bit two's complement.
+        let row_min = InferenceRow(0b1000u64 << 46);
+        assert_eq!(row_min.inference_mantissa(), -8);
+
+        // Zero mantissa.
+        let row_zero = InferenceRow(0);
+        assert_eq!(row_zero.inference_mantissa(), 0);
+    }
+
+    #[test]
+    fn test_w_slot_extraction() {
+        // Pack bits 53-58 = 0b111111 = 63 (maximum W-slot value).
+        let raw_bits: u64 = 0b111111u64 << 53;
+        let row = InferenceRow(raw_bits);
+        assert_eq!(row.w_slot(), 63);
+
+        // W-slot = 0 (zero row).
+        let row_zero = InferenceRow(0);
+        assert_eq!(row_zero.w_slot(), 0);
+
+        // W-slot = 1.
+        let row_one = InferenceRow(1u64 << 53);
+        assert_eq!(row_one.w_slot(), 1);
+
+        // W-slot = 32 (bit 58 set, bit 53 clear).
+        let row_32 = InferenceRow(32u64 << 53);
+        assert_eq!(row_32.w_slot(), 32);
+    }
+
+    #[test]
+    fn test_remaining_decrements() {
+        let rows = vec![InferenceRow(0); 4];
+        let mut stream = InferenceStream::new(&rows);
+        assert_eq!(stream.remaining(), 4);
+        stream.next();
+        assert_eq!(stream.remaining(), 3);
+        stream.next();
+        assert_eq!(stream.remaining(), 2);
+        stream.next();
+        assert_eq!(stream.remaining(), 1);
+        stream.next();
+        assert_eq!(stream.remaining(), 0);
+        // Exhausted: remaining stays 0.
+        stream.next();
+        assert_eq!(stream.remaining(), 0);
+    }
+
+    #[test]
+    fn test_reset_restarts() {
+        let rows = vec![InferenceRow(10), InferenceRow(20)];
+        let mut stream = InferenceStream::new(&rows);
+
+        // Exhaust the stream.
+        assert!(stream.next().is_some());
+        assert!(stream.next().is_some());
+        assert!(stream.next().is_none());
+        assert_eq!(stream.remaining(), 0);
+
+        // After reset, the stream yields from the beginning again.
+        stream.reset();
+        assert_eq!(stream.remaining(), 2);
+        let first = stream.next().unwrap();
+        assert_eq!(first.0, 0);
+        assert_eq!(first.1 .0, 10);
+    }
+}
diff --git a/src/hpc/stream/mod.rs b/src/hpc/stream/mod.rs
@@ -0,0 +1,15 @@
+//! Vertical streaming structs for the SoA columns.
+//! Per cognitive-substrate-convergence-v1.md §5 L-20.
+//!
+//! Sprint-12 scope (W-F4/5/6): `QualiaStream` + `InferenceStream` +
+//! `SplatFieldStream` forward-iterator scaffolds. Sprint-13+:
+//! `par_*` rayon variants once rayon is wired into the ndarray
+//! feature gate.
+
+pub mod inference;
+pub mod qualia;
+pub mod splat_field;
+
+pub use inference::{InferenceRow, InferenceStream};
+pub use qualia::{QualiaI4Row, QualiaStream};
+pub use splat_field::{SplatField, SplatFieldStream};