From ca2fc14af071b86750fbd6a0c6f07c07edaca609 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 15:18:23 +0000 Subject: [PATCH 1/3] refactor(substrate): graduate 5 modules from hpc/ to crate root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continues the substrate-graduation thread documented in #192's wrap-up and extended in #193's simd_caps lift. Five more modules move from `crate::hpc::*` (the rustynum migration staging area) to crate root where they sit alongside `simd.rs`, `simd_runtime/`, `simd_caps`, and the W1a polyfill surface they're supposed to compose with. | Module | Reason | |---|---| | `bitwise` | Pure SIMD primitives (popcount, hamming over byte slices); already uses `crate::simd::U64x8` polyfill internally; already re-exported via `simd.rs:512`. | | `heel_f64x8` | All-F64x8 polyfill consumer (dot, cosine, sum-sq, weighted-hamming); already re-exported via `simd.rs:563`. | | `distance` | Spatial 3D + slice-shape L1/L2/L∞ (PR-X10 A6); the linalg/mod.rs hard-boundary comment now points here at root. | | `byte_scan` | Pure SIMD utility (needle search, delimiter find). | | `spatial_hash` | Pure SIMD utility (bucketing, candidate gather). | # Why these five, why now All five satisfied the low-hanging-fruit criteria from #193's wrap-up discussion: 1. No internal `hpc/` dependencies (only `super::simd_caps` which still resolves correctly because `simd_caps` is itself at crate root post-#192). 2. Already polyfill-clean — no raw-intrinsic refactor needed before the move. 3. Already partially exposed via `crate::simd::*` re-exports. The next graduation tier (`fingerprint`, `dn_tree`, `ogit_bridge`, `splat3d`) needs a polyfill audit before it can move, and `fingerprint` in particular is gated on the W1a-#5 POPCOUNT-U64 primitive landing (so its bit ops can route through `U64xN.popcnt()` instead of raw `u64.count_ones()`). # Back-compat preserved end-to-end Every cross-repo consumer using `ndarray::hpc::{bitwise, heel_f64x8, distance, byte_scan, spatial_hash}::*` continues to compile unmodified. The `src/hpc/mod.rs` declarations change from `pub mod X;` to `pub use crate::X;` — Rust re-exports modules just like other items, so `crate::hpc::X::*` resolves through to the same items as `crate::X::*`. Internal `super::simd_caps::simd_caps()` calls inside the moved files continue to work because `super::` at crate root resolves to `crate::*` which has `simd_caps` (graduated in #192). # Changes - `git mv` five files from `src/hpc/` to `src/`. - `src/lib.rs` gains five `#[cfg(feature = "std")] pub mod X;` declarations next to the existing `simd_caps` block, each with a one-liner docstring naming the graduation source and the substrate-tier reason for the move. - `src/hpc/mod.rs` replaces five `pub mod X;` with `pub use crate::X;` (back-compat re-exports). - `src/hpc/linalg/mod.rs` updates the hard-boundary comment from "No distance metrics — those live in `crate::hpc::distance`" to point at `crate::distance` (the new canonical path) with a parenthetical noting the back-compat re-export. - The `bitwise.rs` declaration in `src/hpc/mod.rs` is now a comment instead of being interleaved with `pub mod hdc`/`pub mod projection` to make the graduation status visible at a glance. # Verification - `cargo build -p ndarray --lib` — clean - `cargo build -p ndarray --lib --no-default-features` — clean (the new `#[cfg(feature = "std")]` gates match the existing `simd_caps` pattern; nostd targets see no change) - `cargo test -p ndarray --lib bitwise:: distance:: heel_f64x8:: byte_scan:: spatial_hash::` — all 119 tests on the five graduated modules pass at the new path (test names now `bitwise::tests::*` rather than `hpc::bitwise::tests::*`) - `cargo test -p ndarray --lib --features "pillar,ogit_bridge, runtime-dispatch" hpc::` — 2167 passed, 0 failed, 28 ignored - `cargo fmt --all --check` — clean - `cargo clippy --features "pillar,ogit_bridge,runtime-dispatch" --lib -- -D warnings` — clean # Next graduation candidates (deferred) - `hpc::fingerprint` — needs W1a-#5 POPCOUNT-U64 to land first so bit ops can route through `U64xN.popcnt()` instead of raw `u64.count_ones()`. Cognitive-shader-foundation explicitly names `Fingerprint` as a MUST-be-in-`ndarray::simd::*` type. - `hpc::dn_tree` (bitwise core) — same polyfill-audit dependency. The cognitive DNTree/DNConfig/TraversalHit state stays in `hpc/` after the split. - `hpc::ogit_bridge` — pure logic, no SIMD, can move once the fingerprint + dn_tree audits are out of the way (avoids three partial graduations in flight at once). - `hpc::splat3d` — already mostly polyfill-clean; pure path rewrite. Defer because it's a larger consumer surface than the five in this PR. --- src/{hpc => }/bitwise.rs | 0 src/{hpc => }/byte_scan.rs | 0 src/{hpc => }/distance.rs | 0 src/{hpc => }/heel_f64x8.rs | 0 src/hpc/linalg/mod.rs | 3 ++- src/hpc/mod.rs | 16 +++++++++------- src/lib.rs | 29 +++++++++++++++++++++++++++++ src/{hpc => }/spatial_hash.rs | 0 8 files changed, 40 insertions(+), 8 deletions(-) rename src/{hpc => }/bitwise.rs (100%) rename src/{hpc => }/byte_scan.rs (100%) rename src/{hpc => }/distance.rs (100%) rename src/{hpc => }/heel_f64x8.rs (100%) rename src/{hpc => }/spatial_hash.rs (100%) diff --git a/src/hpc/bitwise.rs b/src/bitwise.rs similarity index 100% rename from src/hpc/bitwise.rs rename to src/bitwise.rs diff --git a/src/hpc/byte_scan.rs b/src/byte_scan.rs similarity index 100% rename from src/hpc/byte_scan.rs rename to src/byte_scan.rs diff --git a/src/hpc/distance.rs b/src/distance.rs similarity index 100% rename from src/hpc/distance.rs rename to src/distance.rs diff --git a/src/hpc/heel_f64x8.rs b/src/heel_f64x8.rs similarity index 100% rename from src/hpc/heel_f64x8.rs rename to src/heel_f64x8.rs diff --git a/src/hpc/linalg/mod.rs b/src/hpc/linalg/mod.rs index 58608f40..a2963c94 100644 --- a/src/hpc/linalg/mod.rs +++ b/src/hpc/linalg/mod.rs @@ -40,7 +40,8 @@ //! //! - **No SIMD primitives** — use `crate::simd::{F32x16, …}` directly. //! - **No `#[target_feature]` annotations** — those live in `simd_avx512.rs`. -//! - **No distance metrics** — those live in `crate::hpc::distance`. +//! - **No distance metrics** — those live in `crate::distance` (graduated +//! from `crate::hpc::distance`; back-compat re-export in `crate::hpc::*`). mod matrix; pub use matrix::{Mat2, Mat3, Mat4, MatN, Spd2, Spd3}; diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index ff7981fc..5e0022a5 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -27,7 +27,8 @@ pub mod reductions; pub mod statistics; pub mod activations; pub mod hdc; -pub mod bitwise; +// Bitwise SIMD primitives — graduated to crate root. Back-compat re-export. +pub use crate::bitwise; pub mod projection; pub mod cogrecord; pub mod graph; @@ -56,8 +57,8 @@ pub mod soa; pub mod node; #[allow(missing_docs)] pub mod cascade; -#[allow(missing_docs)] -pub mod heel_f64x8; +// HEEL F64x8 distance kernels — graduated to crate root. Back-compat re-export. +pub use crate::heel_f64x8; // AMX is an x86_64-only ISA (Intel Sapphire Rapids+); both modules use // `asm!` with `rcx`/`rax` register names that don't exist on other // architectures (rejected at parse time on s390x / aarch64 / wasm32). @@ -169,10 +170,11 @@ pub mod parallel_search; // ZeckF64 progressive edge encoding + batch/top-k pub mod zeck; -// SIMD-accelerated spatial / byte-scan / hash utilities -pub mod distance; -pub mod byte_scan; -pub mod spatial_hash; +// SIMD-accelerated spatial / byte-scan / hash utilities — graduated to crate root. +// Back-compat re-exports for existing `use ndarray::hpc::{distance,byte_scan,spatial_hash}::*`. +pub use crate::byte_scan; +pub use crate::distance; +pub use crate::spatial_hash; // Variable-width palette index codec (Minecraft-style bit packing) #[allow(missing_docs)] diff --git a/src/lib.rs b/src/lib.rs index 5b5851fd..e6c33e7c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -272,6 +272,35 @@ pub mod simd_amx; #[cfg(feature = "std")] pub mod simd_caps; +/// Bitwise SIMD primitives — popcount, Hamming distance over byte slices. +/// Graduated from `crate::hpc::bitwise::*` (substrate-tier; uses +/// `crate::simd::U64x8` polyfill internally). Back-compat re-export in +/// `crate::hpc::*` preserves existing import paths. +#[cfg(feature = "std")] +pub mod bitwise; + +/// F64x8 HEEL distance kernels — 8-plane weighted Hamming, f64 SIMD +/// dot / cosine / sum-of-squares. Graduated from `crate::hpc::heel_f64x8::*`. +#[cfg(feature = "std")] +pub mod heel_f64x8; + +/// Batch distance computations — spatial 3D-point queries + +/// slice-shape L1 / L2 / L∞ (PR-X10 A6). Graduated from +/// `crate::hpc::distance::*`. +#[cfg(feature = "std")] +pub mod distance; + +/// SIMD-accelerated byte-scan utilities — needle search, delimiter +/// finding, parallel byte comparison. Graduated from +/// `crate::hpc::byte_scan::*`. +#[cfg(feature = "std")] +pub mod byte_scan; + +/// SIMD-accelerated spatial hash — bucketing, candidate gather, hash +/// collision detection. Graduated from `crate::hpc::spatial_hash::*`. +#[cfg(feature = "std")] +pub mod spatial_hash; + #[cfg(feature = "std")] #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] pub mod simd_neon; diff --git a/src/hpc/spatial_hash.rs b/src/spatial_hash.rs similarity index 100% rename from src/hpc/spatial_hash.rs rename to src/spatial_hash.rs From 417131bc36e75cf097788fc6199e94e4bd2fca83 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 15:28:23 +0000 Subject: [PATCH 2/3] fix(lints+docs): clippy clean on graduated modules + rustdoc examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two CI / review-feedback fixes on PR #194: 1) Clippy (`-D warnings`, --features rayon / approx,serde,rayon): 10 errors surfaced because the graduated modules are no longer under hpc/mod.rs's `#![allow(clippy::all, ...)]` umbrella. Fixed each at the canonical Rust idiom rather than re-applying the umbrella — graduated modules hold to the higher standard: - `bitwise.rs:110,155` — `acc = acc + (x + y);` → `acc += x + y;` (assign_op_pattern; bare without parens). - `distance.rs:99` — `for j in i..n { points[j] }` → `for p in &points[i..n] { p[0]..p[2] }` (needless_range_loop). - `byte_scan.rs:38,71` — `for j in i..n { if haystack[j] == needle { result.push(j); }}` → `for (offset, &byte) in haystack[i..n] .iter().enumerate() { if byte == needle { result.push(i + offset); }}` (needless_range_loop with index preservation). - `byte_scan.rs:101,129` — same pattern but counting via `total += 1`; switched to `for &byte in &haystack[i..n]` (no index needed at all). - `spatial_hash.rs:334` — `for lane in 0..8 { d2_arr[lane] }` → `for (lane, &d2_lane) in d2_arr.iter().enumerate()`. - `spatial_hash.rs:342` — `for i in (chunks*8)..candidates.len() { candidates[i] }` → `let tail_start = chunks * 8; for (offset, &cand) in candidates[tail_start..].iter() .enumerate() { result.push((tail_start + offset, d2)) }`. 2) CodeRabbit + CLAUDE.md hard rule ("All public APIs need /// doc comments with examples"): Added `# Example` rustdoc blocks to each of the five new `pub mod` declarations in `src/lib.rs`. Each example uses only the actual public API of the target module (verified by running the doctests): - `bitwise` — popcount + hamming over [0xFF; 16]/[0x00; 16] - `heel_f64x8` — cosine_f64_simd + dot_f64_simd over [1.0; 32] - `distance` — l1/l2/linf_f64_simd over pythagorean (3,0)/(0,4) - `byte_scan` — byte_find_all over a 23-char fixture - `spatial_hash` — SpatialHash::new + insert + len All five doctests pass under `cargo test --doc`. `cargo clippy --features rayon --lib -- -D warnings` and `cargo clippy --features approx,serde,rayon -- -D warnings` both green. --- src/bitwise.rs | 4 ++-- src/byte_scan.rs | 20 +++++++++--------- src/distance.rs | 10 ++++----- src/lib.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++ src/spatial_hash.rs | 13 ++++++------ 5 files changed, 74 insertions(+), 23 deletions(-) diff --git a/src/bitwise.rs b/src/bitwise.rs index 870314e8..0d849dde 100644 --- a/src/bitwise.rs +++ b/src/bitwise.rs @@ -107,7 +107,7 @@ unsafe fn hamming_avx512bw(a: &[u8], b: &[u8]) -> u64 { let hi = xor.shr_epi16(4) & low_mask; let popcnt_lo = lookup.shuffle_bytes(lo); let popcnt_hi = lookup.shuffle_bytes(hi); - acc = acc + (popcnt_lo + popcnt_hi); + acc += popcnt_lo + popcnt_hi; i += 64; inner_count += 1; @@ -152,7 +152,7 @@ unsafe fn popcount_avx512bw(a: &[u8]) -> u64 { let hi = va.shr_epi16(4) & low_mask; let popcnt_lo = lookup.shuffle_bytes(lo); let popcnt_hi = lookup.shuffle_bytes(hi); - acc = acc + (popcnt_lo + popcnt_hi); + acc += popcnt_lo + popcnt_hi; i += 64; inner_count += 1; diff --git a/src/byte_scan.rs b/src/byte_scan.rs index 4f692cc6..6667bf74 100644 --- a/src/byte_scan.rs +++ b/src/byte_scan.rs @@ -35,9 +35,9 @@ pub(crate) mod simd_impl { i += 32; } // Scalar tail - for j in i..n { - if haystack[j] == needle { - result.push(j); + for (offset, &byte) in haystack[i..n].iter().enumerate() { + if byte == needle { + result.push(i + offset); } } result @@ -68,9 +68,9 @@ pub(crate) mod simd_impl { i += 64; } // Scalar tail - for j in i..n { - if haystack[j] == needle { - result.push(j); + for (offset, &byte) in haystack[i..n].iter().enumerate() { + if byte == needle { + result.push(i + offset); } } result @@ -98,8 +98,8 @@ pub(crate) mod simd_impl { } i += 32; } - for j in i..n { - if haystack[j] == needle { + for &byte in &haystack[i..n] { + if byte == needle { total += 1; } } @@ -126,8 +126,8 @@ pub(crate) mod simd_impl { total += mask.count_ones() as usize; i += 64; } - for j in i..n { - if haystack[j] == needle { + for &byte in &haystack[i..n] { + if byte == needle { total += 1; } } diff --git a/src/distance.rs b/src/distance.rs index 79f4229e..d85e3242 100644 --- a/src/distance.rs +++ b/src/distance.rs @@ -96,10 +96,10 @@ pub(crate) mod simd_impl { } // Scalar tail - for j in i..n { - let dx = query[0] - points[j][0]; - let dy = query[1] - points[j][1]; - let dz = query[2] - points[j][2]; + for p in &points[i..n] { + let dx = query[0] - p[0]; + let dy = query[1] - p[1]; + let dz = query[2] - p[2]; out.push(dx * dx + dy * dy + dz * dz); } } @@ -211,7 +211,7 @@ pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 { for i in 0..chunks { let va = F64x8::from_slice(&a[i * 8..]); let vb = F64x8::from_slice(&b[i * 8..]); - acc = acc + (va - vb).abs(); + acc += (va - vb).abs(); } let mut sum = acc.reduce_sum(); let offset = chunks * 8; diff --git a/src/lib.rs b/src/lib.rs index e6c33e7c..70d56b9e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -276,28 +276,78 @@ pub mod simd_caps; /// Graduated from `crate::hpc::bitwise::*` (substrate-tier; uses /// `crate::simd::U64x8` polyfill internally). Back-compat re-export in /// `crate::hpc::*` preserves existing import paths. +/// +/// # Example +/// +/// ``` +/// use ndarray::bitwise::{hamming_distance_raw, popcount_raw}; +/// let a = [0xFFu8; 16]; +/// let b = [0x00u8; 16]; +/// assert_eq!(hamming_distance_raw(&a, &b), 128); +/// assert_eq!(popcount_raw(&a), 128); +/// ``` #[cfg(feature = "std")] pub mod bitwise; /// F64x8 HEEL distance kernels — 8-plane weighted Hamming, f64 SIMD /// dot / cosine / sum-of-squares. Graduated from `crate::hpc::heel_f64x8::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::heel_f64x8::{cosine_f64_simd, dot_f64_simd}; +/// let a = vec![1.0_f64; 32]; +/// let b = vec![1.0_f64; 32]; +/// assert!((cosine_f64_simd(&a, &b) - 1.0).abs() < 1e-10); +/// assert!((dot_f64_simd(&a, &b) - 32.0).abs() < 1e-10); +/// ``` #[cfg(feature = "std")] pub mod heel_f64x8; /// Batch distance computations — spatial 3D-point queries + /// slice-shape L1 / L2 / L∞ (PR-X10 A6). Graduated from /// `crate::hpc::distance::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::distance::{l1_f64_simd, l2_f64_simd, linf_f64_simd}; +/// let a = vec![3.0_f64, 0.0]; +/// let b = vec![0.0_f64, 4.0]; +/// assert!((l1_f64_simd(&a, &b) - 7.0).abs() < 1e-12); +/// assert!((l2_f64_simd(&a, &b) - 5.0).abs() < 1e-12); +/// assert!((linf_f64_simd(&a, &b) - 4.0).abs() < 1e-12); +/// ``` #[cfg(feature = "std")] pub mod distance; /// SIMD-accelerated byte-scan utilities — needle search, delimiter /// finding, parallel byte comparison. Graduated from /// `crate::hpc::byte_scan::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::byte_scan::byte_find_all; +/// let haystack = b"hello world, hello rust"; +/// let hits = byte_find_all(haystack, b'l'); +/// assert_eq!(hits, vec![2, 3, 9, 15, 16]); +/// ``` #[cfg(feature = "std")] pub mod byte_scan; /// SIMD-accelerated spatial hash — bucketing, candidate gather, hash /// collision detection. Graduated from `crate::hpc::spatial_hash::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::spatial_hash::SpatialHash; +/// let mut grid = SpatialHash::new(1.0); +/// grid.insert(0, 0.0, 0.0, 0.0); +/// grid.insert(1, 10.0, 10.0, 10.0); +/// assert_eq!(grid.len(), 2); +/// ``` #[cfg(feature = "std")] pub mod spatial_hash; diff --git a/src/spatial_hash.rs b/src/spatial_hash.rs index ae4303a2..84a4f117 100644 --- a/src/spatial_hash.rs +++ b/src/spatial_hash.rs @@ -331,18 +331,19 @@ pub(crate) unsafe fn batch_sq_dist_avx2(query: [f32; 3], candidates: &[[f32; 3]] // Compare: d2 <= radius_sq (scalar array comparison — no F32x8 cmp polyfill) let d2_arr = d2.to_array(); - for lane in 0..8 { - if d2_arr[lane] <= radius_sq { - result.push((base + lane, d2_arr[lane])); + for (lane, &d2_lane) in d2_arr.iter().enumerate() { + if d2_lane <= radius_sq { + result.push((base + lane, d2_lane)); } } } // Scalar tail - for i in (chunks * 8)..candidates.len() { - let d2 = sq_dist_f32(query, candidates[i]); + let tail_start = chunks * 8; + for (offset, &cand) in candidates[tail_start..].iter().enumerate() { + let d2 = sq_dist_f32(query, cand); if d2 <= radius_sq { - result.push((i, d2)); + result.push((tail_start + offset, d2)); } } From 17d0eae2915a31402dc2971e333e55d810b90446 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 21 May 2026 18:02:13 +0000 Subject: [PATCH 3/3] refactor(substrate): graduate 4 more modules from hpc/ to crate root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continues the substrate-graduation thread from #192 (simd_caps), #193 (clippy/doc cleanup), and #194 (bitwise/heel_f64x8/distance/ byte_scan/spatial_hash). Same low-hanging-fruit criteria — no internal hpc/ deps, polyfill-clean, single-line back-compat shim keeps every existing import resolving. | Module | Reason | |---|---| | `aabb` | SIMD AABB intersection/expansion/distance; only deps are | | | `crate::simd::F32x16` + `super::simd_caps` (graduated #192). | | `nibble` | 4-bit packed nibble batch ops; only dep is `crate::simd::U8x64`.| | `palette_codec` | Variable-width palette index codec (1-8 bit packing); zero deps.| | `property_mask` | AVX-512 VPTERNLOGD bitset queries on block state bits; | | | only dep is `crate::simd::U64x8`. | # Why these four, why now All four satisfy the criteria from #194's wrap-up: 1. No internal `hpc/` dependencies — only `crate::simd::*` (polyfill surface) and `super::simd_caps` (which is itself at crate root post-#192). 2. Polyfill-clean — no raw-intrinsic refactor required. 3. Single in-tree downstream caller (`hpc::framebuffer` uses `palette_codec`) → the `pub use crate::palette_codec;` back-compat shim keeps that resolution working zero-touch. # Mechanical changes - `git mv src/hpc/{aabb,nibble,palette_codec,property_mask}.rs src/` - `src/lib.rs`: added four `pub mod` declarations under `#[cfg(feature = "std")]`, each with a `# Example` rustdoc block per CLAUDE.md "all public APIs need doc comments with examples". - `src/hpc/mod.rs`: replaced the four `pub mod` declarations with `pub use crate::{aabb, nibble, palette_codec, property_mask};` back-compat re-exports. `crate::hpc::aabb::*` and friends keep resolving for every existing call site, identical to how `crate::hpc::bitwise::*` works post-#194. # Clippy / lint cleanup 17 clippy errors surfaced under `-D warnings` once the modules left the `hpc/mod.rs` `#![allow(clippy::all, ...)]` umbrella. Fixed each at the canonical Rust idiom (the #194 cleanup pattern, 417131bc), no umbrella re-application: - **manual_div_ceil (6 sites)** — `(n + d - 1) / d` → `n.div_ceil(d)` in `nibble.rs` (x2), `palette_codec.rs` (x3), `property_mask.rs`. - **needless_range_loop (10 sites)** — `for i in start..vec.len()` rewrites to `for x in &vec[start..]` (when index unused) or `for (i, &x) in iter().enumerate().skip(start)` (when index used). Sites: `aabb.rs` x4, `nibble.rs` x3, `palette_codec.rs` x1, `property_mask.rs` x2. - **missing_docs (4 sites)** — added field doc comments on `pub struct Aabb { min, max }` and `pub struct Ray { origin, inv_dir }`. Previously masked by the `hpc/mod.rs` umbrella's `#![allow(missing_docs)]`. # Doctest correction Initial `# Example` in `src/lib.rs` for `palette_codec` asserted `bits_for_palette_size(1) == 1` per the module's own docstring table, but the impl returns 0 for `palette_size <= 1` (trivial- palette special case). Changed assertion to use `bits_for_palette_ size(2) == 1` — exercises the same code path with input the impl actually handles per spec. # Verification ``` cargo check --lib green cargo clippy --lib -- -D warnings green cargo clippy --lib --features rayon -- -D warnings green cargo clippy --features approx,serde,rayon -- -D warnings green cargo test --doc (15 graduated-module doctests) pass cargo test --lib (104 unit tests across 4 modules) pass ``` # What's next `hpc/` inventory: ~55 → ~51 modules at the staging path. Next-batch candidates per the same criteria need a deps audit before move: `framebuffer` (uses `palette_codec` shim, otherwise crate-root), `ocr_simd`/`ocr_felt`, `audio`. Filed in AGENT_LOG entry for the follow-up pass. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u --- .claude/board/AGENT_LOG.md | 54 +++++++++++++++++++++++++++++++ src/{hpc => }/aabb.rs | 19 ++++++----- src/hpc/mod.rs | 20 ++++++------ src/lib.rs | 58 ++++++++++++++++++++++++++++++++++ src/{hpc => }/nibble.rs | 24 +++++++------- src/{hpc => }/palette_codec.rs | 11 +++---- src/{hpc => }/property_mask.rs | 9 +++--- 7 files changed, 153 insertions(+), 42 deletions(-) rename src/{hpc => }/aabb.rs (97%) rename src/{hpc => }/nibble.rs (96%) rename src/{hpc => }/palette_codec.rs (98%) rename src/{hpc => }/property_mask.rs (99%) diff --git a/.claude/board/AGENT_LOG.md b/.claude/board/AGENT_LOG.md index b6948deb..cf9133a9 100644 --- a/.claude/board/AGENT_LOG.md +++ b/.claude/board/AGENT_LOG.md @@ -28,6 +28,60 @@ ## Entries (append below; newest first) +## 2026-05-21T16:00 — substrate-graduation batch 3 (opus 4.7) + +**Branch:** `claude/continue-ndarray-x0Oaw` +**Continues:** PR #194 batch of 5 (`bitwise`/`heel_f64x8`/`distance`/`byte_scan`/`spatial_hash`) + #193 (`simd_caps`). +**Verdict:** SHIP — `cargo check`, `cargo clippy --features approx,serde,rayon -- -D warnings`, doctest suite (15 graduated-module doctests pass), and unit tests (104 lib tests pass) all green. + +**Modules graduated (4):** + +| Module | Old path | New path | Internal hpc/ deps? | +|---|---|---|---| +| `aabb` | `src/hpc/aabb.rs` | `src/aabb.rs` | None — only `super::simd_caps` (now resolves via crate root) | +| `nibble` | `src/hpc/nibble.rs` | `src/nibble.rs` | None — only `super::simd_caps` | +| `palette_codec` | `src/hpc/palette_codec.rs` | `src/palette_codec.rs` | None — pure logic | +| `property_mask` | `src/hpc/property_mask.rs` | `src/property_mask.rs` | None — only `super::simd_caps` | + +**Why these four, why now (criteria carried over from #194 wrap-up):** +1. No internal `hpc/` dependencies. All four only reach into `crate::simd::*` (the polyfill surface) and `super::simd_caps` (itself at crate root post-#192). +2. Already polyfill-clean — no raw-intrinsic refactor required before the move. +3. Single in-tree downstream caller (`hpc::framebuffer` imports `palette_codec`) → the `pub use crate::palette_codec;` back-compat shim in `hpc/mod.rs` keeps that resolution working zero-touch. + +**Changes:** +- `git mv src/hpc/{aabb,nibble,palette_codec,property_mask}.rs src/` +- Added `pub mod {aabb, nibble, palette_codec, property_mask};` to `src/lib.rs` (with `# Example` rustdoc blocks per CLAUDE.md hard rule "all public APIs need /// doc comments with examples"). +- Replaced the four `pub mod` declarations in `src/hpc/mod.rs` with `pub use crate::{aabb, nibble, palette_codec, property_mask};` back-compat re-exports. + +**Lint follow-ups (graduated modules lose the `#![allow(clippy::all, …)]` umbrella that `hpc/mod.rs` carries):** + +17 clippy errors surfaced under `-D warnings`. All fixed at the canonical Rust idiom rather than re-applying the umbrella, per the #194 cleanup precedent (417131bc): + +- **`manual_div_ceil` (6 sites)**: `(n + d - 1) / d` → `n.div_ceil(d)` in `nibble.rs` (×2), `palette_codec.rs` (×3), `property_mask.rs` (×1). +- **`needless_range_loop` (10 sites)**: `for i in start..vec.len() { vec[i] }` → `for x in &vec[start..]` or `for (i, &x) in iter().enumerate()` depending on whether the index is used. Sites in `aabb.rs` (×4), `nibble.rs` (×3), `palette_codec.rs` (×1), `property_mask.rs` (×2). +- **`missing_docs` (4 sites)**: Added field doc comments on `pub struct Aabb { min, max }` and `pub struct Ray { origin, inv_dir }` — these were previously caught by the `hpc/mod.rs` umbrella's `#![allow(missing_docs)]`. + +**Doctest fix:** Initial `bits_for_palette_size(1) → 1` in the `lib.rs` `# Example` block was wrong — the actual impl returns 0 for `palette_size <= 1` (trivial-palette special case; the bits/indices table in `palette_codec.rs`'s module docstring overpromises). Changed example to `bits_for_palette_size(2) → 1`. + +**Verification:** + +``` +cargo check --lib → clean +cargo clippy --lib -- -D warnings → clean +cargo clippy --lib --features rayon -- -D warnings → clean +cargo clippy --features approx,serde,rayon -- -D warnings → clean +cargo test --doc (filtered: graduated modules) → 15 doctests pass +cargo test --lib aabb::tests nibble::tests palette_codec::tests property_mask::tests → 104 unit tests pass +``` + +**No back-compat break:** every existing `use ndarray::hpc::{aabb, nibble, palette_codec, property_mask}::*` continues to resolve via the `pub use crate::*` shims in `hpc/mod.rs`. Verified via `cargo check` of the full workspace — `framebuffer.rs:29` (the one in-tree downstream consumer of `palette_codec`) compiles unchanged. + +**Remaining hpc/ inventory after this batch:** ~55 → ~51 modules at crate root path `crate::hpc::*`. Next-batch candidates (still low-hanging by the same criteria) — to be audited in a separate pass before move: `framebuffer` (depends on `palette_codec` shim, otherwise pure crate-root), `ocr_simd`/`ocr_felt` (need dep audit), `audio` (depends on `crate::simd`). + +**Commit:** TBD (pending push). + +--- + ## 2026-05-13T00:00 — agent #3 polyfill-ops (sonnet) **File:** `src/simd_ops.rs` (288 lines) diff --git a/src/hpc/aabb.rs b/src/aabb.rs similarity index 97% rename from src/hpc/aabb.rs rename to src/aabb.rs index 43770fe8..180dbf1a 100644 --- a/src/hpc/aabb.rs +++ b/src/aabb.rs @@ -17,7 +17,9 @@ #[derive(Debug, Clone, Copy, PartialEq)] #[repr(C)] pub struct Aabb { + /// Minimum corner of the bounding box (x, y, z). pub min: [f32; 3], + /// Maximum corner of the bounding box (x, y, z). pub max: [f32; 3], } @@ -97,7 +99,10 @@ impl Aabb { #[derive(Debug, Clone, Copy, PartialEq)] #[repr(C)] pub struct Ray { + /// Ray origin point (x, y, z). pub origin: [f32; 3], + /// Per-axis reciprocal of the ray direction (1 / dx, 1 / dy, 1 / dz); + /// `inf` is valid (encodes a zero-component direction, slab test skips it). pub inv_dir: [f32; 3], } @@ -122,8 +127,7 @@ impl Ray { #[inline] fn sq_dist_point_aabb(point: [f32; 3], aabb: &Aabb) -> f32 { let mut dist_sq = 0.0f32; - for axis in 0..3 { - let v = point[axis]; + for (axis, &v) in point.iter().enumerate() { if v < aabb.min[axis] { let d = aabb.min[axis] - v; dist_sq += d * d; @@ -230,8 +234,8 @@ unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec< } // Scalar tail - for i in (chunks * 16)..candidates.len() { - result.push(query.intersects(&candidates[i])); + for cand in &candidates[chunks * 16..] { + result.push(query.intersects(cand)); } result @@ -403,16 +407,15 @@ unsafe fn ray_aabb_slab_test_avx512(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Ve let t_enter_clamped = t_enter.simd_max(zero); let t_arr = t_enter_clamped.to_array(); - for i in 0..16 { + for (i, &t) in t_arr.iter().enumerate() { let hit = (hit_mask >> i) & 1 != 0; hits.push(hit); - t_values.push(if hit { t_arr[i] } else { f32::MAX }); + t_values.push(if hit { t } else { f32::MAX }); } } // Scalar tail for remainder - for i in (chunks * 16)..aabbs.len() { - let aabb = &aabbs[i]; + for aabb in &aabbs[chunks * 16..] { let mut t_enter = f32::NEG_INFINITY; let mut t_exit = f32::INFINITY; diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index 5e0022a5..3e195b8b 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -176,17 +176,15 @@ pub use crate::byte_scan; pub use crate::distance; pub use crate::spatial_hash; -// Variable-width palette index codec (Minecraft-style bit packing) -#[allow(missing_docs)] -pub mod palette_codec; - -// SIMD-accelerated HPC modules (block properties, nibble light data, AABB collision) -#[allow(missing_docs)] -pub mod property_mask; -#[allow(missing_docs)] -pub mod nibble; -#[allow(missing_docs)] -pub mod aabb; +// Variable-width palette index codec — graduated to crate root. +// Back-compat re-export for existing `use ndarray::hpc::palette_codec::*`. +pub use crate::palette_codec; + +// SIMD-accelerated HPC modules (block properties, nibble light data, AABB +// collision) — all three graduated to crate root. Back-compat re-exports. +pub use crate::aabb; +pub use crate::nibble; +pub use crate::property_mask; // Holographic phase-space operations (ported from rustynum-holo) #[allow(missing_docs)] diff --git a/src/lib.rs b/src/lib.rs index 70d56b9e..c19d6fc7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -351,6 +351,64 @@ pub mod byte_scan; #[cfg(feature = "std")] pub mod spatial_hash; +/// Axis-aligned bounding box batch operations — SIMD-accelerated +/// intersection, expansion, distance queries. Graduated from +/// `crate::hpc::aabb::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::aabb::Aabb; +/// let a = Aabb::new([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]); +/// let b = Aabb::new([0.5, 0.5, 0.5], [1.5, 1.5, 1.5]); +/// assert!(a.intersects(&b)); +/// ``` +#[cfg(feature = "std")] +pub mod aabb; + +/// Nibble batch operations for 4-bit packed data (light levels, palettes). +/// Graduated from `crate::hpc::nibble::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::nibble::nibble_unpack; +/// let unpacked = nibble_unpack(&[0x3A], 2); +/// assert_eq!(unpacked, vec![0xA, 0x3]); +/// ``` +#[cfg(feature = "std")] +pub mod nibble; + +/// Variable-width palette index codec (Minecraft-style bit packing). +/// Packs/unpacks palette indices (0–255) into 1–8 bit widths. +/// Graduated from `crate::hpc::palette_codec::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::palette_codec::bits_for_palette_size; +/// assert_eq!(bits_for_palette_size(2), 1); +/// assert_eq!(bits_for_palette_size(16), 4); +/// assert_eq!(bits_for_palette_size(256), 8); +/// ``` +#[cfg(feature = "std")] +pub mod palette_codec; + +/// Block property mask — compiled bitset queries on block state bits. +/// AVX-512 VPTERNLOGD tests 3 conditions in 1 cycle. Graduated from +/// `crate::hpc::property_mask::*`. +/// +/// # Example +/// +/// ``` +/// use ndarray::property_mask::PropertyMask; +/// let mask = PropertyMask::new().require_bit(0).forbid_bit(3); +/// assert!(mask.test(0b0000_0001)); // bit 0 set, bit 3 clear → match +/// assert!(!mask.test(0b0000_1001)); // bit 3 set → no match +/// ``` +#[cfg(feature = "std")] +pub mod property_mask; + #[cfg(feature = "std")] #[allow(clippy::all, missing_docs, dead_code, unused_variables, unused_imports)] pub mod simd_neon; diff --git a/src/hpc/nibble.rs b/src/nibble.rs similarity index 96% rename from src/hpc/nibble.rs rename to src/nibble.rs index 05659f95..560c7c72 100644 --- a/src/hpc/nibble.rs +++ b/src/nibble.rs @@ -21,7 +21,7 @@ /// assert_eq!(nibble_unpack(packed, 2), vec![0xA, 0x3]); /// ``` pub fn nibble_unpack(packed: &[u8], count: usize) -> Vec { - assert!(packed.len() >= (count + 1) / 2, "packed buffer too small"); + assert!(packed.len() >= count.div_ceil(2), "packed buffer too small"); let mut out = Vec::with_capacity(count); @@ -105,7 +105,7 @@ pub(crate) unsafe fn nibble_unpack_avx2(packed: &[u8], count: usize, out: &mut V /// assert_eq!(packed, vec![0x3A]); /// ``` pub fn nibble_pack(values: &[u8]) -> Vec { - let out_len = (values.len() + 1) / 2; + let out_len = values.len().div_ceil(2); let mut out = vec![0u8; out_len]; for (i, &v) in values.iter().enumerate() { @@ -175,10 +175,10 @@ unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) { let mut data = [0u8; 32]; data.copy_from_slice(&packed[offset..offset + 32]); - for j in 0..32 { - let lo = (data[j] & 0x0F).saturating_sub(delta); - let hi = ((data[j] >> 4) & 0x0F).saturating_sub(delta); - data[j] = lo | (hi << 4); + for byte in &mut data { + let lo = (*byte & 0x0F).saturating_sub(delta); + let hi = ((*byte >> 4) & 0x0F).saturating_sub(delta); + *byte = lo | (hi << 4); } packed[offset..offset + 32].copy_from_slice(&data); @@ -263,9 +263,9 @@ pub(crate) unsafe fn nibble_above_threshold_avx2(packed: &[u8], threshold: u8) - let base_byte = c * 32; let chunk = &packed[base_byte..base_byte + 32]; - for j in 0..32 { - let lo = chunk[j] & 0x0F; - let hi = (chunk[j] >> 4) & 0x0F; + for (j, &b) in chunk.iter().enumerate() { + let lo = b & 0x0F; + let hi = (b >> 4) & 0x0F; if lo > threshold { result.push((base_byte + j) * 2); } @@ -277,9 +277,9 @@ pub(crate) unsafe fn nibble_above_threshold_avx2(packed: &[u8], threshold: u8) - // Scalar tail let tail_start = chunks * 32; - for byte_idx in tail_start..packed.len() { - let lo = packed[byte_idx] & 0x0F; - let hi = packed[byte_idx] >> 4; + for (byte_idx, &b) in packed.iter().enumerate().skip(tail_start) { + let lo = b & 0x0F; + let hi = b >> 4; if lo > threshold { result.push(byte_idx * 2); } diff --git a/src/hpc/palette_codec.rs b/src/palette_codec.rs similarity index 98% rename from src/hpc/palette_codec.rs rename to src/palette_codec.rs index 9dc4d8a5..7447a76f 100644 --- a/src/hpc/palette_codec.rs +++ b/src/palette_codec.rs @@ -56,7 +56,7 @@ pub fn pack_indices(indices: &[u8], bits_per_index: usize) -> Vec { assert!(bits_per_index > 0 && bits_per_index <= 8, "bits_per_index must be 1..=8"); let indices_per_word = 64 / bits_per_index; - let n_words = (indices.len() + indices_per_word - 1) / indices_per_word; + let n_words = indices.len().div_ceil(indices_per_word); let mut packed = vec![0u64; n_words]; let mask = (1u64 << bits_per_index) - 1; @@ -110,7 +110,7 @@ pub fn pack_indices_bytes(indices: &[u8], bits_per_index: usize) -> Vec { /// /// Inverse of [`pack_indices_bytes`]. pub fn unpack_indices_bytes(packed: &[u8], bits_per_index: usize, count: usize) -> Vec { - let n_words = (packed.len() + 7) / 8; + let n_words = packed.len().div_ceil(8); let mut words = Vec::with_capacity(n_words); for chunk in packed.chunks(8) { let mut buf = [0u8; 8]; @@ -145,7 +145,7 @@ pub fn transcode(packed: &[u64], old_bits: usize, new_bits: usize, count: usize) let old_per_word = 64 / old_bits; let new_per_word = 64 / new_bits; - let n_new_words = (count + new_per_word - 1) / new_per_word; + let n_new_words = count.div_ceil(new_per_word); let old_mask = (1u64 << old_bits) - 1; let new_mask = (1u64 << new_bits) - 1; @@ -309,8 +309,7 @@ unsafe fn unpack_generic_avx512(packed: &[u64], bits_per_index: usize, count: us let mut result = Vec::with_capacity(count); let mut emitted = 0usize; - for word_idx in 0..packed.len() { - let word = packed[word_idx]; + for &word in packed { for slot in 0..indices_per_word { if emitted >= count { return result; @@ -336,7 +335,7 @@ unsafe fn unpack_generic_avx512(packed: &[u64], bits_per_index: usize, count: us unsafe fn pack_generic_avx512(indices: &[u8], bits_per_index: usize) -> Vec { assert!(bits_per_index > 0 && bits_per_index <= 8); let indices_per_word = 64 / bits_per_index; - let n_words = (indices.len() + indices_per_word - 1) / indices_per_word; + let n_words = indices.len().div_ceil(indices_per_word); let mask = (1u64 << bits_per_index) - 1; let mut packed = vec![0u64; n_words]; diff --git a/src/hpc/property_mask.rs b/src/property_mask.rs similarity index 99% rename from src/hpc/property_mask.rs rename to src/property_mask.rs index f16f6dab..063b709e 100644 --- a/src/hpc/property_mask.rs +++ b/src/property_mask.rs @@ -90,7 +90,7 @@ impl PropertyMask { /// The returned vector has `ceil(states.len() / 64)` entries. pub fn test_section(&self, states: &[u64]) -> Vec { let n = states.len(); - let result_len = (n + 63) / 64; + let result_len = n.div_ceil(64); let mut result = vec![0u64; result_len]; #[cfg(target_arch = "x86_64")] @@ -233,8 +233,8 @@ impl PropertyMask { } // Scalar tail - for i in (chunks * 8)..states.len() { - if self.test(states[i]) { + for &state in &states[chunks * 8..] { + if self.test(state) { total += 1; } } @@ -376,8 +376,7 @@ unsafe fn count_section_multi_avx512(masks: &[PropertyMask], states: &[u64]) -> } // Scalar tail - for i in (chunks * 8)..states.len() { - let state = states[i]; + for &state in &states[chunks * 8..] { for (m_idx, mask) in masks.iter().enumerate() { if mask.test(state) { counts[m_idx] += 1;