Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
274 changes: 274 additions & 0 deletions src/hpc/distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,23 @@
//! SIMD-accelerated squared-distance, radius filtering, and K-nearest-neighbor
//! searches over contiguous point slices. All operations work on borrowed slices
//! with no internal copies. Scalar fallback is provided for non-x86 targets.
//!
//! # Slice-shape geometric distance (PR-X10 A6)
//!
//! For arbitrary-length f64 slices (non-3D-point shape), use:
//!
//! - [`l1_f64_simd`] — Manhattan: `Σ |a_i − b_i|`
//! - [`l2_f64_simd`] — Euclidean: `√Σ (a_i − b_i)²`
//! - [`linf_f64_simd`] — Chebyshev: `max |a_i − b_i|`
//!
//! These use the `F64x8` polyfill (no `target_feature`, no `unsafe`),
//! matching the [`crate::hpc::heel_f64x8::cosine_f64_simd`] idiom: F64x8
//! chunks with FMA / SIMD-max accumulator + scalar remainder. They are
//! the salvaged kernels from the rolled-back PR #160 cross-repo arc
//! (lance-graph `heel_f64x8::{l1, l2, linf}_f64_simd`), re-landed here
//! per the linalg-core design's A6 worker scope and the
//! `crate::hpc::linalg/mod.rs` hard boundary ("No distance metrics —
//! those live in `crate::hpc::distance`").

// ---------------------------------------------------------------------------
// Scalar helpers
Expand Down Expand Up @@ -165,6 +182,108 @@ pub fn knn_f64(query: [f64; 3], points: &[[f64; 3]], k: usize) -> (Vec<usize>, V
(indices, sq_dists)
}

// ---------------------------------------------------------------------------
// Slice-shape geometric distance — PR-X10 A6
// ---------------------------------------------------------------------------
//
// Polyfilled F64x8 chunked path with scalar remainder; no `target_feature`,
// no `unsafe` — the polyfill in `crate::simd::F64x8` owns runtime feature
// dispatch (AVX-512 native zmm / AVX2 2×ymm / scalar [f64; 8]).
//
// All three kernels read `min(a.len(), b.len())` elements. Empty inputs
// return 0.0.

use crate::simd::F64x8;

/// L1 (Manhattan) distance between two f64 slices: `Σ |a_i − b_i|`.
///
/// EXACT precision class — the per-lane `(a - b).abs()` introduces no
/// rounding beyond the standard subtract, and the reduce-sum order is
/// lane-tree within each F64x8 chunk + sequential across chunks (matches
/// the [`crate::hpc::heel_f64x8::cosine_f64_simd`] order so callers can
/// reason about determinism the same way).
///
/// Reads `min(a.len(), b.len())` elements. Returns 0.0 for empty inputs.
pub fn l1_f64_simd(a: &[f64], b: &[f64]) -> f64 {
let n = a.len().min(b.len());
let chunks = n / 8;
let mut acc = F64x8::splat(0.0);
for i in 0..chunks {
let va = F64x8::from_slice(&a[i * 8..]);
let vb = F64x8::from_slice(&b[i * 8..]);
acc = acc + (va - vb).abs();
}
let mut sum = acc.reduce_sum();
let offset = chunks * 8;
for i in 0..(n - offset) {
sum += (a[offset + i] - b[offset + i]).abs();
}
sum
}

/// L2 (Euclidean) distance between two f64 slices: `√Σ (a_i − b_i)²`.
///
/// VERIFY precision class — the final `sqrt` is one ULP; the sum is
/// lane-tree within each F64x8 + sequential across chunks (same order
/// pattern as L1). Determinism across runs holds for fixed slice
/// length and fixed chunking. For full order-independence use a
/// pairwise-reduce variant (see `blas_level1::nrm2`).
///
/// Reads `min(a.len(), b.len())` elements. Returns 0.0 for empty inputs.
pub fn l2_f64_simd(a: &[f64], b: &[f64]) -> f64 {
let n = a.len().min(b.len());
let chunks = n / 8;
let mut acc = F64x8::splat(0.0);
for i in 0..chunks {
let va = F64x8::from_slice(&a[i * 8..]);
let vb = F64x8::from_slice(&b[i * 8..]);
let d = va - vb;
acc = d.mul_add(d, acc); // acc += d*d (single FMA per chunk)
}
let mut sum_sq = acc.reduce_sum();
let offset = chunks * 8;
for i in 0..(n - offset) {
let d = a[offset + i] - b[offset + i];
sum_sq += d * d;
}
sum_sq.sqrt()
}

/// L∞ (Chebyshev) distance between two f64 slices: `max |a_i − b_i|`.
///
/// EXACT precision class — `(a - b).abs()` and `max` introduce no
/// rounding; the result is determined by the inputs alone (order-
/// independent across chunks since `max` is associative and commutative
/// under IEEE-754 for non-NaN inputs).
///
/// Reads `min(a.len(), b.len())` elements. Returns 0.0 for empty inputs.
///
/// # NaN handling
///
/// IEEE-754 `_mm512_max_pd` returns the second operand when either input
/// is NaN; callers passing NaN-tainted slices may observe non-deterministic
/// max across runs (an upstream constraint, not a kernel bug). Audit
/// upstream for NaN before relying on this kernel on production data.
pub fn linf_f64_simd(a: &[f64], b: &[f64]) -> f64 {
let n = a.len().min(b.len());
let chunks = n / 8;
let mut max_v = F64x8::splat(0.0);
for i in 0..chunks {
let va = F64x8::from_slice(&a[i * 8..]);
let vb = F64x8::from_slice(&b[i * 8..]);
max_v = max_v.simd_max((va - vb).abs());
}
let mut max_d = max_v.reduce_max();
let offset = chunks * 8;
for i in 0..(n - offset) {
let d = (a[offset + i] - b[offset + i]).abs();
if d > max_d {
max_d = d;
}
}
max_d
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -315,4 +434,159 @@ mod tests {
let result = squared_distances_f32(query, &points);
assert!(approx_eq_f32(result[0], 0.0));
}

// -- PR-X10 A6 slice-shape L1 / L2 / L∞ --

fn approx_eq_f64_tol(a: f64, b: f64, tol: f64) -> bool {
(a - b).abs() < tol
}

/// Deterministic SplitMix64 — matches the pillar harness so the
/// corpus is reproducible across runs and across machines.
fn splitmix(state: &mut u64) -> u64 {
*state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
let mut z = *state;
z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
z ^ (z >> 31)
}

fn random_vec_f64(seed: u64, n: usize) -> Vec<f64> {
let mut s = seed;
(0..n)
.map(|_| {
let bits = splitmix(&mut s) >> 11;
(bits as f64) / (1u64 << 53) as f64 * 2.0 - 1.0 // uniform in [-1, 1)
})
.collect()
}

// -- L1 boundary + parity --

#[test]
fn l1_f64_simd_self_zero() {
let a = random_vec_f64(0xC1A0, 200);
assert_eq!(l1_f64_simd(&a, &a), 0.0);
}

#[test]
fn l1_f64_simd_empty_is_zero() {
let a: Vec<f64> = vec![];
let b: Vec<f64> = vec![];
assert_eq!(l1_f64_simd(&a, &b), 0.0);
}

#[test]
fn l1_f64_simd_uniform_diff() {
let a = vec![3.0f64; 17];
let b = vec![1.0f64; 17];
// 17 * |3 - 1| = 34
assert!(approx_eq_f64_tol(l1_f64_simd(&a, &b), 34.0, 1e-12));
}

#[test]
fn l1_f64_simd_matches_scalar() {
// 200 elements covers chunked path (25 chunks of 8) + remainder of 0;
// 199 covers chunked + remainder of 7.
for &n in &[1usize, 7, 8, 15, 16, 17, 64, 199, 200, 1024] {
let a = random_vec_f64(0xA110_C1A0, n);
let b = random_vec_f64(0xB220_C1A0, n);
let simd = l1_f64_simd(&a, &b);
let scalar: f64 = a.iter().zip(&b).map(|(x, y)| (x - y).abs()).sum();
assert!(approx_eq_f64_tol(simd, scalar, 1e-11), "n={} simd={:.15} scalar={:.15}", n, simd, scalar);
}
}

// -- L2 boundary + parity --

#[test]
fn l2_f64_simd_self_zero() {
let a = random_vec_f64(0xC2A0, 200);
assert_eq!(l2_f64_simd(&a, &a), 0.0);
}

#[test]
fn l2_f64_simd_empty_is_zero() {
let a: Vec<f64> = vec![];
let b: Vec<f64> = vec![];
assert_eq!(l2_f64_simd(&a, &b), 0.0);
}

#[test]
fn l2_f64_simd_pythagoras() {
// (3, 0, …) vs (0, 4, …): √(9 + 16) = 5
let a = vec![3.0f64, 0.0];
let b = vec![0.0f64, 4.0];
assert!(approx_eq_f64_tol(l2_f64_simd(&a, &b), 5.0, 1e-12));
}

#[test]
fn l2_f64_simd_matches_scalar() {
for &n in &[1usize, 7, 8, 15, 16, 17, 64, 199, 200, 1024] {
let a = random_vec_f64(0xA110_C2A0, n);
let b = random_vec_f64(0xB220_C2A0, n);
let simd = l2_f64_simd(&a, &b);
let sum_sq: f64 = a.iter().zip(&b).map(|(x, y)| (x - y).powi(2)).sum();
let scalar = sum_sq.sqrt();
// Sqrt is 1 ULP; cross-chunk summation order differs by chunks
// of 8 vs sequential — allow generous relative tolerance.
let rel = (simd - scalar).abs() / scalar.max(1e-12);
assert!(rel < 1e-10, "n={} simd={:.15} scalar={:.15} rel={:.2e}", n, simd, scalar, rel);
}
}

// -- L∞ boundary + parity --

#[test]
fn linf_f64_simd_self_zero() {
let a = random_vec_f64(0xC1FF, 200);
assert_eq!(linf_f64_simd(&a, &a), 0.0);
}

#[test]
fn linf_f64_simd_empty_is_zero() {
let a: Vec<f64> = vec![];
let b: Vec<f64> = vec![];
assert_eq!(linf_f64_simd(&a, &b), 0.0);
}

#[test]
fn linf_f64_simd_picks_max_in_chunk() {
// Max difference must land inside a chunked path (index 5 < 8) and
// also outside (index 13 > 8) to exercise both halves.
let mut a = vec![0.0f64; 16];
let mut b = vec![0.0f64; 16];
a[5] = 0.5;
a[13] = -0.7; // |Δ| = 0.7 — should win
b[2] = 0.1;
assert!(approx_eq_f64_tol(linf_f64_simd(&a, &b), 0.7, 1e-12));
}

#[test]
fn linf_f64_simd_matches_scalar() {
for &n in &[1usize, 7, 8, 15, 16, 17, 64, 199, 200, 1024] {
let a = random_vec_f64(0xA110_C1FF, n);
let b = random_vec_f64(0xB220_C1FF, n);
let simd = linf_f64_simd(&a, &b);
let scalar: f64 = a
.iter()
.zip(&b)
.map(|(x, y)| (x - y).abs())
.fold(0.0_f64, f64::max);
assert!(approx_eq_f64_tol(simd, scalar, 1e-15), "n={} simd={:.15} scalar={:.15}", n, simd, scalar);
}
}

/// Mismatched-length slices: must use the shorter length, no panic.
#[test]
fn slice_distances_mismatched_length_uses_min() {
let a = vec![1.0f64; 17];
let b = vec![2.0f64; 10];
// L1 over min=10: 10 * |1 - 2| = 10
assert!(approx_eq_f64_tol(l1_f64_simd(&a, &b), 10.0, 1e-12));
// L2 over min=10: √(10 * 1) = √10
assert!(approx_eq_f64_tol(l2_f64_simd(&a, &b), 10f64.sqrt(), 1e-12));
// L∞ = 1
assert!(approx_eq_f64_tol(linf_f64_simd(&a, &b), 1.0, 1e-12));
}
}
52 changes: 46 additions & 6 deletions src/hpc/dn_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,15 @@ pub(crate) fn bundle_into(current: &GraphHV, hv: &GraphHV, lr: f64, boost: f64,
/// Create a u64 bitmask where each bit is independently 1 with probability ~`p`.
///
/// Uses cascaded AND of random words to achieve the target probability:
/// - p >= 0.5 → OR of inverse masks
/// - p < 0.5 → AND cascade
/// - p > 0.5 → invert the (1-p) mask
/// - p <= 0.5 → AND cascade
///
/// At exactly `p = 0.5` the AND-cascade branch executes a single
/// `rng.next_u64()` (n = ceil(-log2(0.5)) = 1) — each bit is then
/// IID Bernoulli(0.5). Note the **strict** comparison here: an earlier
/// version used `p >= 0.5`, which recursed with `1.0 - 0.5 = 0.5`
/// infinitely. The Pillar-13 drift-check (`hpc::pillar::hhtl_contraction`)
/// already uses the strict comparison and is the canonical reference.
fn make_probability_mask(p: f64, rng: &mut SplitMix64) -> u64 {
if p >= 1.0 {
return u64::MAX;
Expand All @@ -142,13 +149,14 @@ fn make_probability_mask(p: f64, rng: &mut SplitMix64) -> u64 {
return 0;
}

if p >= 0.5 {
// p >= 0.5: use OR approach — each AND of randoms gives ~0.25, NOT gives ~0.75, etc.
// Simpler: just AND enough randoms to get (1-p) kill rate, then NOT.
if p > 0.5 {
// p > 0.5: invert the (1-p) mask. Strict > 0.5 so p == 0.5
// falls through to the AND-cascade and produces a single
// Bernoulli(0.5) word in one rng draw.
return !make_probability_mask(1.0 - p, rng);
}

// p < 0.5: AND cascade. Each AND halves the probability.
// p <= 0.5: AND cascade. Each AND halves the probability.
// We need n ANDs where 0.5^n ≈ p, so n = -log2(p).
let n = (-p.log2()).ceil() as u32;
let mut mask = rng.next_u64();
Expand Down Expand Up @@ -543,6 +551,38 @@ mod tests {
SplitMix64::new(42)
}

/// Regression: at p = 0.5 exactly, the previous `p >= 0.5` branch
/// recursed with `1.0 - 0.5 = 0.5` infinitely. The strict `p > 0.5`
/// fix routes p=0.5 to the AND-cascade (n=1, one rng draw) which
/// produces a Bernoulli(0.5) mask in O(1) time.
#[test]
fn make_probability_mask_at_half_terminates() {
let mut rng = make_rng();
// If this stack-overflows, the recursion fix has regressed.
let mask = make_probability_mask(0.5, &mut rng);
// Bernoulli(0.5) over 64 bits — popcount should be near 32, but
// any value 0..=64 is valid for a single draw. The test's
// load-bearing assertion is that the call returns.
assert!(mask <= u64::MAX);
}

/// Empirical Bernoulli(0.5) check: average popcount over N=1024
/// independent masks must land near 32 (the true mean) within a
/// generous tolerance.
#[test]
fn make_probability_mask_at_half_is_bernoulli_half() {
let mut rng = make_rng();
const N: u32 = 1024;
let mut total: u64 = 0;
for _ in 0..N {
total += make_probability_mask(0.5, &mut rng).count_ones() as u64;
}
let mean = total as f64 / N as f64;
// σ per word = sqrt(64 * 0.5 * 0.5) = 4; mean's SE = 4 / √N = 0.125.
// Tolerance 2.0 ≈ 16 SEs — comfortable margin against flakes.
assert!((mean - 32.0).abs() < 2.0, "make_probability_mask(0.5) mean popcount {mean:.4} not near 32");
}

#[test]
fn test_new_tree_structure() {
let tree = DNTree::with_capacity(4096);
Expand Down
6 changes: 4 additions & 2 deletions src/hpc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
//! - FFT (forward, inverse, real-to-complex)
//! - VML (vectorized math library)

// SIMD capability singleton — detect once, all modules share
pub mod simd_caps;
// SIMD capability singleton — graduated to crate root (it never depended
// on anything else in `hpc/`); re-exported here for back-compat with
// existing `crate::hpc::simd_caps::*` imports across the workspace.
pub use crate::simd_caps;
// LazyLock frozen SIMD dispatch — function pointers selected once at startup
pub mod simd_dispatch;

Expand Down
Loading
Loading