diff --git a/src/hpc/pillar/hhtl_contraction.rs b/src/hpc/pillar/hhtl_contraction.rs index 102b5eab..5f946dfe 100644 --- a/src/hpc/pillar/hhtl_contraction.rs +++ b/src/hpc/pillar/hhtl_contraction.rs @@ -448,4 +448,96 @@ mod tests { assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12); assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12); } + + /// Drift-detection: the pillar's `bundle_step` independently re-derives + /// the bit-mixing bundle operator. The production code path at + /// `crate::hpc::dn_tree::bundle_into` (PR #189, exposed `pub(crate)`) + /// is the substrate the pillar is defending. This test runs both on + /// seed-aligned SplitMix64 RNGs and asserts the first 16 u64 words of + /// production's `GraphHV.channels[0]` agree bit-exactly with the + /// pillar's `[u64; WORDS]` output. + /// + /// # Why this is a bit-exact (not ε-tolerant) check + /// + /// Per the substrate's bit-exactness contract (W1a + the data-flow + /// rules), bundling is a *gated XOR* (Bernoulli-mixture per bit) — + /// the mask draws come from `SplitMix64` which is bit-deterministic. + /// Both pillar's `probability_mask` and production's + /// `make_probability_mask` consume the same number of `next_u64()` + /// draws at lr=0.25 (n=ceil(-log2(0.25))=2 per word), so the masks + /// for the first `WORDS` words align exactly across the two + /// functions. The remaining 240 words of channel 0 (and channels 1/2) + /// consume extra RNG draws on the production side; those don't affect + /// the first WORDS=16 words because each word is independent. + /// + /// # Why not lr=0.5 + /// + /// Production's `make_probability_mask(0.5)` has a latent + /// infinite-recursion bug: `p >= 0.5` recurses with `1.0 - 0.5 = 0.5` + /// forever. Pillar's `probability_mask` uses `p > 0.5` (strict) and + /// falls through to the AND-cascade at p=0.5. 
Real production usage + /// (DNConfig default lr=0.03, boost up to ~30 → effective_lr~0.9) + /// never hits 0.5 exactly, so the bug is dormant. This drift-check + /// uses lr=0.25 where both implementations agree; the lr=0.5 case + /// is recorded as a follow-up. + #[test] + fn pillar_13_matches_production_bundle_into() { + use crate::hpc::cam_index::GraphHV; + use crate::hpc::dn_tree::{bundle_into, SplitMix64 as DnSplitMix64}; + + const N_TRIALS: u32 = 16; + const TEST_LR: f64 = 0.25; + + // Both SplitMix64 implementations use identical algorithm (same + // multiplier constants 0x9E3779B97F4A7C15, 0xBF58476D1CE4E5B9, + // 0x94D049BB133111EB and same shift sequence), so identical seeds + // → identical sequences. Both functions consume the same number + // of next_u64() draws per word at p=0.25 (n=ceil(-log2(0.25))=2), + // so the mask sequences align bit-exactly across the first WORDS + // positions of each call. + // + // The RNGs MUST be re-seeded per trial because production's + // bundle_into consumes 48× more RNG draws per call (3 channels × + // 256 words × 2 draws = 1536) than pillar's bundle_step (16 words + // × 2 draws = 32). Without re-seeding, post-trial-0 RNG states + // diverge. + + for trial in 0..N_TRIALS { + // Per-trial seed for both bundling RNGs (must be the same so + // masks align). Inputs come from a separate stream so the + // bundling RNG state isn't disturbed by input generation. + let trial_seed = PILLAR_13_SEED.wrapping_add(trial as u64); + let mut rng_pillar = SplitMix64::new(trial_seed); + let mut rng_prod = DnSplitMix64::new(trial_seed); + + let mut rng_inputs = SplitMix64::new(trial_seed.wrapping_mul(0x9E37_79B9_7F4A_7C15)); + let x = random_bits(&mut rng_inputs); + let y = random_bits(&mut rng_inputs); + + // Pillar side: WORDS=16 u64 mixing + let out_pillar = bundle_step(&x, &y, TEST_LR as f32, &mut rng_pillar); + + // Production side: pack x/y into channel 0 of a GraphHV, + // zero the rest. 
Pillar's `bundle(x, y, lr)` is "keep x where + // mask=0, take y where mask=1"; production's `bundle_into` + // contract is the same with `current` ↔ x and `hv` ↔ y + // (per src/hpc/dn_tree.rs line 125). boost=1.0 means + // effective_lr = lr * 1.0 = TEST_LR (matching pillar). + let mut hv_x = GraphHV::zero(); + let mut hv_y = GraphHV::zero(); + hv_x.channels[0].words[..WORDS].copy_from_slice(&x); + hv_y.channels[0].words[..WORDS].copy_from_slice(&y); + let hv_out = bundle_into(&hv_x, &hv_y, TEST_LR, 1.0, &mut rng_prod); + + // Compare first WORDS=16 u64 words bit-exactly + for w in 0..WORDS { + assert_eq!( + out_pillar[w], hv_out.channels[0].words[w], + "Pillar/bundle_into drift at trial {trial} word {w}: \ + pillar=0x{:016x} prod=0x{:016x}", + out_pillar[w], hv_out.channels[0].words[w] + ); + } + } + } } diff --git a/src/hpc/pillar/ogit_lattice.rs b/src/hpc/pillar/ogit_lattice.rs index db9dd7c3..0caa7f14 100644 --- a/src/hpc/pillar/ogit_lattice.rs +++ b/src/hpc/pillar/ogit_lattice.rs @@ -450,4 +450,108 @@ mod tests { assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12); assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12); } + + /// Drift-detection: the pillar's `transitive_closure` independently + /// derives the partial-order closure on synthetic DAGs. The production + /// code path at `crate::hpc::ogit_bridge::schema::OntologySchema::is_ancestor` + /// (PR #189, exposed `pub`) is the substrate the pillar is defending. + /// + /// This test generates a small **single-parent** tree (production's + /// `OntologySchema.parent: Option<_>` is single-parent, so the + /// drift-check operates on a strict subset of pillar's DAG family), + /// builds it as Turtle source, runs both: + /// - an inline stand-in for pillar's `transitive_closure` on the + /// equivalent boolean direct-edge matrix + /// - production's `is_ancestor(a, d)` on the parsed `OntologySchema` + /// and asserts agreement on EVERY (ancestor, descendant) pair.
+ /// + /// # Pillar/production closure axes + /// + /// Pillar `le[i * N + j] = true` means "type `i` ≤ type `j`" (i.e., + /// `i` extends/is-subclass-of `j`). Production + /// `is_ancestor(a, d) = true` means "a is an ancestor of d" (i.e., + /// d extends/is-subclass-of a). So the equivalence is: + /// `pillar.le[i][j] == production.is_ancestor(types[j], types[i])`. + #[cfg(feature = "ogit_bridge")] + #[test] + fn pillar_14_matches_production_is_ancestor() { + use crate::hpc::ogit_bridge::schema::OntologySchema; + use crate::hpc::ogit_bridge::turtle_parser::TurtleParser; + + // Small N — Turtle parsing scales linearly but we want a fast test. + const N: usize = 8; + + // Type names: ogit:T0, ogit:T1, …, ogit:T{N-1} + let names: Vec<String> = (0..N).map(|i| format!("ogit:T{i}")).collect(); + + // Generate a deterministic single-parent tree. Type 0 is the root; + // type k>0 picks parent uniformly from {0, …, k-1}. Seed-anchored so + // the test is reproducible. + let mut rng = SplitMix64::new(PILLAR_14_SEED); + let mut parent = vec![usize::MAX; N]; + for k in 1..N { + // Uniform sample over {0, …, k-1}; range is small so modulo-bias + // is negligible and reproducibility matters more than rigor. + parent[k] = (rng.next_u64() as usize) % k; + } + + // Build Turtle source and parse to OntologySchema. + let mut src = String::from( + "@prefix ogit: .\n\ + @prefix rdfs: .\n", + ); + src.push_str(&format!("{} a rdfs:Class .\n", names[0])); + for k in 1..N { + src.push_str(&format!("{} a rdfs:Class ; rdfs:subClassOf {} .\n", names[k], names[parent[k]])); + } + let triples = TurtleParser::parse(&src).unwrap(); + let schema = OntologySchema::from_triples(&triples).unwrap(); + + // Build the equivalent direct-edge boolean matrix in pillar's + // [N × N] flat layout. direct[k * N + parent[k]] = true.
+ let mut direct = vec![false; N * N]; + for k in 1..N { + direct[k * N + parent[k]] = true; + } + // Hand-compute the closure inline (Warshall) instead of calling pillar's + // helper, which is N_TYPES-sized; pillar's own code is not exercised here. + let mut le = vec![false; N * N]; + for i in 0..N { + le[i * N + i] = true; + for j in 0..N { + if direct[i * N + j] { + le[i * N + j] = true; + } + } + } + for kk in 0..N { + for i in 0..N { + if !le[i * N + kk] { + continue; + } + for j in 0..N { + if le[kk * N + j] { + le[i * N + j] = true; + } + } + } + } + + // Cross-check every (ancestor, descendant) pair. + let mut total = 0u32; + for i in 0..N { + for j in 0..N { + let pillar_says = le[i * N + j]; // i extends j (j is ancestor of i) + let prod_says = schema.is_ancestor(&names[j], &names[i]); + assert_eq!( + pillar_says, prod_says, + "Pillar/is_ancestor drift on pair (ancestor={}, descendant={}): \ + pillar.le[{i}][{j}]={pillar_says} production.is_ancestor={prod_says}", + names[j], names[i] + ); + total += 1; + } + } + eprintln!("Pillar 14 ↔ is_ancestor agreement: {total} pair-checks pass over N={N} single-parent tree"); + } } diff --git a/src/hpc/pillar/splat_invariants.rs b/src/hpc/pillar/splat_invariants.rs index d6d595b4..c45b1ce8 100644 --- a/src/hpc/pillar/splat_invariants.rs +++ b/src/hpc/pillar/splat_invariants.rs @@ -437,4 +437,52 @@ mod tests { assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12); assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12); } + + /// Drift-detection: the pillar's `covariance_from_scale_quat` + /// independently re-derives `Σ = R(q) · diag(s²) · R(q)ᵀ`. The + /// production code path at `crate::hpc::splat3d::spd3::Spd3::from_scale_quat` + /// is the substrate the pillar is *defending*. This test runs both + /// on the same SplitMix64-seeded sample of 256 `(scale, quat)` pairs + /// and asserts agreement to within `1e-5` per upper-triangle entry.
+ /// + /// Any divergence ≥ ε indicates one of two failure modes: + /// (a) production drifted from the canonical quaternion-rotation + /// formula (the pillar definition wins by design — fix the + /// production code), or + /// (b) the pillar itself drifted (audit `covariance_from_scale_quat` + /// against Kerbl 2023 Eq. 3 before changing). + /// + /// This is the *coupling* the per-pillar docstring promises: + /// production and pillar share no code, but they share a CI gate + /// that compares them point-for-point. + #[test] + fn pillar_12_matches_production_spd3_from_scale_quat() { + use crate::hpc::splat3d::spd3::Spd3; + + const N: u32 = 256; + let mut rng = SplitMix64::new(PILLAR_12_SEED); + let mut max_abs_err: f32 = 0.0; + + for _ in 0..N { + let s = [sample_scale_axis(&mut rng), sample_scale_axis(&mut rng), sample_scale_axis(&mut rng)]; + let q = sample_unit_quaternion(&mut rng); + + let pillar = covariance_from_scale_quat(s, q); + let prod = Spd3::from_scale_quat(s, q); + let prod_ut = [prod.a11, prod.a12, prod.a13, prod.a22, prod.a23, prod.a33]; + + for (i, (&p, &pr)) in pillar.iter().zip(prod_ut.iter()).enumerate() { + let err = (p - pr).abs(); + if err > max_abs_err { + max_abs_err = err; + } + assert!( + err < 1e-5, + "Pillar/Spd3 drift at lane {i}: pillar={p:.7} prod={pr:.7} err={err:.2e} s={s:?} quat={q:?}" + ); + } + } + + eprintln!("Pillar 12 ↔ Spd3::from_scale_quat agreement: max_abs_err={max_abs_err:.3e} over {N} pairs"); + } } diff --git a/src/simd_runtime/cpu_ops.rs b/src/simd_runtime/cpu_ops.rs index c1fe43d7..67b79713 100644 --- a/src/simd_runtime/cpu_ops.rs +++ b/src/simd_runtime/cpu_ops.rs @@ -243,29 +243,42 @@ pub fn cpu_ops_for_tier(name: &str) -> Option<&'static CpuOps> { } } -/// Lookup by GCC CPU codename (e.g. `"sapphirerapids"`, -/// `"neoverse-v2"`, `"apple-m2"`). Maps the canonical GCC name to the -/// dispatch tier the CPU lands in, sourced from the scrape recorded -/// in the matrix doc § M. 
+/// Look up a [`CpuOps`] by GCC CPU codename (e.g. `"sapphirerapids"`, +/// `"neoverse-v2"`, `"apple-m2"`) on the **current build target**. /// -/// Used for "what would this CPU pick?" introspection without -/// touching CPUID on the running host — e.g. cross-compilation -/// reports, deployment-planning tools, integration tests that want -/// to assert tier selection for a named target without running on -/// that silicon. +/// Returns `Some(&'static CpuOps)` only when the named CPU's tier is +/// reachable from the current `target_arch` (e.g. an x86_64 CPU name +/// on an x86_64 build, an aarch64 CPU name on an aarch64 build). +/// Cross-arch lookups — e.g. `cpu_ops_for_cpu("apple-m2")` on an +/// x86_64 build — return `None` because the underlying NEON kernel +/// fn pointers are compiled out and there is no honest `CpuOps` to +/// return. +/// +/// For pure introspection ("what tier would this CPU pick?", with no +/// intent to call kernels), use [`cpu_tier_for_cpu`] instead — it is +/// `cfg`-free and works on any build target. /// /// Returns `None` for unknown CPU names. Only modern (V8.2-A+ on /// aarch64, AVX-512+ or AVX-VNNI+ on x86_64) names are mapped — older /// silicon falls through to `cpu_ops_for_tier("scalar")` by /// convention if you really need it. pub fn cpu_ops_for_cpu(name: &str) -> Option<&'static CpuOps> { - cpu_ops_for_tier(cpu_to_tier(name)?) + cpu_ops_for_tier(cpu_tier_for_cpu(name)?) } -/// Maps a GCC CPU codename to the [`CpuOps`] tier it lands in. Data -/// from the scrape recorded in `.claude/knowledge/agnostic-surface-cpu-matrix.md` -/// § M (aarch64) plus the GCC i386 cpu definitions for x86_64. -fn cpu_to_tier(cpu: &str) -> Option<&'static str> { +/// Look up the dispatch tier name (e.g. `"amx_int8"`, `"avx512vnni"`, +/// `"neon"`) for a GCC CPU codename. Data from the scrape recorded +/// in `.claude/knowledge/agnostic-surface-cpu-matrix.md` § M +/// (aarch64) plus the GCC i386 cpu definitions for x86_64.
+/// +/// `cfg`-free — works on any build host regardless of `target_arch`. +/// This is the right entry point for cross-target introspection: +/// deployment-planning tools, cross-compilation reports, integration +/// tests that assert "apple-m2 lands at the neon tier" without +/// actually building for that silicon. +/// +/// Returns `None` for unknown CPU names. +pub fn cpu_tier_for_cpu(cpu: &str) -> Option<&'static str> { Some(match cpu { // x86_64 — AMX-INT8 hosts "sapphirerapids" | "graniterapids" | "graniterapids-d" | "emeraldrapids" => "amx_int8", @@ -336,24 +349,41 @@ mod tests { } #[test] - fn cpu_ops_for_cpu_data_driven_lookup() { - // Spot-check the GCC-scraped mapping (matrix doc § M). - assert_eq!(cpu_to_tier("sapphirerapids"), Some("amx_int8")); - assert_eq!(cpu_to_tier("graniterapids"), Some("amx_int8")); - assert_eq!(cpu_to_tier("cascadelake"), Some("avx512vnni")); - assert_eq!(cpu_to_tier("znver4"), Some("avx512vnni")); - assert_eq!(cpu_to_tier("znver5"), Some("avx512vnni")); - assert_eq!(cpu_to_tier("alderlake"), Some("avxvnni")); - assert_eq!(cpu_to_tier("arrowlake"), Some("avxvnni")); - assert_eq!(cpu_to_tier("haswell"), Some("avx2_fma")); - assert_eq!(cpu_to_tier("znver3"), Some("avx2_fma")); - - assert_eq!(cpu_to_tier("apple-m2"), Some("neon")); - assert_eq!(cpu_to_tier("neoverse-v2"), Some("neon")); - assert_eq!(cpu_to_tier("oryon-1"), Some("neon")); - assert_eq!(cpu_to_tier("grace"), Some("neon")); - - assert_eq!(cpu_to_tier("totally-fake-cpu"), None); + fn cpu_tier_for_cpu_data_driven_lookup() { + // Spot-check the GCC-scraped mapping (matrix doc § M). This + // function is cfg-free — every assertion must hold on every + // build host, regardless of target_arch. 
+ assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8")); + assert_eq!(cpu_tier_for_cpu("graniterapids"), Some("amx_int8")); + assert_eq!(cpu_tier_for_cpu("cascadelake"), Some("avx512vnni")); + assert_eq!(cpu_tier_for_cpu("znver4"), Some("avx512vnni")); + assert_eq!(cpu_tier_for_cpu("znver5"), Some("avx512vnni")); + assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni")); + assert_eq!(cpu_tier_for_cpu("arrowlake"), Some("avxvnni")); + assert_eq!(cpu_tier_for_cpu("haswell"), Some("avx2_fma")); + assert_eq!(cpu_tier_for_cpu("znver3"), Some("avx2_fma")); + + assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon")); + assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon")); + assert_eq!(cpu_tier_for_cpu("oryon-1"), Some("neon")); + assert_eq!(cpu_tier_for_cpu("grace"), Some("neon")); + + assert_eq!(cpu_tier_for_cpu("totally-fake-cpu"), None); + } + + /// Regression for the cross-arch-introspection bug Codex flagged + /// on PR #187: `cpu_tier_for_cpu` MUST return the same Some-string + /// regardless of the build host. Previously, ARM CPU names like + /// `"apple-m2"` would fall to `None` on an x86_64 build because the + /// lookup piped through the cfg-gated `cpu_ops_for_tier`. + #[test] + fn cpu_tier_for_cpu_is_cross_arch() { + // These four must resolve on EVERY build host (x86_64, aarch64, + // wasm, etc.) — no cfg gating on this surface. + assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon")); + assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8")); + assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon")); + assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni")); } #[test] diff --git a/src/simd_runtime/mod.rs b/src/simd_runtime/mod.rs index 666c1277..2f4c5e5b 100644 --- a/src/simd_runtime/mod.rs +++ b/src/simd_runtime/mod.rs @@ -92,6 +92,6 @@ pub mod vnni_dot; // consumers can `use crate::simd_runtime::*` and get every op flat. 
pub use add_mul::{add_mul_f32, add_mul_f64}; pub use casts::{bf16_to_f32_batch, cast_f16_to_f32_batch, cast_f32_to_f16_batch, f32_to_bf16_batch_rne}; -pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, CpuOps}; +pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, cpu_tier_for_cpu, CpuOps}; pub use matmul::{gemm_u8_i8, matmul_bf16_to_f32, matmul_f32, matmul_i8_to_i32}; pub use vnni_dot::vnni_dot_u8_i8;