Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions src/hpc/pillar/hhtl_contraction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -448,4 +448,96 @@ mod tests {
assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
}

/// Drift guard: pillar `bundle_step` versus production `bundle_into`.
///
/// The pillar re-derives the bit-mixing bundle operator on its own; the
/// production path at `crate::hpc::dn_tree::bundle_into` (PR #189,
/// exposed `pub(crate)`) is the substrate the pillar is defending. Both
/// are driven from seed-aligned SplitMix64 RNGs, and the first 16 u64
/// words of production's `GraphHV.channels[0]` must equal the pillar's
/// `[u64; WORDS]` output bit for bit.
///
/// # Why this is a bit-exact (not ε-tolerant) check
///
/// Under the substrate's bit-exactness contract (W1a + the data-flow
/// rules), bundling is a *gated XOR* (a Bernoulli mixture per bit) whose
/// mask draws come from the bit-deterministic `SplitMix64`. At lr=0.25
/// the pillar's `probability_mask` and production's
/// `make_probability_mask` consume the same number of `next_u64()` draws
/// per word (n=ceil(-log2(0.25))=2), so the masks for the first `WORDS`
/// words line up exactly. Production keeps drawing for the remaining 240
/// words of channel 0 and for channels 1/2, but every word is
/// independent, so those extra draws cannot touch the first WORDS=16
/// words.
///
/// # Why not lr=0.5
///
/// Production's `make_probability_mask(0.5)` carries a latent
/// infinite-recursion bug: the `p >= 0.5` branch recurses with
/// `1.0 - 0.5 = 0.5` forever. The pillar's `probability_mask` tests
/// `p > 0.5` (strict) and drops into the AND-cascade at p=0.5. Real
/// production usage (DNConfig default lr=0.03, boost up to ~30 →
/// effective_lr~0.9) never lands on exactly 0.5, so the bug stays
/// dormant. This check therefore runs at lr=0.25, where the two
/// implementations agree; the lr=0.5 case is recorded as a follow-up.
#[test]
fn pillar_13_matches_production_bundle_into() {
    use crate::hpc::cam_index::GraphHV;
    use crate::hpc::dn_tree::{bundle_into, SplitMix64 as DnSplitMix64};

    const N_TRIALS: u32 = 16;
    const TEST_LR: f64 = 0.25;

    // Lift a WORDS-long bit pattern into channel 0 of an otherwise
    // all-zero GraphHV, which is how the production side receives the
    // pillar's inputs.
    let into_channel0 = |bits: &[u64]| {
        let mut hv = GraphHV::zero();
        hv.channels[0].words[..WORDS].copy_from_slice(bits);
        hv
    };

    // The two SplitMix64 implementations share one algorithm (multiplier
    // constants 0x9E3779B97F4A7C15, 0xBF58476D1CE4E5B9,
    // 0x94D049BB133111EB and the same shift sequence), so equal seeds
    // give equal streams. At p=0.25 both sides draw twice per word
    // (n=ceil(-log2(0.25))=2), which keeps the masks aligned over the
    // first WORDS positions of each call.
    //
    // Re-seeding every trial is mandatory: one production `bundle_into`
    // call burns 3 channels × 256 words × 2 draws = 1536 draws, 48× the
    // pillar's 16 words × 2 draws = 32, so without a fresh seed the two
    // RNG states part ways after trial 0.
    for trial in 0..N_TRIALS {
        let trial_seed = PILLAR_13_SEED.wrapping_add(u64::from(trial));

        // Inputs are drawn from their own stream so that generating them
        // never advances either bundling RNG.
        let mut rng_inputs = SplitMix64::new(trial_seed.wrapping_mul(0x9E37_79B9_7F4A_7C15));
        let x = random_bits(&mut rng_inputs);
        let y = random_bits(&mut rng_inputs);

        // Pillar side: mix the WORDS=16 u64 words directly.
        let mut rng_pillar = SplitMix64::new(trial_seed);
        let out_pillar = bundle_step(&x, &y, TEST_LR as f32, &mut rng_pillar);

        // Production side. Pillar's `bundle(x, y, lr)` means "keep x
        // where mask=0, take y where mask=1"; production's `bundle_into`
        // has the same contract with `current` ↔ x and `hv` ↔ y (per
        // src/hpc/dn_tree.rs line 125). boost=1.0 gives
        // effective_lr = lr * 1.0 = TEST_LR, matching the pillar.
        let mut rng_prod = DnSplitMix64::new(trial_seed);
        let hv_x = into_channel0(&x[..]);
        let hv_y = into_channel0(&y[..]);
        let hv_out = bundle_into(&hv_x, &hv_y, TEST_LR, 1.0, &mut rng_prod);

        // Bit-exact comparison of the first WORDS=16 u64 words.
        let prod_words = hv_out.channels[0].words[..WORDS].iter();
        for (w, (&pillar_word, &prod_word)) in out_pillar.iter().zip(prod_words).enumerate() {
            assert_eq!(
                pillar_word, prod_word,
                "Pillar/bundle_into drift at trial {trial} word {w}: \
                 pillar=0x{:016x} prod=0x{:016x}",
                pillar_word, prod_word
            );
        }
    }
}
}
104 changes: 104 additions & 0 deletions src/hpc/pillar/ogit_lattice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -450,4 +450,108 @@ mod tests {
assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
}

/// Drift-detection: partial-order closure versus production
/// `is_ancestor`.
///
/// The production code path at
/// `crate::hpc::ogit_bridge::schema::OntologySchema::is_ancestor`
/// (PR #189, exposed `pub`) is the substrate the pillar is defending.
///
/// This test generates a small **single-parent** tree (production's
/// `OntologySchema.parent: Option<Box<str>>` is single-parent, so the
/// drift-check operates on a strict subset of pillar's DAG family),
/// renders it as Turtle source, and then compares:
///   - a reflexive-transitive closure computed **inline** (Warshall's
///     algorithm) on the equivalent boolean direct-edge matrix, and
///   - production's `is_ancestor(a, d)` on the parsed `OntologySchema`,
/// asserting agreement on EVERY (ancestor, descendant) pair.
///
/// NOTE(review): the closure is re-derived inside this test rather than
/// by calling the pillar's `transitive_closure` (per the inline comment
/// that function is `N_TYPES`-sized). The pillar function itself is
/// therefore not exercised here — confirm the inline loop stays in sync
/// with it.
///
/// # Pillar/production closure axes
///
/// Pillar `le[i * N + j] = true` means "type `i` ≤ type `j`" (i.e.,
/// `i` extends/is-subclass-of `j`). Production
/// `is_ancestor(a, d) = true` means "a is an ancestor of d" (i.e.,
/// d extends/is-subclass-of a). So the equivalence is:
/// `pillar.le[i][j] == production.is_ancestor(types[j], types[i])`.
#[cfg(feature = "ogit_bridge")]
#[test]
fn pillar_14_matches_production_is_ancestor() {
    use crate::hpc::ogit_bridge::schema::OntologySchema;
    use crate::hpc::ogit_bridge::turtle_parser::TurtleParser;

    // Small N — Turtle parsing scales linearly but we want a fast test.
    const N: usize = 8;

    // Type names: ogit:T0, ogit:T1, …, ogit:T{N-1}
    let names: Vec<String> = (0..N).map(|i| format!("ogit:T{i}")).collect();

    // Generate a deterministic single-parent tree. Type 0 is the root;
    // type k>0 picks its parent from 0..k (exclusive). Seed-anchored so
    // the test is reproducible.
    let mut rng = SplitMix64::new(PILLAR_14_SEED);
    let mut parent = vec![usize::MAX; N];
    for k in 1..N {
        // Reduce in u64 *before* narrowing: casting the raw draw to
        // `usize` first would drop the high bits on 32-bit targets and
        // yield a different tree per pointer width. The range is small,
        // so modulo bias is negligible.
        parent[k] = (rng.next_u64() % (k as u64)) as usize;
    }

    // Build Turtle source and parse to OntologySchema.
    let mut src = String::from(
        "@prefix ogit: <http://www.purl.org/ogit/> .\n\
         @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
    );
    src.push_str(&format!("{} a rdfs:Class .\n", names[0]));
    for k in 1..N {
        src.push_str(&format!("{} a rdfs:Class ; rdfs:subClassOf {} .\n", names[k], names[parent[k]]));
    }
    let triples = TurtleParser::parse(&src).expect("generated Turtle source must parse");
    let schema = OntologySchema::from_triples(&triples).expect("generated triples must form a valid schema");

    // Build the equivalent direct-edge boolean matrix in pillar's
    // [N × N] flat layout. direct[k * N + parent[k]] = true.
    let mut direct = vec![false; N * N];
    for k in 1..N {
        direct[k * N + parent[k]] = true;
    }

    // Reflexive-transitive closure, computed inline with Warshall's
    // algorithm (the full Pillar 14 version is N_TYPES-sized, so it is
    // not called here). Start from the direct edges plus the diagonal.
    let mut le = direct.clone();
    for i in 0..N {
        le[i * N + i] = true;
    }
    for kk in 0..N {
        for i in 0..N {
            // Row i can only gain entries through kk if i ≤ kk holds.
            if !le[i * N + kk] {
                continue;
            }
            for j in 0..N {
                if le[kk * N + j] {
                    le[i * N + j] = true;
                }
            }
        }
    }

    // Cross-check every (ancestor, descendant) pair.
    let mut total = 0u32;
    for i in 0..N {
        for j in 0..N {
            let pillar_says = le[i * N + j]; // i extends j (j is ancestor of i)
            let prod_says = schema.is_ancestor(&names[j], &names[i]);
            assert_eq!(
                pillar_says, prod_says,
                "Pillar/is_ancestor drift on pair (ancestor={}, descendant={}): \
                 pillar.le[{i}][{j}]={pillar_says} production.is_ancestor={prod_says}",
                names[j], names[i]
            );
            total += 1;
        }
    }
    eprintln!("Pillar 14 ↔ is_ancestor agreement: {total} pair-checks pass over N={N} single-parent tree");
}
}
48 changes: 48 additions & 0 deletions src/hpc/pillar/splat_invariants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -437,4 +437,52 @@ mod tests {
assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
}

/// Drift guard: pillar `covariance_from_scale_quat` versus production
/// `Spd3::from_scale_quat`.
///
/// The pillar independently re-derives `Σ = R(q) · diag(s²) · R(q)ᵀ`;
/// `crate::hpc::splat3d::spd3::Spd3::from_scale_quat` is the production
/// path the pillar is *defending*. Both are evaluated on the same
/// SplitMix64-seeded sample of 256 `(scale, quat)` pairs and must agree
/// to within `1e-5` on every upper-triangle entry.
///
/// A divergence ≥ ε means one of two things:
///   (a) production drifted from the canonical quaternion-rotation
///       formula (the pillar definition wins by design — fix the
///       production code), or
///   (b) the pillar itself drifted (audit `covariance_from_scale_quat`
///       against Kerbl 2023 Eq. 3 before changing).
///
/// This is the *coupling* the per-pillar docstring promises: the two
/// sides share no code, but they share a CI gate that compares them
/// point-for-point.
#[test]
fn pillar_12_matches_production_spd3_from_scale_quat() {
    use crate::hpc::splat3d::spd3::Spd3;

    const N: u32 = 256;
    let mut rng = SplitMix64::new(PILLAR_12_SEED);
    let mut max_abs_err: f32 = 0.0;

    for _ in 0..N {
        // Draw order matters for reproducibility: three scale axes
        // first, then the quaternion.
        let scale_x = sample_scale_axis(&mut rng);
        let scale_y = sample_scale_axis(&mut rng);
        let scale_z = sample_scale_axis(&mut rng);
        let s = [scale_x, scale_y, scale_z];
        let q = sample_unit_quaternion(&mut rng);

        let pillar = covariance_from_scale_quat(s, q);
        let prod = Spd3::from_scale_quat(s, q);
        // Production's six upper-triangle entries, in row-major order.
        let prod_ut = [prod.a11, prod.a12, prod.a13, prod.a22, prod.a23, prod.a33];

        let lanes = pillar.iter().copied().zip(prod_ut.iter().copied());
        for (i, (p, pr)) in lanes.enumerate() {
            let err = (p - pr).abs();
            // `f32::max` ignores a NaN argument, so a NaN error leaves
            // the running maximum untouched and is caught by the
            // assertion below.
            max_abs_err = max_abs_err.max(err);
            assert!(
                err < 1e-5,
                "Pillar/Spd3 drift at lane {i}: pillar={p:.7} prod={pr:.7} err={err:.2e} s={s:?} quat={q:?}"
            );
        }
    }

    eprintln!("Pillar 12 ↔ Spd3::from_scale_quat agreement: max_abs_err={max_abs_err:.3e} over {N} pairs");
}
}
94 changes: 62 additions & 32 deletions src/simd_runtime/cpu_ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,29 +243,42 @@ pub fn cpu_ops_for_tier(name: &str) -> Option<&'static CpuOps> {
}
}

/// Lookup by GCC CPU codename (e.g. `"sapphirerapids"`,
/// `"neoverse-v2"`, `"apple-m2"`). Maps the canonical GCC name to the
/// dispatch tier the CPU lands in, sourced from the scrape recorded
/// in the matrix doc § M.
/// Lookup a [`CpuOps`] by GCC CPU codename (e.g. `"sapphirerapids"`,
/// `"neoverse-v2"`, `"apple-m2"`) on the **current build host**.
///
/// Used for "what would this CPU pick?" introspection without
/// touching CPUID on the running host — e.g. cross-compilation
/// reports, deployment-planning tools, integration tests that want
/// to assert tier selection for a named target without running on
/// that silicon.
/// Returns `Some(&'static CpuOps)` only when the named CPU's tier is
/// reachable from the current `target_arch` (e.g. an x86_64 CPU name
/// on an x86_64 build, an aarch64 CPU name on an aarch64 build).
/// Cross-arch lookups — e.g. `cpu_ops_for_cpu("apple-m2")` on an
/// x86_64 build — return `None` because the underlying NEON kernel
/// fn pointers are compiled out and there is no honest `CpuOps` to
/// return.
///
/// For pure introspection ("what tier would this CPU pick?", with no
/// intent to call kernels), use [`cpu_tier_for_cpu`] instead — it is
/// `cfg`-free and works on any build host.
///
/// Returns `None` for unknown CPU names. Only modern names are mapped
/// (V8.2-A+ on aarch64; AVX2+FMA, AVX-VNNI, or AVX-512 and newer on
/// x86_64) — older silicon falls through to
/// `cpu_ops_for_tier("scalar")` by convention if you really need it.
pub fn cpu_ops_for_cpu(name: &str) -> Option<&'static CpuOps> {
cpu_ops_for_tier(cpu_to_tier(name)?)
cpu_ops_for_tier(cpu_tier_for_cpu(name)?)
}

/// Maps a GCC CPU codename to the [`CpuOps`] tier it lands in. Data
/// from the scrape recorded in `.claude/knowledge/agnostic-surface-cpu-matrix.md`
/// § M (aarch64) plus the GCC i386 cpu definitions for x86_64.
fn cpu_to_tier(cpu: &str) -> Option<&'static str> {
/// Lookup the dispatch tier name (e.g. `"amx_int8"`, `"avx512vnni"`,
/// `"neon"`) for a GCC CPU codename. Data from the scrape recorded
/// in `.claude/knowledge/agnostic-surface-cpu-matrix.md` § M
/// (aarch64) plus the GCC i386 cpu definitions for x86_64.
///
/// `cfg`-free — works on any build host regardless of `target_arch`.
/// This is the right entry point for cross-target introspection:
/// deployment-planning tools, cross-compilation reports, integration
/// tests that assert "apple-m2 lands at the neon tier" without
/// actually building for that silicon.
///
/// Returns `None` for unknown CPU names.
pub fn cpu_tier_for_cpu(cpu: &str) -> Option<&'static str> {
Some(match cpu {
// x86_64 — AMX-INT8 hosts
"sapphirerapids" | "graniterapids" | "graniterapids-d" | "emeraldrapids" => "amx_int8",
Expand Down Expand Up @@ -336,24 +349,41 @@ mod tests {
}

#[test]
fn cpu_ops_for_cpu_data_driven_lookup() {
// Spot-check the GCC-scraped mapping (matrix doc § M).
assert_eq!(cpu_to_tier("sapphirerapids"), Some("amx_int8"));
assert_eq!(cpu_to_tier("graniterapids"), Some("amx_int8"));
assert_eq!(cpu_to_tier("cascadelake"), Some("avx512vnni"));
assert_eq!(cpu_to_tier("znver4"), Some("avx512vnni"));
assert_eq!(cpu_to_tier("znver5"), Some("avx512vnni"));
assert_eq!(cpu_to_tier("alderlake"), Some("avxvnni"));
assert_eq!(cpu_to_tier("arrowlake"), Some("avxvnni"));
assert_eq!(cpu_to_tier("haswell"), Some("avx2_fma"));
assert_eq!(cpu_to_tier("znver3"), Some("avx2_fma"));

assert_eq!(cpu_to_tier("apple-m2"), Some("neon"));
assert_eq!(cpu_to_tier("neoverse-v2"), Some("neon"));
assert_eq!(cpu_to_tier("oryon-1"), Some("neon"));
assert_eq!(cpu_to_tier("grace"), Some("neon"));

assert_eq!(cpu_to_tier("totally-fake-cpu"), None);
fn cpu_tier_for_cpu_data_driven_lookup() {
// Spot-check the GCC-scraped mapping (matrix doc § M). This
// function is cfg-free — every assertion must hold on every
// build host, regardless of target_arch.
assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8"));
assert_eq!(cpu_tier_for_cpu("graniterapids"), Some("amx_int8"));
assert_eq!(cpu_tier_for_cpu("cascadelake"), Some("avx512vnni"));
assert_eq!(cpu_tier_for_cpu("znver4"), Some("avx512vnni"));
assert_eq!(cpu_tier_for_cpu("znver5"), Some("avx512vnni"));
assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni"));
assert_eq!(cpu_tier_for_cpu("arrowlake"), Some("avxvnni"));
assert_eq!(cpu_tier_for_cpu("haswell"), Some("avx2_fma"));
assert_eq!(cpu_tier_for_cpu("znver3"), Some("avx2_fma"));

assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon"));
assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon"));
assert_eq!(cpu_tier_for_cpu("oryon-1"), Some("neon"));
assert_eq!(cpu_tier_for_cpu("grace"), Some("neon"));

assert_eq!(cpu_tier_for_cpu("totally-fake-cpu"), None);
}

/// Regression test for the cross-arch-introspection bug Codex flagged
/// on PR #187. `cpu_tier_for_cpu` MUST yield the same `Some(..)` string
/// on every build host. Before the fix, ARM CPU names such as
/// `"apple-m2"` collapsed to `None` on an x86_64 build because the
/// lookup was piped through the cfg-gated `cpu_ops_for_tier`.
#[test]
fn cpu_tier_for_cpu_is_cross_arch() {
    // (GCC CPU codename, expected tier). Every row must resolve on
    // EVERY build host (x86_64, aarch64, wasm, etc.) — this surface
    // carries no cfg gating.
    const CASES: [(&str, &str); 4] = [
        ("apple-m2", "neon"),
        ("sapphirerapids", "amx_int8"),
        ("neoverse-v2", "neon"),
        ("alderlake", "avxvnni"),
    ];

    for (cpu, expected_tier) in CASES.iter().copied() {
        assert_eq!(cpu_tier_for_cpu(cpu), Some(expected_tier));
    }
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/simd_runtime/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,6 @@ pub mod vnni_dot;
// consumers can `use crate::simd_runtime::*` and get every op flat.
pub use add_mul::{add_mul_f32, add_mul_f64};
pub use casts::{bf16_to_f32_batch, cast_f16_to_f32_batch, cast_f32_to_f16_batch, f32_to_bf16_batch_rne};
pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, CpuOps};
pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, cpu_tier_for_cpu, CpuOps};
pub use matmul::{gemm_u8_i8, matmul_bf16_to_f32, matmul_f32, matmul_i8_to_i32};
pub use vnni_dot::vnni_dot_u8_i8;
Loading