feat(hpc): VPDPBUSD-zmm middle tier for matmul_i8_to_i32

claude · claude · commit bb7b9b7c4e6d · 2026-05-21T05:57:42.000Z
Completes the per-CPU dispatch chain for `matmul_i8_to_i32`. Per PR #180's table the middle tier between AMX TDPBUSD (Sapphire Rapids+) and the scalar reference is `_mm512_dpbusd_epi32` (zmm form, avx512vnni feature) — covers Cooper Lake, Cascade Lake, Ice Lake-SP, Zen 4+ silicon that has AVX-512 VNNI but not AMX. Mirrors the VDPBF16PS arm structure that landed for BF16 in PR #182's `bf16_gemm_dispatch`. New kernel `hpc::int8_tile_gemm::int8_gemm_vpdpbusd_zmm`: * One VPDPBUSD instruction: 16 i32 accumulator lanes, each receiving 4 u8×i8 products = 64 MACs per instruction. * Maps the 16 output lanes to a row of 16 j-columns of `c[i, ·]`, one i row processed at a time, K-quad inner loop accumulating into the same 16 i32 lanes across iterations. * B-column packing: pre-packs B for the current j-block into `b_col_quads[k_quad * 16 + j] = i32 (4 bytes of B[4k_quad.., j_base+j] packed bottom-to-top)` once per j-block; reused across all M i-iterations so the gather cost amortizes. * A row quad broadcast: `_mm512_set1_epi32` of (4 u8 bytes packed) every K-iter — same quad seen by every output column. * K-tail (k % 4 != 0) handled with scalar u8×i8 multiplies per output cell; N-tail (j_count < 16) handled by trimming the store width — padding lanes still receive VPDPBUSD updates but aren't written back. * Stable intrinsic `_mm512_dpbusd_epi32` under `target_feature = "avx512vnni,avx512f"` — no asm-byte needed. Wiring `matmul_i8_to_i32` to three-tier dispatch: 1. amx_available() + 16/16/64-aligned shapes → int8_tile_gemm_16x16 → TDPBUSD asm-byte (16 384 MACs/instr, this commit reuses the kernel from PR #184 fe334de... wait, same PR — from b1979d7 in THIS PR) 2. is_x86_feature_detected!("avx512vnni") → int8_gemm_vpdpbusd_zmm → _mm512_dpbusd_epi32 stable intrinsic (64 MACs/instr, arbitrary shapes, K-tail handled scalar, N-tail handled by per-iteration j_count trim) 3. 
scalar i8×i8 → i32 reference for non-x86, pre-AVX-512 hosts, or shapes that don't satisfy either SIMD tier's requirements Factored the shared sign-shift bias subtraction into a private `subtract_i8_to_u8_bias(c, b_i8, m, n, k)` helper: both Tier 1 (AMX) and Tier 2 (VNNI) shift LHS i8 → u8 via (+128) then need to subtract 128·colsum(B) from the accumulator. Pure integer arithmetic, bit-identical to the scalar i8×i8 → i32 reference. Verification: * Default v3 build: 2093 lib tests pass (was 2092 — +1 new test `vpdpbusd_zmm_matches_scalar` that exercises the new arm directly with shapes spanning aligned cases, K-tail (k % 4), N-tail (n % 16), and small shapes; asserts byte-equal output vs scalar reference). * Existing `matmul_i8_to_i32_16x16_exact` continues to pass through the AMX tier on this host (which has amx_int8). * cargo clippy --lib --tests --features rayon,native -- -D warnings clean. * cargo fmt --all --check clean. Per-CPU dispatch state after this commit: matmul_bf16_to_f32: SPR+ AMX | Zen4/CPL VDPBF16PS | scalar (PR #182) | (PR #182) | (always) matmul_f32: SPR+ AMX | Zen4/CPL VDPBF16PS | scalar (PR #182) | (PR #182) | (always) matmul_i8_to_i32: SPR+ AMX | CPL/Zen4 VPDPBUSD | scalar (b1979d7) | (THIS COMMIT) | (always) So all three of the public matmul entry points now have full three-tier dispatch on x86_64. Out of scope (separate PRs): * AMX tile path for `simd_int_ops::gemm_u8_i8` (the slice-level u8×i8 surface from PR #182) — it's u8×i8 natively, no sign- shift bias needed, simpler than matmul_i8_to_i32. * AVX-VNNI ymm arm (Arrow Lake / Meteor Lake U: avxvnni without avx512vnni) — the `vnni2_*` functions exist in simd_amx.rs but need to be assembled into a m×n×k VNNI-ymm GEMM. Same shape as the avx512vnni arm just with ymm width. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
diff --git a/src/hpc/amx_matmul.rs b/src/hpc/amx_matmul.rs
@@ -586,20 +586,14 @@ pub fn matmul_i8_to_i32(
     let mut c = vec![0i32; m * n];
 
     if amx_available() && m % 16 == 0 && n % 16 == 0 && k % 64 == 0 {
-        // AMX TDPBUSD path: shift LHS i8 → u8 via (+128), tile-GEMM into
-        // i32, subtract bias 128·colsum(B). The tile kernel zeroes its
-        // internal accumulator (TILEZERO + TDPBUSD accumulate); we need
-        // fresh per-tile output here so we tile manually over M/N and
-        // call int8_tile_gemm_16x16 per (i, j) block.
+        // Tier 1 — AMX TDPBUSD tile path: shift LHS i8 → u8 (+128),
+        // tile-GEMM via int8_tile_gemm_16x16, subtract bias.
         let a_u8: Vec<u8> = a_i8.iter().map(|&v| (v as i32 + 128) as u8).collect();
 
-        // B sub-block extraction per j-tile (B is row-major K × N; the
-        // tile kernel wants K × 16 contiguous). Reused across i-tiles.
         let mut b_tile = vec![0i8; k * 16];
         let mut tile_c = vec![0i32; 256];
 
         for j_tile in (0..n).step_by(16) {
-            // Pack B[0..k, j_tile..j_tile+16] into 16-wide K-rows.
             for kk in 0..k {
                 let row = kk * n + j_tile;
                 b_tile[kk * 16..(kk + 1) * 16]
@@ -609,29 +603,27 @@ pub fn matmul_i8_to_i32(
                 let a_tile = &a_u8[i_tile * k..(i_tile + 16) * k];
                 tile_c.fill(0);
                 crate::hpc::int8_tile_gemm::int8_tile_gemm_16x16(a_tile, &b_tile, &mut tile_c, k);
-                // Write tile_c (16 × 16) into c at (i_tile, j_tile).
                 for ii in 0..16 {
                     let dst_off = (i_tile + ii) * n + j_tile;
                     c[dst_off..dst_off + 16].copy_from_slice(&tile_c[ii * 16..(ii + 1) * 16]);
                 }
             }
         }
-
-        // Subtract bias: c[i, j] -= 128 · colsum(B[:, j]).
-        let mut colsum = vec![0i32; n];
-        for p in 0..k {
-            for j in 0..n {
-                colsum[j] += b_i8[p * n + j] as i32;
-            }
-        }
-        for i in 0..m {
-            for j in 0..n {
-                c[i * n + j] -= 128 * colsum[j];
-            }
+        subtract_i8_to_u8_bias(&mut c, &b_i8, m, n, k);
+    } else if cfg!(target_arch = "x86_64") && std::is_x86_feature_detected!("avx512vnni") {
+        // Tier 2 — AVX-512 VPDPBUSD zmm: 64 MACs per instruction, no
+        // shape-alignment requirement (M/N/K all handled via per-block
+        // trim and scalar K-tail). Same sign-shift bias trick as AMX.
+        let a_u8: Vec<u8> = a_i8.iter().map(|&v| (v as i32 + 128) as u8).collect();
+        // SAFETY: runtime feature-detected avx512vnni above.
+        unsafe {
+            crate::hpc::int8_tile_gemm::int8_gemm_vpdpbusd_zmm(&a_u8, &b_i8, &mut c, m, n, k);
         }
+        subtract_i8_to_u8_bias(&mut c, &b_i8, m, n, k);
     } else {
-        // Scalar i8×i8 → i32 reference — used for non-AMX hosts and for
-        // shapes that don't fit the 16/16/64 tile alignment.
+        // Tier 3 — Scalar i8×i8 → i32 reference for non-x86 hosts,
+        // pre-AVX-512 silicon, or shapes that don't satisfy either of
+        // the SIMD tiers' alignment requirements.
         for i in 0..m {
             for p in 0..k {
                 let av = a_i8[i * k + p] as i32;
@@ -653,6 +645,27 @@ pub fn matmul_i8_to_i32(
     Ok(())
 }
 
+/// Remove the `+128` LHS sign-shift bias: `c[i, j] -= 128 · colsum(B[:, j])`.
+///
+/// Both the AMX and AVX-512-VNNI arms of `matmul_i8_to_i32` feed
+/// `A_u8 = A_i8 + 128`, so `A_u8 · B = A_i8 · B + 128 · sum_k B[k, j]`.
+/// `c` is indexed as row-major M × N and `b_i8` as row-major K × N.
+/// Wrapping i32 ops are used because VPDPBUSD accumulates modulo 2^32:
+/// the exact result is recovered even when the biased sum wrapped.
+fn subtract_i8_to_u8_bias(c: &mut [i32], b_i8: &[i8], m: usize, n: usize, k: usize) {
+    let mut colsum = vec![0i32; n];
+    for p in 0..k {
+        for j in 0..n {
+            colsum[j] = colsum[j].wrapping_add(b_i8[p * n + j] as i32);
+        }
+    }
+    for i in 0..m {
+        for j in 0..n {
+            c[i * n + j] = c[i * n + j].wrapping_sub(colsum[j].wrapping_mul(128));
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/hpc/int8_tile_gemm.rs b/src/hpc/int8_tile_gemm.rs
@@ -101,6 +101,111 @@ unsafe fn amx_path(a_u8: &[u8], b_vnni: &[i8], c: &mut [i32], k: usize) {
     tile_release();
 }
 
+// ═════════════════════════════════════════════════════════════════════
+// VPDPBUSD-zmm middle tier (avx512vnni without AMX)
+// ═════════════════════════════════════════════════════════════════════
+
+/// AVX-512 VNNI `u8 × i8 → i32` GEMM kernel for arbitrary M × N × K.
+///
+/// Operands are indexed row-major: `a_u8` is M × K, `b_i8` is K × N
+/// and `c` is M × N. One `_mm512_dpbusd_epi32` updates 16 i32
+/// accumulator lanes with the sum of 4 `u8 × i8` products each =
+/// **64 MACs per instruction**. B is pre-packed in VNNI quad layout
+/// once per j-block (16-wide column band) and reused by all M rows.
+///
+/// K-tail (when K is not a multiple of 4) handled with scalar
+/// u8 × i8 multiplies per output cell, added with wrapping adds to
+/// match the wrap-around accumulation of VPDPBUSD; N-tail (fewer
+/// than 16 valid columns in the j-block) handled by trimming the store.
+///
+/// Middle dispatch tier between AMX TDPBUSD (Sapphire Rapids+) and
+/// the scalar reference: covers Cooper Lake, Cascade Lake, Ice
+/// Lake-SP and Zen 4+ silicon that has avx512vnni but not AMX.
+/// Mirrors the VDPBF16PS arm structure shipped for BF16 in PR #182.
+///
+/// Output behavior: overwrites `c[..m * n]` (does NOT accumulate),
+/// so the caller does not need to zero `c` before the call. Slice
+/// indexing panics if an operand is too short for the accessed range.
+///
+/// # Safety
+/// Caller must have feature-detected `avx512vnni + avx512f` at runtime.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512vnni,avx512f")]
+pub unsafe fn int8_gemm_vpdpbusd_zmm(a_u8: &[u8], b_i8: &[i8], c: &mut [i32], m: usize, n: usize, k: usize) {
+    use core::arch::x86_64::{
+        __m512i, _mm512_dpbusd_epi32, _mm512_loadu_si512, _mm512_set1_epi32, _mm512_setzero_si512, _mm512_storeu_si512,
+    };
+
+    let k_quads = k / 4;
+    let k_tail = k % 4;
+
+    // Pre-pack scratch for B columns of the current j-block:
+    // 16 i32 lanes per k_quad, each holding 4 consecutive K-bytes
+    // (B rows 4q..4q+4 of output column j_base + lane, q = k_quad).
+    let mut b_col_quads = vec![0i32; k_quads.max(1) * 16];
+    // Scratch for the 16-wide store + N-tail trim.
+    let mut out_buf = [0i32; 16];
+
+    for j_base in (0..n).step_by(16) {
+        let j_count = 16.min(n - j_base);
+
+        // Pack B[0..4*k_quads, j_base..j_base+j_count] quad-interleaved.
+        // Lanes j >= j_count (the N-tail of this j-block) are zeroed so
+        // they never carry stale bytes from the previous j-block; they
+        // still take VPDPBUSD updates but are not stored back.
+        for k_quad in 0..k_quads {
+            let row0 = 4 * k_quad * n;
+            let row1 = (4 * k_quad + 1) * n;
+            let row2 = (4 * k_quad + 2) * n;
+            let row3 = (4 * k_quad + 3) * n;
+            for jj in 0..j_count {
+                let b0 = b_i8[row0 + j_base + jj] as u8 as u32;
+                let b1 = b_i8[row1 + j_base + jj] as u8 as u32;
+                let b2 = b_i8[row2 + j_base + jj] as u8 as u32;
+                let b3 = b_i8[row3 + j_base + jj] as u8 as u32;
+                // Pack as i32: bottom byte is k_quad*4+0, top is k_quad*4+3.
+                b_col_quads[k_quad * 16 + jj] = (b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)) as i32;
+            }
+            for jj in j_count..16 {
+                b_col_quads[k_quad * 16 + jj] = 0;
+            }
+        }
+
+        for i in 0..m {
+            let mut acc = _mm512_setzero_si512();
+            let a_row_off = i * k;
+            for k_quad in 0..k_quads {
+                // Broadcast A[i, 4*k_quad..4*k_quad+4] (4 u8) across all
+                // 16 i32 lanes via _mm512_set1_epi32.
+                let a0 = a_u8[a_row_off + 4 * k_quad] as u32;
+                let a1 = a_u8[a_row_off + 4 * k_quad + 1] as u32;
+                let a2 = a_u8[a_row_off + 4 * k_quad + 2] as u32;
+                let a3 = a_u8[a_row_off + 4 * k_quad + 3] as u32;
+                let packed_a = a0 | (a1 << 8) | (a2 << 16) | (a3 << 24);
+                let a_v = _mm512_set1_epi32(packed_a as i32);
+                let b_v = _mm512_loadu_si512(b_col_quads.as_ptr().add(k_quad * 16) as *const __m512i);
+                acc = _mm512_dpbusd_epi32(acc, a_v, b_v);
+            }
+            _mm512_storeu_si512(out_buf.as_mut_ptr() as *mut __m512i, acc);
+
+            // K-tail: scalar MACs for k = k_quads*4 .. k (wrapping, like VPDPBUSD).
+            if k_tail > 0 {
+                for kk in (k_quads * 4)..k {
+                    let a_val = a_u8[a_row_off + kk] as i32;
+                    let tail_row = kk * n;
+                    for jj in 0..j_count {
+                        out_buf[jj] = out_buf[jj].wrapping_add(a_val * b_i8[tail_row + j_base + jj] as i32);
+                    }
+                }
+            }
+
+            // Store j_count valid lanes (drops N-tail padding lanes).
+            let dst_off = i * n + j_base;
+            c[dst_off..dst_off + j_count].copy_from_slice(&out_buf[..j_count]);
+        }
+    }
+}
+
 // ═════════════════════════════════════════════════════════════════════
 // Scalar fallback (i32 reference)
 // ═════════════════════════════════════════════════════════════════════
@@ -192,6 +297,48 @@ mod tests {
         }
     }
 
+    /// Direct test for the VPDPBUSD-zmm arm, exercising the path the
+    /// `matmul_i8_to_i32` dispatcher would skip when AMX is available.
+    /// Verifies bit-exact parity against the scalar reference for
+    /// arbitrary (M, N, K): K-tail (k % 4 != 0), N-tail (n % 16 != 0),
+    /// K < 4 (no full quad, so only the scalar tail runs), and that
+    /// the kernel overwrites stale contents of `c`.
+    #[cfg(target_arch = "x86_64")]
+    #[test]
+    fn vpdpbusd_zmm_matches_scalar() {
+        if !std::is_x86_feature_detected!("avx512vnni") || !std::is_x86_feature_detected!("avx512f") {
+            eprintln!("avx512vnni not detected; skipping");
+            return;
+        }
+
+        fn ref_gemm(a: &[u8], b: &[i8], m: usize, n: usize, k: usize) -> Vec<i32> {
+            let mut c = vec![0i32; m * n];
+            for i in 0..m {
+                for kk in 0..k {
+                    let av = a[i * k + kk] as i32;
+                    for j in 0..n {
+                        c[i * n + j] += av * b[kk * n + j] as i32;
+                    }
+                }
+            }
+            c
+        }
+
+        // Sweep shapes spanning aligned cases, K-tail (k % 4), N-tail
+        // (n % 16), and K < 4 (quad loop skipped) to hit every path.
+        for (m, n, k) in [(16, 16, 64), (3, 5, 7), (17, 33, 100), (1, 17, 12), (8, 16, 4), (2, 3, 3)] {
+            let a: Vec<u8> = (0..m * k).map(|i| ((i * 31 + 7) % 256) as u8).collect();
+            let b: Vec<i8> = (0..k * n)
+                .map(|i| ((i * 17 + 3) % 256) as u8 as i8)
+                .collect();
+            let expected = ref_gemm(&a, &b, m, n, k);
+            let mut got = vec![i32::MIN; m * n]; // sentinel: kernel must overwrite, not accumulate
+            // SAFETY: avx512vnni + avx512f confirmed at the top of the test.
+            unsafe { int8_gemm_vpdpbusd_zmm(&a, &b, &mut got, m, n, k) };
+            assert_eq!(got, expected, "VPDPBUSD-zmm mismatch at (M={}, N={}, K={})", m, n, k);
+        }
+    }
+
     #[test]
     fn vnni_pack_i8_roundtrip() {
         // Pack then verify the VNNI layout matches the spec: