diff --git a/.gitignore b/.gitignore
index d86432bc..fe785b7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,8 +13,15 @@ benchmark-delta.md
 benchmark-relative.json
 fuzz/corpus
 zstd/fuzz/corpus
+# Fresh fuzz crashes from local `cargo fuzz run` sessions are CI/local
+# scratch — they live in the runner. Pre-existing tracked artifacts
+# under `decode/`, `fse/`, `huff0/` keep working because git tracks
+# them by index regardless of this pattern; new untracked crashes
+# stop polluting `git status`.
+zstd/fuzz/artifacts/**
 .idea
 /=
 AGENTS.md
 .claude/
+.local/
 **/__pycache__/
diff --git a/Cargo.toml b/Cargo.toml
index fbc1467f..ca0ac7cb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,3 +1,16 @@
 [workspace]
 resolver = "3"
 members = ["zstd", "cli"]
+
+# Bench profile inherits from release but overrides LTO/codegen-units so
+# the perf measurement reflects full cross-crate inlining. We deliberately
+# scope this to `[profile.bench]` instead of `[profile.release]`: the
+# release profile is shared by the `cli` binary and every downstream
+# consumer of this workspace, and fat LTO + codegen-units = 1 multiplies
+# release build times ~5× and significantly increases peak compile RAM.
+# Bench builds run on CI with the wall clock budget for it; everyday
+# `cargo build --release` (and downstream consumers) should not pay that
+# cost without explicitly opting in.
+[profile.bench]
+lto = "fat"
+codegen-units = 1
diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs
index 3c11dd28..37ac8162 100644
--- a/zstd/benches/compare_ffi.rs
+++ b/zstd/benches/compare_ffi.rs
@@ -213,7 +213,7 @@ fn bench_compress(c: &mut Criterion) {
     for scenario in benchmark_scenarios_cached().iter() {
         for level in supported_levels_filtered() {
             if emit_reports {
-                let rust_compressed = structured_zstd::encoding::compress_to_vec(
+                let rust_compressed = structured_zstd::encoding::compress_slice_to_vec(
                     &scenario.bytes[..],
                     level.rust_level,
                 );
@@ -230,7 +230,7 @@ fn bench_compress(c: &mut Criterion) {
 
             group.bench_function("pure_rust", |b| {
                 b.iter(|| {
-                    black_box(structured_zstd::encoding::compress_to_vec(
+                    black_box(structured_zstd::encoding::compress_slice_to_vec(
                         &scenario.bytes[..],
                         level.rust_level,
                     ))
@@ -250,8 +250,10 @@ fn bench_decompress(c: &mut Criterion) {
     let emit_reports = emit_reports_enabled();
     for scenario in benchmark_scenarios_cached().iter() {
         for level in supported_levels_filtered() {
-            let rust_compressed =
-                structured_zstd::encoding::compress_to_vec(&scenario.bytes[..], level.rust_level);
+            let rust_compressed = structured_zstd::encoding::compress_slice_to_vec(
+                &scenario.bytes[..],
+                level.rust_level,
+            );
             let ffi_compressed = ffi_encode_to_vec(&scenario.bytes[..], level.ffi_level);
             let expected_len = scenario.len();
             bench_decompress_source(
diff --git a/zstd/benches/compare_ffi_memory.rs b/zstd/benches/compare_ffi_memory.rs
index d519dbc3..8f543369 100644
--- a/zstd/benches/compare_ffi_memory.rs
+++ b/zstd/benches/compare_ffi_memory.rs
@@ -421,7 +421,10 @@ fn main() {
         for level in supported_levels_filtered() {
             // Compress
             let (rust_compressed, rust_peak) = measure_peak(|| {
-                structured_zstd::encoding::compress_to_vec(&scenario.bytes[..], level.rust_level)
+                structured_zstd::encoding::compress_slice_to_vec(
+                    &scenario.bytes[..],
+                    level.rust_level,
+                )
             });
             let (ffi_compressed, ffi_peak) =
                 measure_peak(|| ffi_encode(&scenario.bytes[..], level.ffi_level));
diff --git a/zstd/src/encoding/blocks/compressed.rs b/zstd/src/encoding/blocks/compressed.rs
index deb8fc3b..4920bcba 100644
--- a/zstd/src/encoding/blocks/compressed.rs
+++ b/zstd/src/encoding/blocks/compressed.rs
@@ -330,19 +330,19 @@ fn encode_block_parts_with_sequence_scratch<M: Matcher>(
         // Choose the tables
         let ll_mode = choose_table(
             state.fse_tables.ll_previous.as_ref(),
-            &state.fse_tables.ll_default,
+            state.fse_tables.ll_default,
             sequences.iter().map(|seq| encode_literal_length(seq.ll).0),
             9,
         );
         let ml_mode = choose_table(
             state.fse_tables.ml_previous.as_ref(),
-            &state.fse_tables.ml_default,
+            state.fse_tables.ml_default,
             sequences.iter().map(|seq| encode_match_len(seq.ml).0),
             9,
         );
         let of_mode = choose_table(
             state.fse_tables.of_previous.as_ref(),
-            &state.fse_tables.of_default,
+            state.fse_tables.of_default,
             sequences.iter().map(|seq| encode_offset(seq.of).0),
             8,
         );
@@ -540,26 +540,26 @@ fn estimate_sequences_section_bytes(
     // internally, identical decision path.
     let ll_mode = choose_table(
         fse_tables.ll_previous.as_ref(),
-        &fse_tables.ll_default,
+        fse_tables.ll_default,
         sequences.iter().map(|seq| encode_literal_length(seq.ll).0),
         9,
     );
     let ml_mode = choose_table(
         fse_tables.ml_previous.as_ref(),
-        &fse_tables.ml_default,
+        fse_tables.ml_default,
         sequences.iter().map(|seq| encode_match_len(seq.ml).0),
         9,
     );
     let of_mode = choose_table(
         fse_tables.of_previous.as_ref(),
-        &fse_tables.of_default,
+        fse_tables.of_default,
         sequences.iter().map(|seq| encode_offset(seq.of).0),
         8,
     );
 
-    let ll_bits_chosen = fse_section_bits_for_mode(&ll_mode, ll_counts, &fse_tables.ll_default);
-    let ml_bits_chosen = fse_section_bits_for_mode(&ml_mode, ml_counts, &fse_tables.ml_default);
-    let of_bits_chosen = fse_section_bits_for_mode(&of_mode, of_counts, &fse_tables.of_default);
+    let ll_bits_chosen = fse_section_bits_for_mode(&ll_mode, ll_counts, fse_tables.ll_default);
+    let ml_bits_chosen = fse_section_bits_for_mode(&ml_mode, ml_counts, fse_tables.ml_default);
+    let of_bits_chosen = fse_section_bits_for_mode(&of_mode, of_counts, fse_tables.of_default);
 
     let ll_table_desc_bytes = mode_table_description_bytes(&ll_mode);
     let ml_table_desc_bytes = mode_table_description_bytes(&ml_mode);
@@ -822,12 +822,15 @@ fn encode_raw_sequences_into(
 }
 
 fn clone_fse_tables(fse_tables: &FseTables) -> FseTables {
+    // `*_default` fields are `&'static FSETable` (pointer-sized
+    // copies, cache-backed); only the `*_previous` per-frame tables
+    // need a deep clone.
     FseTables {
-        ll_default: fse_tables.ll_default.clone(),
+        ll_default: fse_tables.ll_default,
         ll_previous: fse_tables.ll_previous.clone(),
-        ml_default: fse_tables.ml_default.clone(),
+        ml_default: fse_tables.ml_default,
         ml_previous: fse_tables.ml_previous.clone(),
-        of_default: fse_tables.of_default.clone(),
+        of_default: fse_tables.of_default,
         of_previous: fse_tables.of_previous.clone(),
     }
 }
@@ -1252,9 +1255,9 @@ fn encode_sequences(
     let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
     let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
     let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
-    let ll_table = mode_table(ll_mode, &defaults.ll_default);
-    let ml_table = mode_table(ml_mode, &defaults.ml_default);
-    let of_table = mode_table(of_mode, &defaults.of_default);
+    let ll_table = mode_table(ll_mode, defaults.ll_default);
+    let ml_table = mode_table(ml_mode, defaults.ml_default);
+    let of_table = mode_table(of_mode, defaults.of_default);
     let mut ll_state = ll_table.map(|table| table.start_state(ll_code));
     let mut ml_state = ml_table.map(|table| table.start_state(ml_code));
     let mut of_state = of_table.map(|table| table.start_state(of_code));
@@ -1707,34 +1710,34 @@ mod tests {
         );
 
         assert!(tables_match(
-            previous_table(fse_tables.ll_previous.as_ref(), &fse_tables.ll_default).unwrap(),
-            &fse_tables.ll_default
+            previous_table(fse_tables.ll_previous.as_ref(), fse_tables.ll_default).unwrap(),
+            fse_tables.ll_default
         ));
         assert!(tables_match(
-            previous_table(fse_tables.ml_previous.as_ref(), &fse_tables.ml_default).unwrap(),
-            &fse_tables.ml_default
+            previous_table(fse_tables.ml_previous.as_ref(), fse_tables.ml_default).unwrap(),
+            fse_tables.ml_default
         ));
         assert!(tables_match(
-            previous_table(fse_tables.of_previous.as_ref(), &fse_tables.of_default).unwrap(),
-            &fse_tables.of_default
+            previous_table(fse_tables.of_previous.as_ref(), fse_tables.of_default).unwrap(),
+            fse_tables.of_default
         ));
 
         let sample_codes = [0u8, 1u8];
         let ll_repeat = choose_table(
             fse_tables.ll_previous.as_ref(),
-            &fse_tables.ll_default,
+            fse_tables.ll_default,
             sample_codes.iter().copied(),
             9,
         );
         let ml_repeat = choose_table(
             fse_tables.ml_previous.as_ref(),
-            &fse_tables.ml_default,
+            fse_tables.ml_default,
             sample_codes.iter().copied(),
             9,
         );
         let of_repeat = choose_table(
             fse_tables.of_previous.as_ref(),
-            &fse_tables.of_default,
+            fse_tables.of_default,
             sample_codes.iter().copied(),
             8,
         );
@@ -1751,7 +1754,7 @@ mod tests {
         fse_tables.ll_previous = Some(PreviousFseTable::Custom(Box::new(custom)));
 
         let before = core::ptr::from_ref(
-            previous_table(fse_tables.ll_previous.as_ref(), &fse_tables.ll_default).unwrap(),
+            previous_table(fse_tables.ll_previous.as_ref(), fse_tables.ll_default).unwrap(),
         );
 
         remember_last_used_tables(
@@ -1762,7 +1765,7 @@ mod tests {
         );
 
         let after = core::ptr::from_ref(
-            previous_table(fse_tables.ll_previous.as_ref(), &fse_tables.ll_default).unwrap(),
+            previous_table(fse_tables.ll_previous.as_ref(), fse_tables.ll_default).unwrap(),
         );
 
         assert_eq!(before, after);
@@ -1777,7 +1780,7 @@ mod tests {
         let fse_tables = FseTables::new();
         let mode = choose_table(
             None,
-            &fse_tables.ll_default,
+            fse_tables.ll_default,
             core::iter::repeat_n(0u8, 32),
             9,
         );
diff --git a/zstd/src/encoding/dfast/mod.rs b/zstd/src/encoding/dfast/mod.rs
index 1485afd3..fb6d56c9 100644
--- a/zstd/src/encoding/dfast/mod.rs
+++ b/zstd/src/encoding/dfast/mod.rs
@@ -16,10 +16,10 @@ use core::convert::TryInto;
 
 use super::Sequence;
 use super::blocks::encode_offset_with_history;
-use super::incompressible::{block_looks_incompressible, block_looks_incompressible_strict};
+use super::incompressible::block_looks_incompressible;
 use super::match_generator::{
     DFAST_EMPTY_SLOT, DFAST_HASH_BITS, DFAST_INCOMPRESSIBLE_SKIP_STEP, DFAST_LOCAL_SKIP_TRIGGER,
-    DFAST_MAX_SKIP_STEP, DFAST_MIN_MATCH_LEN, DFAST_REBASE_GUARD_BAND, DFAST_SEARCH_DEPTH,
+    DFAST_MAX_SKIP_STEP, DFAST_MIN_MATCH_LEN, DFAST_REBASE_GUARD_BAND, DFAST_SHORT_HASH_BITS_DELTA,
     DFAST_SHORT_HASH_LOOKAHEAD, DFAST_SKIP_STEP_GROWTH_INTERVAL, DFAST_TARGET_LEN, MIN_WINDOW_LOG,
 };
 use super::match_table::helpers::{
@@ -29,9 +29,38 @@ use super::match_table::helpers::{
 use super::match_table::storage::check_stream_abs_headroom;
 use super::opt::types::MatchCandidate;
 
+/// Donor `HASH_READ_SIZE` (`zstd_compress_internal.h`): the largest probe
+/// width any hash / equality check in the dfast hot path reads at once.
+/// Loop guards must stop scanning when fewer than `HASH_READ_SIZE` bytes
+/// remain ahead of the probe cursor, matching donor `ilimit = iend -
+/// HASH_READ_SIZE`. The `DFAST_MIN_MATCH_LEN = 5` floor is the match
+/// acceptance threshold, NOT a safe loop bound — using it as the loop
+/// guard reads up to 3 bytes past the live history end and is UB on a
+/// raw pointer load.
+const HASH_READ_SIZE: usize = 8;
+
+/// Rep-extension minimum match length. The upstream zstd 1.5.7
+/// reference (`zstd_double_fast.c:191`, rep1 emit
+/// `ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4`) accepts 4-byte
+/// rep hits even though hash-search matches require 5 bytes
+/// (`DFAST_MIN_MATCH_LEN`) — rep coding has no on-wire offset cost,
+/// so a 4-byte rep is a net win over re-running the hash probe. Both
+/// the fast-loop inline rep1 peek and the post-match
+/// `extend_with_repcode_after_match` chain must gate on this floor;
+/// using the hash-search floor on either site silently drops 4-byte
+/// rep emissions that upstream produces.
+const DFAST_REP_MIN_MATCH_LEN: usize = 4;
+
 pub(crate) struct DfastMatchGenerator {
     pub(crate) max_window_size: usize,
-    pub(crate) window: VecDeque<Vec<u8>>,
+    /// Per-block length queue. Previously held the raw input
+    /// `VecDeque<Vec<u8>>` for each appended block — that duplicated
+    /// every byte in `history`, doubling the input footprint relative
+    /// to donor on the hot path. Now stores only the lengths so the
+    /// matcher can still pop blocks block-by-block on eviction
+    /// (advancing `history_start` by each pop) while the actual byte
+    /// storage lives once, in `history`.
+    pub(crate) window_blocks: VecDeque<usize>,
     pub(crate) window_size: usize,
     // We keep a contiguous searchable history to avoid rebuilding and reseeding
     // the matcher state from disjoint block buffers on every block.
@@ -39,17 +68,20 @@ pub(crate) struct DfastMatchGenerator {
     pub(crate) history_start: usize,
     pub(crate) history_abs_start: usize,
     pub(crate) offset_hist: [u32; 3],
-    // Storage: `[u32; DFAST_SEARCH_DEPTH]` per bucket. Each slot holds
-    // a +1-biased relative position (`(abs_pos - position_base + 1) as
-    // u32`); the empty-slot sentinel `DFAST_EMPTY_SLOT = 0` is
-    // therefore never a real value. Trades 4 bytes per slot (vs the
-    // previous `usize`) — at `DFAST_SEARCH_DEPTH = 4` that's 16 bytes
-    // per bucket vs the prior 32 bytes, halving the hash-table
-    // footprint. Combined with the donor-parity `hashLog = 17` ceiling
-    // this brings the two-table memory cost from 64 MiB to ~4 MiB on
-    // 1 MiB inputs — close to donor's 768 KiB for the same level.
-    pub(crate) short_hash: Vec<[u32; DFAST_SEARCH_DEPTH]>,
-    pub(crate) long_hash: Vec<[u32; DFAST_SEARCH_DEPTH]>,
+    // Storage: single `u32` per bucket — donor-parity overwrite-on-
+    // collision. Each slot holds a +1-biased relative position
+    // (`(abs_pos - position_base + 1) as u32`); `DFAST_EMPTY_SLOT = 0`
+    // is therefore never a real value. The two tables are sized
+    // independently: `long_hash` (8-byte hash) uses `long_hash_bits`
+    // (donor `hashTable`); `short_hash` (4-byte hash) uses
+    // `short_hash_bits` = `long - 1` (donor `chainTable` for dfast).
+    // Donor parity at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`. The
+    // ratio loss from single-slot is compensated by donor's
+    // `_search_next_long` retry — after a short-hash hit, the search
+    // probes long_hash at `ip + 1` and picks the longer of the two
+    // (see `hash_candidate`, invoked via `best_match`, for the retry).
+    pub(crate) short_hash: Vec<u32>,
+    pub(crate) long_hash: Vec<u32>,
     /// Absolute position whose `(abs_pos - position_base + 1)` slot
     /// encoding evaluates to `1`. Advances only via [`Self::reduce`]
     /// when an insert is about to overflow the u32 window — the
@@ -60,11 +92,19 @@ pub(crate) struct DfastMatchGenerator {
     /// a single matcher instance. Donor parity: `ZSTD_window_reduce`
     /// (`zstd_compress_internal.h`).
     pub(crate) position_base: usize,
-    pub(crate) hash_bits: usize,
-    /// Cached fastpath kernel for `hash_mix_u64`. Resolved once at `new()`
-    /// and reused on every `hash_index` call so we skip the per-call
-    /// `OnceLock` atomic load that `dispatch_hash_mix_u64` would pay.
-    pub(crate) hash_kernel: crate::encoding::fastpath::FastpathKernel,
+    /// Long-hash table bit-width — `long_hash.len() == 1 <<
+    /// long_hash_bits`. Donor parity with `cParams.hashLog` (17 for
+    /// Level 3 large input, 16 for Level 2; see `clevels.h`).
+    pub(crate) long_hash_bits: usize,
+    /// Short-hash table bit-width — `short_hash.len() == 1 <<
+    /// short_hash_bits`. Default is `long_hash_bits -
+    /// DFAST_SHORT_HASH_BITS_DELTA`, donor parity with
+    /// `cParams.chainLog` for dfast levels (one bit smaller than the
+    /// long hash). Halves the short-table footprint without losing
+    /// measurable ratio — the 4-byte short hash overwrites less
+    /// frequently than the 8-byte long hash on average, so the
+    /// smaller bucket count is the donor-correct sizing.
+    pub(crate) short_hash_bits: usize,
     pub(crate) use_fast_loop: bool,
     // Lazy match lookahead depth (internal tuning parameter).
     pub(crate) lazy_depth: u8,
@@ -82,7 +122,7 @@ impl DfastMatchGenerator {
     pub(crate) fn new(max_window_size: usize) -> Self {
         Self {
             max_window_size,
-            window: VecDeque::new(),
+            window_blocks: VecDeque::new(),
             window_size: 0,
             history: Vec::new(),
             history_start: 0,
@@ -91,20 +131,32 @@ impl DfastMatchGenerator {
             short_hash: Vec::new(),
             long_hash: Vec::new(),
             position_base: 0,
-            hash_bits: DFAST_HASH_BITS,
-            hash_kernel: crate::encoding::fastpath::select_kernel(),
+            long_hash_bits: DFAST_HASH_BITS,
+            short_hash_bits: DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA,
             use_fast_loop: false,
             lazy_depth: 1,
         }
     }
 
+    /// Set both hash table sizes. `bits` is the long-hash bit count
+    /// (donor `cParams.hashLog`); the short hash is derived as
+    /// `bits - DFAST_SHORT_HASH_BITS_DELTA`, donor-correct for dfast
+    /// levels. Both clamps stay above `MIN_WINDOW_LOG` so very small
+    /// windows don't underflow.
     pub(crate) fn set_hash_bits(&mut self, bits: usize) {
-        let clamped = bits.clamp(MIN_WINDOW_LOG as usize, DFAST_HASH_BITS);
-        if self.hash_bits != clamped {
-            self.hash_bits = clamped;
-            self.short_hash = Vec::new();
+        let min_bits = MIN_WINDOW_LOG as usize;
+        let long_clamped = bits.clamp(min_bits, DFAST_HASH_BITS);
+        let short_clamped = long_clamped
+            .saturating_sub(DFAST_SHORT_HASH_BITS_DELTA)
+            .max(min_bits);
+        if self.long_hash_bits != long_clamped {
+            self.long_hash_bits = long_clamped;
             self.long_hash = Vec::new();
         }
+        if self.short_hash_bits != short_clamped {
+            self.short_hash_bits = short_clamped;
+            self.short_hash = Vec::new();
+        }
     }
 
     /// Encode an absolute position into a u32 slot value
@@ -161,23 +213,21 @@ impl DfastMatchGenerator {
     /// Advance `position_base` by the same amount so future inserts
     /// continue from the rebased origin.
     fn reduce(&mut self, reducer: u32) {
-        let shift_buckets = |buckets: &mut Vec<[u32; DFAST_SEARCH_DEPTH]>| {
-            for bucket in buckets.iter_mut() {
-                for slot in bucket.iter_mut() {
-                    *slot = if *slot <= reducer {
-                        DFAST_EMPTY_SLOT
-                    } else {
-                        *slot - reducer
-                    };
-                }
+        let shift_slots = |slots: &mut [u32]| {
+            for slot in slots.iter_mut() {
+                *slot = if *slot <= reducer {
+                    DFAST_EMPTY_SLOT
+                } else {
+                    *slot - reducer
+                };
             }
         };
-        shift_buckets(&mut self.short_hash);
-        shift_buckets(&mut self.long_hash);
+        shift_slots(&mut self.short_hash);
+        shift_slots(&mut self.long_hash);
         self.position_base += reducer as usize;
     }
 
-    pub(crate) fn reset(&mut self, mut reuse_space: impl FnMut(Vec<u8>)) {
+    pub(crate) fn reset(&mut self) {
         self.window_size = 0;
         self.history.clear();
         self.history_start = 0;
@@ -185,48 +235,175 @@ impl DfastMatchGenerator {
         self.position_base = 0;
         self.offset_hist = [1, 4, 8];
         if !self.short_hash.is_empty() {
-            self.short_hash.fill([DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]);
-            self.long_hash.fill([DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]);
-        }
-        for mut data in self.window.drain(..) {
-            data.resize(data.capacity(), 0);
-            reuse_space(data);
+            self.short_hash.fill(DFAST_EMPTY_SLOT);
+            self.long_hash.fill(DFAST_EMPTY_SLOT);
         }
+        // No Vec<u8> blocks to recycle: `add_data` returns each input
+        // Vec to the caller eagerly via its own `reuse_space`, and the
+        // history Vec is owned solely by the matcher. There is nothing
+        // for an outer pool helper to do at reset time, so the dfast
+        // signature does not take one (HC / Row do because they hold
+        // per-block input Vecs internally; the dispatcher in
+        // `match_generator.rs` resolves the per-backend shape).
+        self.window_blocks.clear();
     }
 
+    /// Slice of bytes from the most recently appended block. Returns
+    /// the trailing `last_block_len` bytes of `history`, or an empty
+    /// slice if no block has been ingested yet.
+    ///
+    /// Mirrors the inline gate pattern used by `skip_matching` /
+    /// `skip_matching_dense` / `start_matching` / `emit_candidate` /
+    /// `emit_trailing_literals`: read
+    /// `window_blocks.back().copied().unwrap_or(0)` and slice the
+    /// trailing `last_len` bytes (which is empty when `last_len == 0`).
+    /// All current external callers — streaming encoder, block
+    /// compressor, per-level helpers — invoke this only after at
+    /// least one `add_data`, but returning an empty slice on the
+    /// empty case keeps the trait surface aligned with the internal
+    /// usage and avoids a panic-vs-gate divergence that would
+    /// surprise a future refactor consolidating the call sites.
     pub(crate) fn get_last_space(&self) -> &[u8] {
-        self.window.back().unwrap().as_slice()
+        let last_len = self.window_blocks.back().copied().unwrap_or(0);
+        &self.history[self.history.len() - last_len..]
     }
 
     pub(crate) fn add_data(&mut self, data: Vec<u8>, mut reuse_space: impl FnMut(Vec<u8>)) {
         assert!(data.len() <= self.max_window_size);
+        // Run the headroom check first so the safety invariant
+        // (`history_abs_start + window_size + len + STREAM_ABS_HEADROOM
+        // <= usize::MAX`) is enforced at the function boundary, not
+        // hidden behind the empty-chunk short-circuit below. With
+        // `data.len() == 0` the check is a cheap no-op on the cumulative
+        // state today, but keeping the call here means the invariant
+        // doesn't depend on "empty data implies nothing changes"
+        // reasoning if a future change ever attaches side effects to
+        // the empty path.
         check_stream_abs_headroom(self.history_abs_start, self.window_size, data.len());
+        // Empty chunks have nothing to record: pushing a `0` into
+        // `window_blocks` would let a streaming caller that flushes
+        // empty chunks grow the deque without bound (`window_size`
+        // stays unchanged so `trim_to_window` never has cause to
+        // evict the zero-length entries). Hand the Vec straight back
+        // to the pool and short-circuit.
+        //
+        // Side effect: this short-circuits BEFORE the eviction `while`
+        // loop below. If a caller shrinks `max_window_size` and then
+        // calls `add_data(vec![])` hoping to trigger trim, the trim
+        // won't fire here. Use `trim_to_window` directly for that
+        // case — it's the dedicated path for shedding retained bytes
+        // and now actually frees the prefix (via `split_off`) instead
+        // of leaving it pinned in the `history` allocation.
+        //
+        // In-tree caller audit (`grep -rn '\.commit_space(' src/encoding/`):
+        // every production path that reaches the driver's
+        // `commit_space` → `add_data` chain originates in
+        // `levels/fastest.rs`'s block emitter, which produces
+        // non-empty blocks gated by `should_emit_raw_fast_path` /
+        // RLE-detect on the source bytes — none of them pass
+        // `Vec::new()`. The streaming encoder's block-sourcing loop
+        // also filters empty reads before forwarding to the matcher.
+        // Tests are the only callers that exercise the empty path
+        // explicitly, and the regression covers eviction-driven trim
+        // semantics through `trim_to_window` directly. So this
+        // behaviour change is observable only by a hypothetical
+        // future caller that relies on `add_data(empty)` as a
+        // side-effecting trim trigger — and we deliberately want
+        // such a caller to use `trim_to_window` instead.
+        if data.is_empty() {
+            reuse_space(data);
+            return;
+        }
         while self.window_size + data.len() > self.max_window_size {
-            let removed = self.window.pop_front().unwrap();
-            self.window_size -= removed.len();
-            self.history_start += removed.len();
-            self.history_abs_start += removed.len();
-            reuse_space(removed);
+            let removed_len = self.window_blocks.pop_front().unwrap();
+            self.window_size -= removed_len;
+            self.history_start += removed_len;
+            self.history_abs_start += removed_len;
         }
         self.compact_history();
         self.history.extend_from_slice(&data);
         self.window_size += data.len();
-        self.window.push_back(data);
+        self.window_blocks.push_back(data.len());
+        // Eager Vec recycle: the only purpose of holding the input Vec
+        // was to return it to the caller's pool on eviction. Now that
+        // `history` owns the bytes, hand the Vec back immediately so
+        // the pool grows on first add instead of waiting for window
+        // overflow.
+        reuse_space(data);
     }
 
-    pub(crate) fn trim_to_window(&mut self, mut reuse_space: impl FnMut(Vec<u8>)) {
+    /// Trim retained blocks until the window fits `max_window_size`.
+    ///
+    /// Unlike `MatchGenerator::trim_to_window`,
+    /// `RowMatchGenerator::trim_to_window`, and
+    /// `HcMatchGenerator::trim_to_window`, this backend does NOT take a
+    /// `reuse_space` callback because it doesn't retain per-block
+    /// `Vec<u8>` storage to recycle (history is the sole byte buffer
+    /// and `add_data` returns each input Vec eagerly). The dispatcher
+    /// in `match_generator.rs` knows the variant and threads the right
+    /// signature; callers needing the eviction byte count derive it
+    /// from the `window_size` delta before/after this call.
+    ///
+    /// The explicit-trim-before-idle path is the reason this helper
+    /// exists: a caller that trims to shed memory before a long
+    /// quiescent period must see the resident size drop immediately,
+    /// not "eventually, on the next ingest".
+    ///
+    /// `compact_history` is NOT the right tool for that — it uses
+    /// `Vec::drain(..history_start)`, which moves elements down in
+    /// the existing allocation and leaves capacity untouched (per
+    /// `Vec` docs, only `shrink_to_fit` releases capacity). So even
+    /// when compact ran, the original buffer stayed alive. Instead,
+    /// rebuild `history` via `split_off`: it allocates a fresh
+    /// buffer sized to the retained suffix, and the assignment
+    /// drops the original (full-capacity) buffer. On a normal block
+    /// loop `add_data` will compact again the next iter — the cost
+    /// is one extra realloc on the trim boundary in exchange for
+    /// actually shedding the prefix to the system allocator.
+    ///
+    /// Unlike `HashChainTable::trim_to_window` /
+    /// `RowMatcher::trim_to_window` this signature deliberately takes
+    /// no `reuse_space` callback — Dfast stores its raw bytes in the
+    /// single contiguous `history` buffer (no per-block `Vec<u8>` to
+    /// recycle), releases the dead prefix via `split_off`, and
+    /// surfaces eviction byte count to the dispatcher via the
+    /// `window_size` delta rather than a callback. A uniform-signature
+    /// trim helper would force every call site to monomorphize an
+    /// `impl FnMut(Vec<u8>)` that is documented to never fire, paying
+    /// codegen + inlining cost on the cold path for zero behaviour
+    /// difference; the dispatcher in `match_generator.rs` already
+    /// branches per backend (matchers diverge on `reset` and a few
+    /// other lifecycle calls) so adding one more per-backend arm is
+    /// free.
+    pub(crate) fn trim_to_window(&mut self) {
         while self.window_size > self.max_window_size {
-            let removed = self.window.pop_front().unwrap();
-            self.window_size -= removed.len();
-            self.history_start += removed.len();
-            self.history_abs_start += removed.len();
-            reuse_space(removed);
+            let removed_len = self.window_blocks.pop_front().unwrap();
+            self.window_size -= removed_len;
+            self.history_start += removed_len;
+            self.history_abs_start += removed_len;
+        }
+        if self.history_start != 0 {
+            // `split_off` returns the suffix in a fresh allocation;
+            // the original Vec (still owning [..history_start]) is
+            // dropped on the assignment below, releasing the dead
+            // prefix back to the allocator.
+            self.history = self.history.split_off(self.history_start);
+            self.history_start = 0;
         }
     }
 
     pub(crate) fn skip_matching(&mut self, incompressible_hint: Option<bool>) {
         self.ensure_hash_tables();
-        let current_len = self.window.back().unwrap().len();
+        let current_len = self.window_blocks.back().copied().unwrap_or(0);
+        if current_len == 0 {
+            // `add_data` short-circuits on empty input and does NOT push
+            // a zero-length entry onto `window_blocks`. A caller that
+            // invokes skip-matching after a streaming flush of an empty
+            // chunk would otherwise re-seed the previous block's
+            // retained tail on every empty write. Mirror the gate that
+            // `start_matching` already uses.
+            return;
+        }
         let current_abs_start = self.history_abs_start + self.window_size - current_len;
         let current_abs_end = current_abs_start + current_len;
         let tail_start = current_abs_start.saturating_sub(Self::BOUNDARY_DENSE_TAIL_LEN);
@@ -260,7 +437,15 @@ impl DfastMatchGenerator {
 
     pub(crate) fn skip_matching_dense(&mut self) {
         self.ensure_hash_tables();
-        let current_len = self.window.back().unwrap().len();
+        let current_len = self.window_blocks.back().copied().unwrap_or(0);
+        if current_len == 0 {
+            // Same gate as `skip_matching` and `start_matching`: empty
+            // chunks fed through `add_data` no longer push a block
+            // entry, so a streaming caller that flushes empty chunks
+            // would otherwise re-seed the retained tail on every
+            // empty write.
+            return;
+        }
         let current_abs_start = self.history_abs_start + self.window_size - current_len;
         let current_abs_end = current_abs_start + current_len;
         let backfill_start = current_abs_start
@@ -275,7 +460,7 @@ impl DfastMatchGenerator {
     pub(crate) fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
         self.ensure_hash_tables();
 
-        let current_len = self.window.back().unwrap().len();
+        let current_len = self.window_blocks.back().copied().unwrap_or(0);
         if current_len == 0 {
             return;
         }
@@ -308,10 +493,17 @@ impl DfastMatchGenerator {
         //    the dynamic bound is `current_len` itself — the `while
         //    pos + DFAST_MIN_MATCH_LEN <= current_len` guard keeps
         //    every offset within that bound regardless of how the
-        //    caller sized `max_window_size`. In production this also
-        //    happens to be `≤ HC_BLOCKSIZE_MAX (128 KiB)` because the
-        //    frame compressor never hands out larger blocks, but the
-        //    safety argument above does not rely on that limit.
+        //    caller sized `max_window_size`. The general path's
+        //    `best_match` → `hash_candidate` chain uses SAFE slicing
+        //    (`data[..8].try_into()`) which panics on a short slice
+        //    rather than reading uninitialised bytes — UB-free even
+        //    when this guard lets `pos` land within 3 bytes of the
+        //    block tail. The fast loop has stricter `+ HASH_READ_SIZE`
+        //    guards because it does raw-pointer `read_unaligned` for
+        //    speed; see `start_matching_fast_loop`. In production this
+        //    also happens to be `≤ HC_BLOCKSIZE_MAX (128 KiB)` because
+        //    the frame compressor never hands out larger blocks, but
+        //    the safety argument above does not rely on that limit.
         // 2. Absolute-position arithmetic (`current_abs_start + pos`):
         //    `current_abs_start` is the frame-lifetime cursor and
         //    advances with total bytes processed, NOT with the
@@ -382,62 +574,149 @@ impl DfastMatchGenerator {
         current_len: usize,
         handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
     ) {
-        let block_is_strict_incompressible = self
-            .block_looks_incompressible_strict(current_abs_start, current_abs_start + current_len);
+        // Behaviour change vs the pre-refactor `start_matching_general`:
+        // this fast loop deliberately drops the strict-incompressible
+        // early-skip path (the `block_looks_incompressible_strict` short
+        // circuit + `miss_run` / `DFAST_LOCAL_SKIP_TRIGGER` thresholding).
+        // The step ramp is now driven purely by distance traveled
+        // (`DFAST_SKIP_STEP_GROWTH_INTERVAL = 64`, donor parity except
+        // donor uses 256 — see "Donor-deviation audit" in the PR body),
+        // so blocks the strict gate used to bail out of early now scan
+        // through the standard ramp. `block_looks_incompressible_strict`
+        // is still used by `levels/fastest.rs` for the Fastest preset
+        // and by `incompressible.rs` unit tests, so the helper itself
+        // stays.
+        //
+        // Donor outer/inner structure (`zstd_double_fast.c:167-322`):
+        //   * outer `while(1)` runs once per match-found-and-stored;
+        //   * inner `do { ... } while (ip1 <= ilimit)` carries `hl0`,
+        //     `idxl0` between iterations, and precomputes `hl1` so the
+        //     next iter's long hash is reused (one long-hash compute per
+        //     two positions scanned, not per position).
+        // We mirror that here. Per-frame invariants are hoisted before
+        // the outer loop; mutable state (`position_base`, `history_abs_start`)
+        // is re-snapshotted inside the outer loop because `emit_candidate`
+        // → `insert_positions` → `ensure_room_for` can advance the rebase
+        // base mid-frame.
+        //
+        // Rebase BEFORE any inner-loop slot pack. The hot loop computes
+        // `packed_curr = (abs_ip0 - position_base) as u32 + 1` and writes
+        // it straight into the hash tables, bypassing `insert_position`
+        // (which is where `ensure_room_for` normally fires). On a long
+        // stream of all-miss / non-hashable blocks the matcher can advance
+        // `current_abs_start` arbitrarily far without any per-byte insert,
+        // so `position_base` may be stale by `> u32::MAX`. The first
+        // fast-loop block after that would silently truncate `packed_curr`
+        // and poison both hash tables. The guard band in `ensure_room_for`
+        // (`DFAST_REBASE_GUARD_BAND`) covers the entire current block's
+        // worth of positions, so a single call at function entry suffices.
+        //
+        // Raw-pointer aliasing invariant. The inner loop caches
+        // `short_hash_ptr = self.short_hash.as_mut_ptr()` and
+        // `long_hash_ptr = self.long_hash.as_mut_ptr()` (and a
+        // `history_base_ptr` for the byte-buffer reads), then over
+        // the rest of the function does:
+        //
+        //   * raw writes / reads via `*short_hash_ptr.add(...) =
+        //     packed_curr`, `*long_hash_ptr.add(...) = packed_curr`,
+        //     and `*long_hash_ptr.add(hl1_idx)`;
+        //   * `&self`-shared reads of `self.offset_hist[0]` for the
+        //     rep1 peek;
+        //   * `&self`-shared slice reads of `self.history` (`let
+        //     concat = &self.history[history_start_offset..]`) for
+        //     `extend_backwards_shared` invocations.
+        //
+        // This is sound under both Stacked Borrows and Tree Borrows
+        // because the three fields touched (`short_hash`, `long_hash`,
+        // `history`, `offset_hist`) are physically disjoint
+        // allocations — `Vec<u32>`/`Vec<u8>` each own their own heap
+        // buffer, and `offset_hist` is an inline `[u32; 3]` inside
+        // the struct. A raw pointer derived from one field's Vec data
+        // has provenance over that field only, so the subsequent
+        // `&self` reads of sibling fields don't reborrow through the
+        // raw pointer's provenance tree.
+        //
+        // CRITICAL: this invariant must be preserved by any future
+        // refactor that adds method calls inside the loop body. A
+        // call that takes `&mut self` (e.g.,
+        // `self.ensure_hash_tables()`, `self.insert_position(...)`)
+        // would reborrow the tables and invalidate the cached raw
+        // pointers — every such call must happen OUTSIDE the
+        // `'outer: loop` (as `ensure_room_for` does, hoisted to the
+        // function preamble above) or be followed by a fresh
+        // `as_mut_ptr()` reload of both `short_hash_ptr` /
+        // `long_hash_ptr`. The outer-loop body already re-snapshots
+        // `history_base_ptr` per iteration for exactly this reason
+        // (`emit_candidate` → `insert_positions` may grow `history`
+        // and trigger a realloc), so the same re-snapshot discipline
+        // applies to the hash-table pointers.
+        //
+        // `current_len > 0` is a precondition of this helper: every
+        // caller (`start_matching`, `skip_matching`, `skip_matching_dense`)
+        // returns early at `current_len == 0`. Encode it as a hard
+        // assert so a future caller that forgets the gate fails loudly
+        // in debug rather than wrap-underflowing the `- 1` below; the
+        // rebase is unconditional in release builds since the
+        // precondition holds.
+        debug_assert!(current_len > 0, "fast_loop precondition: current_len > 0");
+        self.ensure_room_for(current_abs_start + current_len - 1);
+        const PRIME: u64 = 0xCF1BBCDCB7A56463_u64;
+        let short_shift = 64 - self.short_hash_bits;
+        let long_shift = 64 - self.long_hash_bits;
         let mut pos = 1usize;
         let mut literals_start = 0usize;
-        let mut skip_step = 1usize;
-        let mut next_skip_growth_pos = DFAST_SKIP_STEP_GROWTH_INTERVAL;
-        let mut miss_run = 0usize;
-        // Loop invariants (two distinct bounds):
-        //
-        // 1. Block-local arithmetic (`ip0..ip3 = pos + N` for small
-        //    `N`, `pos + skip_step` with `skip_step <=
-        //    DFAST_MAX_SKIP_STEP`): the dynamic bound is `current_len`
-        //    itself — the `while pos + DFAST_MIN_MATCH_LEN <=
-        //    current_len` guard keeps every offset within that bound
-        //    regardless of how the caller sized `max_window_size`.
-        //    In production this also happens to be `≤
-        //    HC_BLOCKSIZE_MAX (128 KiB)` because the frame compressor
-        //    never hands out larger blocks, but the safety argument
-        //    above does not rely on that limit.
-        // 2. Absolute-position arithmetic (`current_abs_start + ip0`,
-        //    `current_abs_start + ip2`, etc.): `current_abs_start` is
-        //    the frame-lifetime cursor and advances with total bytes
-        //    processed, NOT with the retained window size. A long
-        //    streaming encode on i686 can push `current_abs_start +
-        //    ipN` past `usize::MAX` even though memory stays bounded
-        //    by `window_size`. The runtime enforcement lives in
-        //    `DfastMatchGenerator::add_data` via
-        //    `check_stream_abs_headroom`: every ingest fails fast if
-        //    cumulative input would advance the cursor within
-        //    `STREAM_ABS_HEADROOM` of `usize::MAX`, which keeps the
-        //    raw `current_abs_start + ipN` arithmetic below
-        //    `usize::MAX` for every position
-        //    this loop ever observes.
-        while pos + DFAST_MIN_MATCH_LEN <= current_len {
-            let ip0 = pos;
-            let ip1 = ip0 + 1;
-            let ip2 = ip0 + 2;
-            let ip3 = ip0 + 3;
-
-            let abs_ip0 = current_abs_start + ip0;
-            let lit_len_ip0 = ip0 - literals_start;
-
-            if ip2 + DFAST_MIN_MATCH_LEN <= current_len {
-                let abs_ip2 = current_abs_start + ip2;
-                let lit_len_ip2 = ip2 - literals_start;
-                if let Some(rep) = self.repcode_candidate(abs_ip2, lit_len_ip2)
-                    && rep.start >= current_abs_start + literals_start
-                    && rep.start <= abs_ip2
+
+        'outer: loop {
+            // Outer-iter precondition: at least `HASH_READ_SIZE = 8` bytes
+            // ahead of `pos` so the unconditional 8-byte `u64` load below
+            // is in-bounds for the live history buffer. `DFAST_MIN_MATCH_LEN
+            // = 5` is the match acceptance threshold and is NOT a safe
+            // load bound — using it here read up to 3 bytes past
+            // `history.len()` on tiny blocks (CI fuzz `interop` crash,
+            // 7-byte input).
+            // NOTE: when this guard fires on the very first outer-iter
+            // (tiny block, `current_len < 9`), we `break 'outer` BEFORE
+            // `tail_seed_anchor` is even declared. The post-loop
+            // `seed_remaining_hashable_starts(.., pos)` then runs with
+            // `pos` at its initial value (1 on first frame, or the
+            // post-match cursor from a previous outer iter); that's the
+            // correct tail seed for tiny blocks — `seed_pos = pos.min(
+            // current_len).min(boundary_tail_start)` plus the
+            // `seed_pos + DFAST_SHORT_HASH_LOOKAHEAD <= current_len`
+            // guard inside the seeder keeps every insert in-bounds.
+            if pos + HASH_READ_SIZE > current_len {
+                break 'outer;
+            }
+            let mut step = 1usize;
+            let mut next_step_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL;
+            let mut ip0 = pos;
+            let mut ip1 = ip0 + step;
+            // Same `HASH_READ_SIZE` rationale for `ip1`: the inner loop
+            // pre-loads 8 bytes at `concat_idx1` for the `hl1` precompute
+            // and the `_search_next_long` retry, so `ip1 + 8 <= current_len`
+            // must hold before we enter. If `ip0` is still hashable but
+            // `ip1` is not (boundary case `pos == current_len - 8`), we
+            // still want to probe `ip0` — skipping it would leave the
+            // last hashable position to `seed_remaining_hashable_starts`,
+            // which inserts but does NOT search, and that drops real
+            // matches in the tail window vs the upstream reference
+            // (whose single-cursor loop probes every position p with
+            // `p + HASH_READ_SIZE <= iend`). Handle the boundary inline
+            // before exiting: a single-cursor probe at `ip0` (rep peek
+            // and `_search_next_long` retry both depend on `ip1` so
+            // they're skipped — donor accepts that exact tradeoff at
+            // the iend boundary).
+            if ip1 + HASH_READ_SIZE > current_len {
+                if let Some(committed) =
+                    self.probe_tail_ip0_only(current_abs_start, current_len, ip0, literals_start)
                 {
                     let start = self.emit_candidate(
                         current_abs_start,
                         &mut literals_start,
-                        rep,
+                        committed,
                         handle_sequence,
                     );
-                    pos = start + rep.match_len;
+                    pos = start + committed.match_len;
                     pos = self.extend_with_repcode_after_match(
                         current_abs_start,
                         current_len,
@@ -445,55 +724,396 @@ impl DfastMatchGenerator {
                         &mut literals_start,
                         handle_sequence,
                     );
-                    skip_step = 1;
-                    next_skip_growth_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL;
-                    miss_run = 0;
-                    continue;
+                    continue 'outer;
                 }
+                break 'outer;
             }
 
-            let best = self.best_match(abs_ip0, lit_len_ip0);
-            if let Some(candidate) = best {
-                let start = self.emit_candidate(
-                    current_abs_start,
-                    &mut literals_start,
-                    candidate,
-                    handle_sequence,
-                );
-                pos = start + candidate.match_len;
-                pos = self.extend_with_repcode_after_match(
-                    current_abs_start,
-                    current_len,
-                    pos,
-                    &mut literals_start,
-                    handle_sequence,
+            // Re-read every per-frame-mutable cursor here — `emit_candidate`
+            // in the previous outer iteration may have triggered a rebase.
+            let history_abs_start = self.history_abs_start;
+            let position_base = self.position_base;
+            let history_start_offset = self.history_start;
+            let history_base_ptr = self.history.as_ptr();
+            let short_hash_ptr = self.short_hash.as_mut_ptr();
+            let long_hash_ptr = self.long_hash.as_mut_ptr();
+            let concat_len = self.history.len() - history_start_offset;
+
+            // Pre-compute long hash at ip0 ONCE per outer iter.
+            // `concat_idx = (current_abs_start + ip0) - history_abs_start`
+            // is the byte offset within `live_history`.
+            let mut hl0_idx;
+            let mut idxl0;
+            // SAFETY: `current_abs_start + ip0 >= history_abs_start`
+            // (`ip` is inside the current block, which is part of live
+            // history). The 8-byte unaligned load on
+            // `concat[concat_idx..]` is safe because the outer loop
+            // guards above enforce `ip0 + HASH_READ_SIZE <= current_len`,
+            // and `current_abs_start + current_len - history_abs_start
+            // <= concat_len` (live history contains the full current
+            // block plus any retained earlier blocks), giving
+            // `concat_idx + 8 <= concat_len`. The `debug_assert!` below
+            // makes that invariant explicit so a future refactor that
+            // touches eviction / `compact_history` / `trim_to_window`
+            // semantics catches a violation in tests instead of leaking
+            // through to ASan UB.
+            unsafe {
+                let concat_idx = (current_abs_start + ip0) - history_abs_start;
+                debug_assert!(
+                    concat_idx + HASH_READ_SIZE <= concat_len,
+                    "fast-loop 8-byte load OOB: concat_idx={} HASH_READ_SIZE={} concat_len={}",
+                    concat_idx,
+                    HASH_READ_SIZE,
+                    concat_len,
                 );
-                skip_step = 1;
-                next_skip_growth_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL;
-                miss_run = 0;
-            } else {
-                self.insert_position(abs_ip0);
-                if ip1 + 4 <= current_len {
-                    self.insert_position(current_abs_start + ip1);
+                let v8 = (history_base_ptr.add(history_start_offset + concat_idx) as *const u64)
+                    .read_unaligned();
+                hl0_idx = (v8.wrapping_mul(PRIME) >> long_shift) as usize;
+                idxl0 = *long_hash_ptr.add(hl0_idx);
+            }
+
+            // Inner-loop exit shape. Every `break 'inner` produces a
+            // value of this type, so the type system enforces the
+            // previously implicit pairing between "did we commit a
+            // match?" and "where does the tail seeder pick up?":
+            //
+            //   * `Committed(c)` — rep1 / long / short(+next_long retry)
+            //     paths chose `c`; outer arm runs emit + rep-extension.
+            //   * `Tail(seed)` — inner ran out of safe scan room at
+            //     `seed = ip0` (first un-scanned position); outer arm
+            //     hands `seed` to `seed_remaining_hashable_starts` so
+            //     the tail seeder does not redundantly re-pack
+            //     positions the fast loop already wrote (which is what
+            //     restarting from outer-entry `pos` would do on a
+            //     miss-only block — throwing away the skip-step win on
+            //     incompressible data).
+            //
+            // Adding a new `break 'inner` variant now forces the
+            // author to pick a `InnerExit::…` variant explicitly; the
+            // previous `Option<MatchCandidate>` + sibling `usize`
+            // pairing relied on a comment-block to flag the coupling.
+            enum InnerExit {
+                Committed(MatchCandidate),
+                Tail(usize),
+            }
+
+            let inner_exit: InnerExit = 'inner: loop {
+                let abs_ip0 = current_abs_start + ip0;
+                let abs_ip1 = current_abs_start + ip1;
+                let lit_len_ip0 = ip0 - literals_start;
+                let lit_len_ip1 = ip1 - literals_start;
+                let packed_curr = ((abs_ip0 - position_base) as u32) + 1;
+
+                let concat_idx0 = abs_ip0 - history_abs_start;
+                let concat_idx1 = abs_ip1 - history_abs_start;
+
+                // Load 8 bytes at ip0 for both short (low 4) and long
+                // probe equality checks. We already used `v8_at_ip0` to
+                // compute hl0/idxl0 in the outer init / previous iter's
+                // carry; reload now (cheap unaligned read) so the
+                // `read_unaligned` is from the same offset the
+                // probe-eq below will use.
+                let v8_0 = unsafe {
+                    (history_base_ptr.add(history_start_offset + concat_idx0) as *const u64)
+                        .read_unaligned()
+                };
+                let v4_0 = v8_0 & 0xFFFF_FFFF;
+                let hs0_idx = (v4_0.wrapping_mul(PRIME) >> short_shift) as usize;
+                let idxs0 = unsafe { *short_hash_ptr.add(hs0_idx) };
+
+                // Donor parity (`zstd_double_fast.c:187`): update BOTH
+                // tables at curr BEFORE checking matches. The benefit is
+                // for hash-table consumers, NOT the rep peek (rep at ip+1
+                // reads `offset_hist[0]`, never the hash tables). The
+                // donor rationale is the long-hash retry path: the next
+                // inner iter's `idxl1` lookup (`hashLong[hl1_idx]`) can
+                // collide with this iter's `hl0_idx`, and writing curr
+                // first means a self-collision still resolves to a real
+                // match instead of the previous occupant. The short
+                // probe of the same iter and the `_search_next_long`
+                // retry at ip+1 are the other consumers that see the
+                // fresh write.
+                unsafe {
+                    *long_hash_ptr.add(hl0_idx) = packed_curr;
+                    *short_hash_ptr.add(hs0_idx) = packed_curr;
                 }
-                if ip2 + 4 <= current_len {
-                    self.insert_position(current_abs_start + ip2);
+
+                // Donor parity (`zstd_double_fast.c:190`): inline rep1
+                // peek at ip+1, 4-byte gate. Donor's hot path checks ONLY
+                // `offset_1` here (full 3-rep walk lives in lazy/btopt).
+                // Since the peek is at `ip+1` with `pos >= literals_start`,
+                // `lit_len_ip1 >= 1`, so `offset_hist[0]` is the donor's
+                // `offset_1`. The `repcode_candidate_shared` helper we used
+                // before walked all three offsets + did a full SIMD
+                // `common_prefix_len` per probe, paying ~3× the work for
+                // rep2/rep3 hits that the dfast fast path never benefits
+                // from (those wins live in the lazy/btopt strategies).
+                let rep1 = self.offset_hist[0] as usize;
+                if rep1 != 0 && rep1 <= abs_ip1 {
+                    let cand_pos_r = abs_ip1 - rep1;
+                    if cand_pos_r >= history_abs_start
+                        && cand_pos_r >= current_abs_start + literals_start
+                    {
+                        let cand_idx_r = cand_pos_r - history_abs_start;
+                        // 4-byte gate; full forward count only if it passes.
+                        let cand4 = unsafe {
+                            (history_base_ptr.add(history_start_offset + cand_idx_r) as *const u32)
+                                .read_unaligned()
+                        };
+                        let cur4 = unsafe {
+                            (history_base_ptr.add(history_start_offset + concat_idx1) as *const u32)
+                                .read_unaligned()
+                        };
+                        if cand4 == cur4 {
+                            let mut match_len = 4usize;
+                            let max_fwd = concat_len.saturating_sub(concat_idx1 + 4);
+                            unsafe {
+                                let lhs =
+                                    history_base_ptr.add(history_start_offset + cand_idx_r + 4);
+                                let rhs =
+                                    history_base_ptr.add(history_start_offset + concat_idx1 + 4);
+                                let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr(
+                                    lhs, rhs, max_fwd,
+                                );
+                                match_len += ext;
+                            }
+                            // Rep extensions use the 4-byte
+                            // `DFAST_REP_MIN_MATCH_LEN` floor, NOT the
+                            // hash-search `DFAST_MIN_MATCH_LEN = 5`. Rep
+                            // coding has no on-wire offset cost, so the
+                            // upstream reference accepts 4-byte rep
+                            // hits; gating at 5 here would silently drop
+                            // every 4-byte rep1 the upstream encoder
+                            // produces, and leave the post-match
+                            // `extend_with_repcode_after_match` chain
+                            // (which uses 4) inconsistent with the peek.
+                            if match_len >= DFAST_REP_MIN_MATCH_LEN {
+                                let concat = &self.history[history_start_offset..];
+                                let rep_cand = extend_backwards_shared(
+                                    concat,
+                                    history_abs_start,
+                                    cand_pos_r,
+                                    abs_ip1,
+                                    match_len,
+                                    lit_len_ip1,
+                                );
+                                break 'inner InnerExit::Committed(rep_cand);
+                            }
+                        }
+                    }
                 }
-                if ip3 + 4 <= current_len {
-                    self.insert_position(current_abs_start + ip3);
+
+                // Precompute hl1 (donor `_search_next_long` carry, line 197).
+                let v8_1 = unsafe {
+                    (history_base_ptr.add(history_start_offset + concat_idx1) as *const u64)
+                        .read_unaligned()
+                };
+                let hl1_idx = (v8_1.wrapping_mul(PRIME) >> long_shift) as usize;
+
+                // Long match check at ip0 with idxl0. 8-byte equality
+                // gate (`MEM_read64`) — if it passes, candidate is real.
+                if idxl0 != DFAST_EMPTY_SLOT {
+                    let cand_pos = position_base + (idxl0 as usize) - 1;
+                    if cand_pos >= history_abs_start && cand_pos < abs_ip0 {
+                        let cand_idx = cand_pos - history_abs_start;
+                        // SAFETY: same buffer/length bounds as v8_0 above.
+                        let cand_v8 = unsafe {
+                            (history_base_ptr.add(history_start_offset + cand_idx) as *const u64)
+                                .read_unaligned()
+                        };
+                        if cand_v8 == v8_0 {
+                            // 8 bytes match; count forward + extend back.
+                            let mut match_len = 8usize;
+                            let max_fwd = concat_len.saturating_sub(concat_idx0 + 8);
+                            // SAFETY: both ptrs at the same buffer; offsets
+                            // verified above. `max_fwd` caps the scan to
+                            // the live region.
+                            unsafe {
+                                let lhs = history_base_ptr.add(history_start_offset + cand_idx + 8);
+                                let rhs =
+                                    history_base_ptr.add(history_start_offset + concat_idx0 + 8);
+                                let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr(
+                                    lhs, rhs, max_fwd,
+                                );
+                                match_len += ext;
+                            }
+                            let concat = &self.history[history_start_offset..];
+                            let cand = extend_backwards_shared(
+                                concat,
+                                history_abs_start,
+                                cand_pos,
+                                abs_ip0,
+                                match_len,
+                                lit_len_ip0,
+                            );
+                            break 'inner InnerExit::Committed(cand);
+                        }
+                    }
                 }
-                miss_run += 1;
-                if block_is_strict_incompressible || miss_run >= DFAST_LOCAL_SKIP_TRIGGER {
-                    let skip_cap = DFAST_MAX_SKIP_STEP;
-                    if pos >= next_skip_growth_pos {
-                        skip_step = (skip_step + 1).min(skip_cap);
-                        next_skip_growth_pos += DFAST_SKIP_STEP_GROWTH_INTERVAL;
+
+                let idxl1 = unsafe { *long_hash_ptr.add(hl1_idx) };
+
+                // Short match check at ip0 with idxs0 — 4-byte gate
+                // ONLY (donor `zstd_double_fast.c:220`). Forward count
+                // and `_search_next_long` retry happen ONLY on hit.
+                if idxs0 != DFAST_EMPTY_SLOT {
+                    let cand_pos_s = position_base + (idxs0 as usize) - 1;
+                    if cand_pos_s >= history_abs_start && cand_pos_s < abs_ip0 {
+                        let cand_idx_s = cand_pos_s - history_abs_start;
+                        let cand4 = unsafe {
+                            (history_base_ptr.add(history_start_offset + cand_idx_s) as *const u32)
+                                .read_unaligned()
+                        };
+                        if cand4 == v4_0 as u32 {
+                            // Short hit: count forward from byte 4 onwards.
+                            let mut s_match_len = 4usize;
+                            let max_fwd = concat_len.saturating_sub(concat_idx0 + 4);
+                            unsafe {
+                                let lhs =
+                                    history_base_ptr.add(history_start_offset + cand_idx_s + 4);
+                                let rhs =
+                                    history_base_ptr.add(history_start_offset + concat_idx0 + 4);
+                                let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr(
+                                    lhs, rhs, max_fwd,
+                                );
+                                s_match_len += ext;
+                            }
+                            let concat = &self.history[history_start_offset..];
+                            let short_cand = extend_backwards_shared(
+                                concat,
+                                history_abs_start,
+                                cand_pos_s,
+                                abs_ip0,
+                                s_match_len,
+                                lit_len_ip0,
+                            );
+
+                            // Enforce the hash-search floor BEFORE
+                            // committing this short hit. The 4-byte
+                            // equality gate plus forward-count extension
+                            // can produce `short_cand.match_len < 5`
+                            // (the literal 4-byte hit, no extension).
+                            // `probe_slot_match` rejects below-floor
+                            // long-hash hits at the same gate; the
+                            // short-hash path must do the same so the
+                            // fast loop never emits a sub-floor non-rep
+                            // match. The retry below can still upgrade
+                            // to a long hit (which has its own 8-byte
+                            // floor, comfortably above `DFAST_MIN_MATCH_LEN`).
+                            let short_hit_valid = short_cand.match_len >= DFAST_MIN_MATCH_LEN;
+
+                            // Donor `_search_next_long` retry (line 260):
+                            // try long match at ip1 with precomputed idxl1.
+                            // If it produces a strictly longer match, use it.
+                            let mut chosen = short_cand;
+                            let mut retry_upgraded = false;
+                            if idxl1 != DFAST_EMPTY_SLOT {
+                                let cand_pos_l1 = position_base + (idxl1 as usize) - 1;
+                                if cand_pos_l1 >= history_abs_start && cand_pos_l1 < abs_ip1 {
+                                    let cand_idx_l1 = cand_pos_l1 - history_abs_start;
+                                    let cand_v8_l1 = unsafe {
+                                        (history_base_ptr.add(history_start_offset + cand_idx_l1)
+                                            as *const u64)
+                                            .read_unaligned()
+                                    };
+                                    if cand_v8_l1 == v8_1 {
+                                        let mut l1_match_len = 8usize;
+                                        let max_fwd_l1 = concat_len.saturating_sub(concat_idx1 + 8);
+                                        unsafe {
+                                            let lhs = history_base_ptr
+                                                .add(history_start_offset + cand_idx_l1 + 8);
+                                            let rhs = history_base_ptr
+                                                .add(history_start_offset + concat_idx1 + 8);
+                                            let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr(
+                                                lhs, rhs, max_fwd_l1,
+                                            );
+                                            l1_match_len += ext;
+                                        }
+                                        if l1_match_len > short_cand.match_len {
+                                            let concat = &self.history[history_start_offset..];
+                                            chosen = extend_backwards_shared(
+                                                concat,
+                                                history_abs_start,
+                                                cand_pos_l1,
+                                                abs_ip1,
+                                                l1_match_len,
+                                                lit_len_ip1,
+                                            );
+                                            // Long-hash hits start at 8
+                                            // bytes (`MEM_read64` gate
+                                            // above), well above
+                                            // `DFAST_MIN_MATCH_LEN = 5`,
+                                            // so the retry upgrade is
+                                            // always valid even when
+                                            // the raw short hit was
+                                            // below the floor.
+                                            retry_upgraded = true;
+                                        }
+                                    }
+                                }
+                            }
+                            if short_hit_valid || retry_upgraded {
+                                break 'inner InnerExit::Committed(chosen);
+                            }
+                            // Below-floor short hit with no retry
+                            // upgrade — fall through to the step bump
+                            // and keep scanning. Discarding here is
+                            // correctness-relevant: emitting a 4-byte
+                            // non-rep match would mint an offset on
+                            // wire that costs more than the 4-byte
+                            // payload buys (offsets at level 2/3 dfast
+                            // are 13–17 bits depending on offset class
+                            // and prior offset_hist state — strictly
+                            // worse than emitting the 4 bytes as
+                            // literals).
+                        }
                     }
-                    pos += skip_step;
-                } else {
-                    skip_step = 1;
-                    next_skip_growth_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL;
-                    pos += 1;
+                }
+
+                // Step bump on distance (donor `zstd_double_fast.c:224-228`).
+                if ip1 >= next_step_pos {
+                    step = (step + 1).min(DFAST_MAX_SKIP_STEP);
+                    next_step_pos += DFAST_SKIP_STEP_GROWTH_INTERVAL;
+                }
+
+                // Advance: ip0 = ip1; ip1 += step; carry hl1 → hl0 / idxl1 → idxl0.
+                ip0 = ip1;
+                ip1 += step;
+                hl0_idx = hl1_idx;
+                idxl0 = idxl1;
+                if ip1 + HASH_READ_SIZE > current_len {
+                    // First position the fast loop did NOT pack into the
+                    // hash tables. `seed_remaining_hashable_starts` will
+                    // pick up from `ip0` instead of restarting at the
+                    // outer-entry `pos`.
+                    break 'inner InnerExit::Tail(ip0);
+                }
+            };
+
+            match inner_exit {
+                InnerExit::Committed(candidate) => {
+                    let start = self.emit_candidate(
+                        current_abs_start,
+                        &mut literals_start,
+                        candidate,
+                        handle_sequence,
+                    );
+                    pos = start + candidate.match_len;
+                    pos = self.extend_with_repcode_after_match(
+                        current_abs_start,
+                        current_len,
+                        pos,
+                        &mut literals_start,
+                        handle_sequence,
+                    );
+                }
+                InnerExit::Tail(seed) => {
+                    // Inner loop ran out of safe scan room without
+                    // committing. Hand the tail seeder the first
+                    // un-scanned position so it does not redundantly
+                    // re-pack everything the fast loop already wrote.
+                    pos = seed;
+                    break 'outer;
                 }
             }
         }
@@ -502,6 +1122,160 @@ impl DfastMatchGenerator {
         self.emit_trailing_literals(literals_start, handle_sequence);
     }
 
+    /// Single-cursor probe at the last hashable position in the
+    /// current block. Called from `start_matching_fast_loop` only when
+    /// the outer iteration sees `ip0` still hashable but `ip1` past
+    /// the end — the upstream reference's single-cursor loop probes
+    /// every position with `p + HASH_READ_SIZE <= iend`, and skipping
+    /// `ip0` here would leave a real match unsearched.
+    ///
+    /// Mirror the inner loop's long+short probe and the table update
+    /// at `ip0`, but skip the rep1 peek (reads at `ip1`) and the
+    /// `_search_next_long` retry (reads at `ip1`) — both depend on a
+    /// hashable `ip1`. Upstream accepts this exact tradeoff at the
+    /// `iend` boundary: the rep peek and retry only ever fire when a
+    /// second hashable position exists.
+    ///
+    /// Returns `Some(MatchCandidate)` if a long or short hit at `ip0`
+    /// meets the `DFAST_MIN_MATCH_LEN` floor, `None` otherwise (caller
+    /// then breaks the outer loop). On a hit, the hash tables are NOT
+    /// updated by this helper — the caller routes through
+    /// `emit_candidate` which inserts via `insert_positions` over the
+    /// emitted range, exactly like the inner loop's hit path.
+    fn probe_tail_ip0_only(
+        &self,
+        current_abs_start: usize,
+        current_len: usize,
+        ip0: usize,
+        literals_start: usize,
+    ) -> Option<MatchCandidate> {
+        debug_assert!(ip0 + HASH_READ_SIZE <= current_len);
+        const PRIME: u64 = 0xCF1BBCDCB7A56463_u64;
+        let short_shift = 64 - self.short_hash_bits;
+        let long_shift = 64 - self.long_hash_bits;
+
+        let abs_ip0 = current_abs_start + ip0;
+        let lit_len_ip0 = ip0 - literals_start;
+        let history_abs_start = self.history_abs_start;
+        let position_base = self.position_base;
+        let history_start_offset = self.history_start;
+        let concat_len = self.history.len() - history_start_offset;
+        let history_base_ptr = self.history.as_ptr();
+
+        let concat_idx0 = abs_ip0 - history_abs_start;
+
+        // SAFETY: `concat_idx0 + 8 <= concat_len` follows from the
+        // caller's `ip0 + HASH_READ_SIZE <= current_len` precondition
+        // (the live history contains the full current block).
+        let v8_0 = unsafe {
+            (history_base_ptr.add(history_start_offset + concat_idx0) as *const u64)
+                .read_unaligned()
+        };
+        let v4_0 = v8_0 & 0xFFFF_FFFF;
+        let hl0_idx = (v8_0.wrapping_mul(PRIME) >> long_shift) as usize;
+        let hs0_idx = (v4_0.wrapping_mul(PRIME) >> short_shift) as usize;
+
+        // Read-only on the hash tables here — unlike the inner loop's
+        // "update-before-check" pattern, the writes at `hl0_idx` /
+        // `hs0_idx` would be dead in this helper:
+        //
+        //   * On a hit, the caller routes through `emit_candidate`
+        //     which insert-positions the entire emitted range, then
+        //     either advances `pos` past `current_len - HASH_READ_SIZE`
+        //     and the outer guard breaks (so a future iter never
+        //     re-uses these slots), or `continue 'outer` re-enters
+        //     with `pos = start + match_len ≥ current_len - 3`, which
+        //     fails the outer-entry `pos + HASH_READ_SIZE > current_len`
+        //     guard immediately. Either way, no second probe sees the
+        //     write.
+        //   * On no hit, the caller `break 'outer`s directly. Same
+        //     conclusion.
+        //   * `seed_remaining_hashable_starts` inserts `ip0` itself
+        //     during the post-loop tail seed pass, so even the "fresh
+        //     entry for next block" rationale doesn't justify writing
+        //     here — the seeder does that.
+        //
+        // Skipping the writes also removes a small amount of cache
+        // dirtying on the tail boundary and keeps `probe_tail_ip0_only`
+        // strictly cheaper than a full inner-loop iter.
+        let idxl0 = unsafe { *self.long_hash.as_ptr().add(hl0_idx) };
+        let idxs0 = unsafe { *self.short_hash.as_ptr().add(hs0_idx) };
+
+        // Long-hash probe first (upstream priority: an 8-byte hit
+        // beats a 4-byte hit even before extension).
+        if idxl0 != DFAST_EMPTY_SLOT {
+            let cand_pos = position_base + (idxl0 as usize) - 1;
+            if cand_pos >= history_abs_start && cand_pos < abs_ip0 {
+                let cand_idx = cand_pos - history_abs_start;
+                let cand_v8 = unsafe {
+                    (history_base_ptr.add(history_start_offset + cand_idx) as *const u64)
+                        .read_unaligned()
+                };
+                if cand_v8 == v8_0 {
+                    let mut match_len = 8usize;
+                    let max_fwd = concat_len.saturating_sub(concat_idx0 + 8);
+                    unsafe {
+                        let lhs = history_base_ptr.add(history_start_offset + cand_idx + 8);
+                        let rhs = history_base_ptr.add(history_start_offset + concat_idx0 + 8);
+                        let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr(
+                            lhs, rhs, max_fwd,
+                        );
+                        match_len += ext;
+                    }
+                    let concat = &self.history[history_start_offset..];
+                    return Some(extend_backwards_shared(
+                        concat,
+                        history_abs_start,
+                        cand_pos,
+                        abs_ip0,
+                        match_len,
+                        lit_len_ip0,
+                    ));
+                }
+            }
+        }
+
+        // Short-hash probe (4-byte gate, forward extension, same
+        // floor-enforcement as the inner loop's short path — see
+        // comment there).
+        if idxs0 != DFAST_EMPTY_SLOT {
+            let cand_pos_s = position_base + (idxs0 as usize) - 1;
+            if cand_pos_s >= history_abs_start && cand_pos_s < abs_ip0 {
+                let cand_idx_s = cand_pos_s - history_abs_start;
+                let cand4 = unsafe {
+                    (history_base_ptr.add(history_start_offset + cand_idx_s) as *const u32)
+                        .read_unaligned()
+                };
+                if cand4 == v4_0 as u32 {
+                    let mut s_match_len = 4usize;
+                    let max_fwd = concat_len.saturating_sub(concat_idx0 + 4);
+                    unsafe {
+                        let lhs = history_base_ptr.add(history_start_offset + cand_idx_s + 4);
+                        let rhs = history_base_ptr.add(history_start_offset + concat_idx0 + 4);
+                        let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr(
+                            lhs, rhs, max_fwd,
+                        );
+                        s_match_len += ext;
+                    }
+                    let concat = &self.history[history_start_offset..];
+                    let short_cand = extend_backwards_shared(
+                        concat,
+                        history_abs_start,
+                        cand_pos_s,
+                        abs_ip0,
+                        s_match_len,
+                        lit_len_ip0,
+                    );
+                    if short_cand.match_len >= DFAST_MIN_MATCH_LEN {
+                        return Some(short_cand);
+                    }
+                }
+            }
+        }
+
+        None
+    }
+
     /// Donor `zstd_double_fast.c` post-match rep-0 extension. After the
     /// primary match has been emitted and `pos` advanced past it, donor
     /// opportunistically chains additional `rep_2`-coded matches at the
@@ -517,7 +1291,7 @@ impl DfastMatchGenerator {
     ///   * skips the hash-table probe entirely on every extra match.
     ///
     /// Critically uses donor's `MINMATCH = 4` here rather than the
-    /// stricter `DFAST_MIN_MATCH_LEN = 6` enforced on the main search
+    /// `DFAST_MIN_MATCH_LEN = 5` enforced on the main search
     /// loop. The donor accepts any 4-byte rep extension; we mirror that
     /// because the rep emission carries no offset cost — even a 4-byte
     /// rep is a net win over re-running the full hash search. Returns
@@ -531,10 +1305,9 @@ impl DfastMatchGenerator {
         literals_start: &mut usize,
         handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
     ) -> usize {
-        const DONOR_REP_MIN_MATCH_LEN: usize = 4;
         loop {
-            // Need at least DONOR_REP_MIN_MATCH_LEN bytes of room past `pos`.
-            if pos + DONOR_REP_MIN_MATCH_LEN > current_len {
+            // Need at least DFAST_REP_MIN_MATCH_LEN bytes of room past `pos`.
+            if pos + DFAST_REP_MIN_MATCH_LEN > current_len {
                 break;
             }
             // After a primary emit `literals_start == pos`, so `lit_len`
@@ -561,7 +1334,7 @@ impl DfastMatchGenerator {
                 None => break,
             };
             let concat = &self.history[self.history_start..];
-            if cur_idx + DONOR_REP_MIN_MATCH_LEN > concat.len() {
+            if cur_idx + DFAST_REP_MIN_MATCH_LEN > concat.len() {
                 break;
             }
             // Cheap 4-byte gate before the SIMD `common_prefix_len`.
@@ -569,12 +1342,61 @@ impl DfastMatchGenerator {
                 break;
             }
             let match_len = common_prefix_len(&concat[cand_idx..], &concat[cur_idx..]);
-            if match_len < DONOR_REP_MIN_MATCH_LEN {
+            if match_len < DFAST_REP_MIN_MATCH_LEN {
                 break;
             }
-            // Insert the rep range into hash tables so future positions
-            // hashing into this area find these candidates.
-            self.insert_positions(abs_pos, abs_pos + match_len);
+            // Sparse complementary insertion (upstream parity,
+            // `zstd_double_fast.c:300-304`): upstream inserts ONLY at
+            // `curr+2`, `ip-2`, `ip-1` after a match — three specific
+            // positions, not the whole match range. The previous
+            // `insert_positions(abs_pos, abs_pos + match_len)` made
+            // sense only under the 4-slot bucket; with single-slot
+            // donor parity it would just overwrite every bucket along
+            // the match span and discard whichever positions the
+            // producer was about to re-probe.
+            //
+            // At the floor `match_len == DFAST_REP_MIN_MATCH_LEN (= 4)`
+            // the three targets collapse to two distinct positions.
+            // With `post_match_end = abs_pos + 4` the three offsets
+            // resolve to:
+            //   `curr + 2` = abs_pos + 2
+            //   `ip - 2`   = abs_pos + 4 - 2 = abs_pos + 2  ← same as curr+2
+            //   `ip - 1`   = abs_pos + 4 - 1 = abs_pos + 3  ← distinct
+            // So `curr+2` and `ip-2` write the same slot twice;
+            // single-slot overwrite is idempotent, so the duplicate
+            // write is correctness-neutral. It's one wasted store on
+            // the shortest rep extension and not worth a branch to
+            // dedup. For `match_len >= 5` all three offsets are
+            // distinct.
+            //
+            // Why `abs_pos` itself is NOT in the insert set, despite
+            // not being written by the fast loop's pre-check insert
+            // (the previous range form `insert_positions(abs_pos,
+            // post_match_end)` did include it): upstream
+            // `ZSTD_compressBlock_doubleFast_*` likewise does not
+            // insert at the rep-extension start. After a rep match,
+            // upstream just advances `ip` and reruns the outer fast
+            // loop, which writes the new `curr` (the post-rep cursor)
+            // — never the rep's own start. The three offsets above
+            // are upstream's primary-match-emit insertion pattern,
+            // mirrored here to preserve hit rate on the chains that
+            // follow a rep extension. The hash-state delta vs the
+            // prior range-insert behavior is intentional and tracked
+            // by the sequence-stream comparator harness (the
+            // per-sequence diff vs FFI mentioned in the PR body's
+            // deferred section); the audit signed off on the current
+            // sparse set as the closest faithful mirror of upstream.
+            let post_match_end = abs_pos + match_len;
+            let insert_targets = [
+                abs_pos + 2,                      // curr + 2
+                post_match_end.saturating_sub(2), // ip - 2 (post-match cursor)
+                post_match_end.saturating_sub(1), // ip - 1
+            ];
+            for &target in &insert_targets {
+                if target > abs_pos && target < post_match_end {
+                    self.insert_position(target);
+                }
+            }
             // Emit zero-literal rep sequence.
             handle_sequence(Sequence::Triple {
                 literals: &[],
@@ -613,7 +1435,14 @@ impl DfastMatchGenerator {
             current_abs_start + *literals_start,
             candidate.start + candidate.match_len,
         );
-        let current = self.window.back().unwrap().as_slice();
+        // Inline the trailing-block slice rather than calling
+        // `get_last_space()` so this matches the gate pattern used by
+        // `skip_matching` / `start_matching` (read `window_blocks.back()`
+        // with `unwrap_or(0)`). `emit_candidate` runs only after a
+        // successful match was found in the active block, so
+        // `last_len > 0` is a structural precondition.
+        let last_len = self.window_blocks.back().copied().unwrap_or(0);
+        let current = &self.history[self.history.len() - last_len..];
         let start = candidate.start - current_abs_start;
         let literals = &current[*literals_start..start];
         handle_sequence(Sequence::Triple {
@@ -635,8 +1464,13 @@ impl DfastMatchGenerator {
         literals_start: usize,
         handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
     ) {
-        if literals_start < self.window.back().unwrap().len() {
-            let current = self.window.back().unwrap().as_slice();
+        let last_len = self.window_blocks.back().copied().unwrap_or(0);
+        if literals_start < last_len {
+            // Inline rather than calling `get_last_space()` so the gate
+            // pattern matches `skip_matching` / `start_matching` /
+            // `emit_candidate`. `last_len > 0` is already established by
+            // the `literals_start < last_len` check above.
+            let current = &self.history[self.history.len() - last_len..];
             handle_sequence(Sequence::Literals {
                 literals: &current[literals_start..],
             });
@@ -644,13 +1478,16 @@ impl DfastMatchGenerator {
     }
 
     pub(crate) fn ensure_hash_tables(&mut self) {
-        let table_len = 1usize << self.hash_bits;
-        if self.short_hash.len() != table_len {
-            // This is intentionally lazy so Fastest/Uncompressed never pay the
-            // ~dfast-level memory cost. The current size tracks the issue's
-            // zstd level-3 style parameters rather than a generic low-memory preset.
-            self.short_hash = alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; table_len];
-            self.long_hash = alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; table_len];
+        // Independent sizing per donor `clevels.h`: long-hash =
+        // `hashLog`, short-hash = `chainLog`. Lazy allocation so
+        // Fastest/Uncompressed never pay the dfast-level memory cost.
+        let long_len = 1usize << self.long_hash_bits;
+        let short_len = 1usize << self.short_hash_bits;
+        if self.long_hash.len() != long_len {
+            self.long_hash = alloc::vec![DFAST_EMPTY_SLOT; long_len];
+        }
+        if self.short_hash.len() != short_len {
+            self.short_hash = alloc::vec![DFAST_EMPTY_SLOT; short_len];
         }
     }
 
@@ -674,6 +1511,7 @@ impl DfastMatchGenerator {
         self.history_abs_start + self.live_history().len()
     }
 
+    #[inline(always)]
     pub(crate) fn best_match(&self, abs_pos: usize, lit_len: usize) -> Option<MatchCandidate> {
         let rep = self.repcode_candidate(abs_pos, lit_len);
         let hash = self.hash_candidate(abs_pos, lit_len);
@@ -700,6 +1538,7 @@ impl DfastMatchGenerator {
         )
     }
 
+    #[inline(always)]
     pub(crate) fn repcode_candidate(
         &self,
         abs_pos: usize,
@@ -715,6 +1554,7 @@ impl DfastMatchGenerator {
         )
     }
 
+    #[inline(always)]
     pub(crate) fn hash_candidate(&self, abs_pos: usize, lit_len: usize) -> Option<MatchCandidate> {
         // Hoist all the per-loop invariants out of the combinator chains.
         // `short_candidates`/`long_candidates` each re-fetch `live_history`
@@ -732,56 +1572,82 @@ impl DfastMatchGenerator {
         let position_base = self.position_base;
         let mut best = None;
 
-        // Long-hash probes first (8-byte hash → longer matches more likely).
-        if current_idx + 8 <= concat.len() {
-            let long_hash = self.hash8(&concat[current_idx..]);
-            // SAFETY: `hash_index` masks to `hash_bits` and `long_hash.len()
-            // == 1 << hash_bits` (`ensure_hash_tables`).
+        // Long-hash probe first (8-byte hash → longer matches more
+        // likely). Single-slot per bucket — donor parity, no chain
+        // walking. The retry policy below mirrors donor
+        // `_search_next_long`: if the long-hash misses but the
+        // short-hash hits, peek the long-hash at `abs_pos + 1` and
+        // pick the longer of the two matches.
+        let long_hit = if current_idx + 8 <= concat.len() {
+            let long_hash = self.long_hash_index(&concat[current_idx..]);
+            // SAFETY: `long_hash_index` masks to `long_hash_bits` and
+            // `long_hash.len() == 1 << long_hash_bits` (`ensure_hash_tables`).
             debug_assert!(long_hash < self.long_hash.len());
-            let bucket = unsafe { self.long_hash.get_unchecked(long_hash) };
-            for &slot in bucket {
-                // Inline unpack: slot 0 = empty sentinel, otherwise
-                // `slot - 1 + position_base` is the absolute position.
-                if slot == DFAST_EMPTY_SLOT {
-                    continue;
-                }
-                let candidate_pos = position_base + (slot as usize) - 1;
-                if candidate_pos < history_abs_start || candidate_pos >= abs_pos {
-                    continue;
-                }
-                let candidate_idx = candidate_pos - history_abs_start;
-                let match_len = common_prefix_len(&concat[candidate_idx..], &concat[current_idx..]);
-                if match_len >= DFAST_MIN_MATCH_LEN {
-                    let candidate =
-                        self.extend_backwards(candidate_pos, abs_pos, match_len, lit_len);
-                    best = best_len_offset_candidate(best, Some(candidate));
-                    if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) {
-                        return best;
-                    }
-                }
+            let slot = unsafe { *self.long_hash.get_unchecked(long_hash) };
+            self.probe_slot_match(
+                slot,
+                position_base,
+                history_abs_start,
+                abs_pos,
+                current_idx,
+                concat,
+                lit_len,
+                8, // long-hash equality gate
+            )
+        } else {
+            None
+        };
+        if let Some(cand) = long_hit {
+            best = best_len_offset_candidate(best, Some(cand));
+            if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) {
+                return best;
             }
         }
 
         if current_idx + 4 <= concat.len() {
-            let short_hash = self.hash4(&concat[current_idx..]);
+            let short_hash = self.short_hash_index(&concat[current_idx..]);
             debug_assert!(short_hash < self.short_hash.len());
-            let bucket = unsafe { self.short_hash.get_unchecked(short_hash) };
-            for &slot in bucket {
-                if slot == DFAST_EMPTY_SLOT {
-                    continue;
-                }
-                let candidate_pos = position_base + (slot as usize) - 1;
-                if candidate_pos < history_abs_start || candidate_pos >= abs_pos {
-                    continue;
+            let slot = unsafe { *self.short_hash.get_unchecked(short_hash) };
+            if let Some(short_cand) = self.probe_slot_match(
+                slot,
+                position_base,
+                history_abs_start,
+                abs_pos,
+                current_idx,
+                concat,
+                lit_len,
+                4, // short-hash equality gate
+            ) {
+                best = best_len_offset_candidate(best, Some(short_cand));
+                if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) {
+                    return best;
                 }
-                let candidate_idx = candidate_pos - history_abs_start;
-                let match_len = common_prefix_len(&concat[candidate_idx..], &concat[current_idx..]);
-                if match_len >= DFAST_MIN_MATCH_LEN {
-                    let candidate =
-                        self.extend_backwards(candidate_pos, abs_pos, match_len, lit_len);
-                    best = best_len_offset_candidate(best, Some(candidate));
-                    if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) {
-                        return best;
+                // Donor `_search_next_long` retry: short hit landed but
+                // a long hit at `abs_pos + 1` could be even longer. The
+                // donor inner loop precomputes `hashLong[hl1]` for
+                // exactly this case (line 213 in `zstd_double_fast.c`);
+                // we lift it inline here so the single-slot table
+                // retains the compression-quality donor gets from its
+                // overlapping probe pattern.
+                let next_idx = current_idx + 1;
+                if best.is_none_or(|b| b.match_len < DFAST_TARGET_LEN)
+                    && next_idx + 8 <= concat.len()
+                {
+                    let next_long_hash = self.long_hash_index(&concat[next_idx..]);
+                    debug_assert!(next_long_hash < self.long_hash.len());
+                    let next_slot = unsafe { *self.long_hash.get_unchecked(next_long_hash) };
+                    if let Some(retry) = self.probe_slot_match(
+                        next_slot,
+                        position_base,
+                        history_abs_start,
+                        abs_pos + 1,
+                        next_idx,
+                        concat,
+                        lit_len.saturating_add(1),
+                        8, // long-hash equality gate, `_search_next_long` retry
+                    ) && retry.match_len > short_cand.match_len
+                    {
+                        best = best_len_offset_candidate(best, Some(retry));
                     }
                 }
             }
@@ -789,6 +1655,60 @@ impl DfastMatchGenerator {
         best
     }
 
+    /// Resolve a single packed-slot value against the live history and
+    /// return a backward-extended `MatchCandidate` if the bucket holds
+    /// a valid in-range position whose forward extension reaches at
+    /// least `DFAST_MIN_MATCH_LEN` bytes. Shared between the long-hash
+    /// primary probe, the short-hash primary probe, and the
+    /// `_search_next_long` retry — keeps the bounds-checking logic in
+    /// one place so the three call sites can't drift.
+    #[inline(always)]
+    #[allow(clippy::too_many_arguments)]
+    fn probe_slot_match(
+        &self,
+        slot: u32,
+        position_base: usize,
+        history_abs_start: usize,
+        abs_pos: usize,
+        current_idx: usize,
+        concat: &[u8],
+        lit_len: usize,
+        gate_len: usize,
+    ) -> Option<MatchCandidate> {
+        if slot == DFAST_EMPTY_SLOT {
+            return None;
+        }
+        let candidate_pos = position_base + (slot as usize) - 1;
+        if candidate_pos < history_abs_start || candidate_pos >= abs_pos {
+            return None;
+        }
+        let candidate_idx = candidate_pos - history_abs_start;
+        // Probe-width equality gate before the SIMD walk. The long-hash
+        // callers pass `gate_len = 8` (matches an `MEM_read64`-style
+        // probe in the upstream reference); the short-hash caller
+        // passes `gate_len = 4` (`MEM_read32`-style). A 1-byte
+        // precheck would accept hash collisions whose actual common
+        // prefix runs from 1 byte up to less than the probe width,
+        // paying the full `common_prefix_len` walk for them on every
+        // iteration. The wider gate rejects those collisions early
+        // and matches what the upstream `_search_next_long` path does
+        // after a hit.
+        if candidate_idx + gate_len > concat.len() || current_idx + gate_len > concat.len() {
+            return None;
+        }
+        if concat[candidate_idx..candidate_idx + gate_len]
+            != concat[current_idx..current_idx + gate_len]
+        {
+            return None;
+        }
+        let match_len = common_prefix_len(&concat[candidate_idx..], &concat[current_idx..]);
+        if match_len < DFAST_MIN_MATCH_LEN {
+            return None;
+        }
+        Some(self.extend_backwards(candidate_pos, abs_pos, match_len, lit_len))
+    }
+
+    #[inline(always)]
     fn extend_backwards(
         &self,
         candidate_pos: usize,
@@ -809,8 +1729,91 @@ impl DfastMatchGenerator {
     pub(crate) fn insert_positions(&mut self, start: usize, end: usize) {
         let start = start.max(self.history_abs_start);
         let end = end.min(self.history_abs_end());
-        for pos in start..end {
-            self.insert_position(pos);
+        if start >= end {
+            return;
+        }
+        // Hoist the rebase trigger out of the inner loop: a single
+        // `ensure_room_for(end - 1)` covers every `pack_slot` in the
+        // range. The per-position `ensure_room_for` call inside
+        // `insert_position` would re-check on every byte and the
+        // compiler cannot prove the call is idempotent through
+        // `&mut self`.
+        self.ensure_room_for(end - 1);
+
+        // Snapshot every per-call invariant. `&mut self` blocks the
+        // optimiser from hoisting these loads across the inner loop —
+        // the bodies of `short_hash` / `long_hash` writes mutate
+        // through `self`, so each iteration would otherwise reload
+        // `history.len()`, `history_start`, `position_base`,
+        // `*_hash_bits`. With ~1 input byte per
+        // call on the dfast hot path that re-load shape was the
+        // dominant cost in the per-position cluster.
+        let history_abs_start = self.history_abs_start;
+        let position_base = self.position_base;
+        let history_start = self.history_start;
+        let concat_len = self.history.len() - history_start;
+        let history_base_ptr = self.history.as_ptr();
+        let short_hash_bits = self.short_hash_bits;
+        let long_hash_bits = self.long_hash_bits;
+        let short_hash_ptr = self.short_hash.as_mut_ptr();
+        let long_hash_ptr = self.long_hash.as_mut_ptr();
+        let short_shift = 64 - short_hash_bits;
+        let long_shift = 64 - long_hash_bits;
+
+        // Two contiguous regions in the input range:
+        // * `[start .. long_safe_end)` — every position has at least 8
+        //   bytes of lookahead, so both short and long hashes get
+        //   inserted from a single 8-byte unaligned load.
+        // * `[long_safe_end .. short_safe_end)` — only 4..7 bytes
+        //   remain, so only the short hash gets inserted (4-byte
+        //   load).
+        // Past `short_safe_end` neither hash has enough lookahead and
+        // donor parity is "no insert" — skip entirely.
+        let abs_concat_end = history_abs_start + concat_len;
+        let long_safe_end = abs_concat_end.saturating_sub(7).min(end);
+        let short_safe_end = abs_concat_end.saturating_sub(3).min(end);
+
+        // SAFETY: `history_base_ptr.add(history_start + idx)` is
+        // in-bounds for `idx + 8 <= concat_len`, which the two
+        // `*_safe_end` cutoffs enforce. `short_hash_ptr.add(k)` /
+        // `long_hash_ptr.add(k)` are in-bounds because
+        // `ensure_hash_tables` sizes the two tables to `1 <<
+        // *_hash_bits` and `k = mixed >> (64 - bits)` has at most
+        // `bits` bits set. `position_base` and `history_abs_start`
+        // are constant across the loop after the single `ensure_room_for`
+        // call above. `packed` fits in `u32` by that same gate.
+        let mut pos = start;
+        while pos < long_safe_end {
+            unsafe {
+                let idx = pos - history_abs_start;
+                let packed = ((pos - position_base) as u32) + 1;
+                let load_ptr = history_base_ptr.add(history_start + idx);
+                let v8 = (load_ptr as *const u64).read_unaligned();
+                let v4 = v8 & 0xFFFF_FFFF;
+                // Donor parity (`zstd_compress_internal.h:923-924`):
+                // scalar `* prime8bytes` then shift to high bits. Drops
+                // the CRC32d-based kernel dispatch (3-4 instructions) for
+                // a single mul on the per-byte insert path.
+                let mixed_short = v4.wrapping_mul(0xCF1BBCDCB7A56463_u64);
+                let mixed_long = v8.wrapping_mul(0xCF1BBCDCB7A56463_u64);
+                let short_idx = (mixed_short >> short_shift) as usize;
+                let long_idx = (mixed_long >> long_shift) as usize;
+                *short_hash_ptr.add(short_idx) = packed;
+                *long_hash_ptr.add(long_idx) = packed;
+            }
+            pos += 1;
+        }
+        while pos < short_safe_end {
+            unsafe {
+                let idx = pos - history_abs_start;
+                let packed = ((pos - position_base) as u32) + 1;
+                let load_ptr = history_base_ptr.add(history_start + idx);
+                let v4 = (load_ptr as *const u32).read_unaligned() as u64;
+                let mixed_short = v4.wrapping_mul(0xCF1BBCDCB7A56463_u64);
+                let short_idx = (mixed_short >> short_shift) as usize;
+                *short_hash_ptr.add(short_idx) = packed;
+            }
+            pos += 1;
         }
     }
 
@@ -860,42 +1863,44 @@ impl DfastMatchGenerator {
         // rebase is needed.
         self.ensure_room_for(pos);
         let packed = self.pack_slot(pos);
-        // SAFETY: `hash_index` masks the mixed hash to `hash_bits` bits and
-        // both tables are sized to `1 << hash_bits` in `ensure_hash_tables`,
-        // so every index produced here is provably below the table length.
-        // Eliding the bounds check on this per-byte hot path saves ~4
-        // instructions and one branch per call.
+        // SAFETY: the `*_hash_index` helpers mask the mixed hash to
+        // `long_hash_bits` / `short_hash_bits`, and `ensure_hash_tables`
+        // sizes the two tables to `1 << long_hash_bits` /
+        // `1 << short_hash_bits` respectively, so every produced index
+        // is provably below the table length. Eliding the bounds check
+        // on this per-byte hot path saves ~4 instructions per call.
+        //
+        // Single-slot overwrite (upstream parity): the previous 4-slot
+        // bucket shift (`copy_within(..)`) is gone — upstream
+        // `ZSTD_compressBlock_doubleFast_*` writes a single `U32` per
+        // hash position and relies on the dense `_search_next_long`
+        // retry in `hash_candidate` (via `best_match`) to preserve
+        // compression ratio.
         if idx + 4 <= concat_len {
             let concat = &self.history[self.history_start..];
-            let short = self.hash4(&concat[idx..]);
+            let short = self.short_hash_index(&concat[idx..]);
             debug_assert!(short < self.short_hash.len());
-            let bucket = unsafe { self.short_hash.get_unchecked_mut(short) };
-            if bucket[0] != packed {
-                bucket.copy_within(0..DFAST_SEARCH_DEPTH - 1, 1);
-                bucket[0] = packed;
-            }
+            unsafe { *self.short_hash.get_unchecked_mut(short) = packed };
         }
 
         if idx + 8 <= concat_len {
             let concat = &self.history[self.history_start..];
-            let long = self.hash8(&concat[idx..]);
+            let long = self.long_hash_index(&concat[idx..]);
             debug_assert!(long < self.long_hash.len());
-            let bucket = unsafe { self.long_hash.get_unchecked_mut(long) };
-            if bucket[0] != packed {
-                bucket.copy_within(0..DFAST_SEARCH_DEPTH - 1, 1);
-                bucket[0] = packed;
-            }
+            unsafe { *self.long_hash.get_unchecked_mut(long) = packed };
         }
     }
 
-    pub(crate) fn hash4(&self, data: &[u8]) -> usize {
+    #[inline(always)]
+    pub(crate) fn short_hash_index(&self, data: &[u8]) -> usize {
         let value = u32::from_le_bytes(data[..4].try_into().unwrap()) as u64;
-        self.hash_index(value)
+        self.hash_index_with_bits(value, self.short_hash_bits)
     }
 
-    pub(crate) fn hash8(&self, data: &[u8]) -> usize {
+    #[inline(always)]
+    pub(crate) fn long_hash_index(&self, data: &[u8]) -> usize {
         let value = u64::from_le_bytes(data[..8].try_into().unwrap());
-        self.hash_index(value)
+        self.hash_index_with_bits(value, self.long_hash_bits)
     }
 
     fn block_looks_incompressible(&self, start: usize, end: usize) -> bool {
@@ -912,23 +1917,16 @@ impl DfastMatchGenerator {
         block_looks_incompressible(block)
     }
 
-    fn block_looks_incompressible_strict(&self, start: usize, end: usize) -> bool {
-        let live = self.live_history();
-        if start >= end || start < self.history_abs_start {
-            return false;
-        }
-        let start_idx = start - self.history_abs_start;
-        let end_idx = end - self.history_abs_start;
-        if end_idx > live.len() {
-            return false;
-        }
-        let block = &live[start_idx..end_idx];
-        block_looks_incompressible_strict(block)
-    }
-
-    fn hash_index(&self, value: u64) -> usize {
-        let mixed = crate::encoding::fastpath::hash_mix_u64_with_kernel(self.hash_kernel, value);
-        (mixed >> (64 - self.hash_bits)) as usize
+    #[inline(always)]
+    fn hash_index_with_bits(&self, value: u64, bits: usize) -> usize {
+        // Donor parity (`zstd_compress_internal.h:923-924`, `ZSTD_hash8`):
+        // a single 64-bit multiply by `prime8bytes` followed by a high-bits
+        // shift. Drops the CRC32d + rotate + mul kernel dispatch the rest
+        // of the crate uses — for dfast the donor's scalar hash is
+        // distribution-equivalent and one instruction shorter on the hot
+        // path.
+        let mixed = value.wrapping_mul(0xCF1BBCDCB7A56463_u64);
+        (mixed >> (64 - bits)) as usize
     }
 }
 
@@ -1125,8 +2123,8 @@ mod extend_with_repcode_tests {
     }
 
     /// The helper accepts 4-byte rep extensions (donor `MINMATCH = 4`),
-    /// not the main-loop `DFAST_MIN_MATCH_LEN = 6` floor. A regression
-    /// back to 6 would still pass the constant-run / cross-block tests
+    /// not the main-loop `DFAST_MIN_MATCH_LEN = 5` floor. A regression
+    /// back above 4 would still pass the constant-run / cross-block tests
     /// above (their rep matches extend much further), so this fixture
     /// is built so the rep matches EXACTLY 4 bytes before terminating:
     /// the byte at `pos + 4` differs from the byte at `pos + 4 - rep`.
@@ -1193,7 +2191,7 @@ mod extend_with_repcode_tests {
                 assert_eq!(
                     *match_len, 4,
                     "rep emit must be exactly 4 bytes (donor MINMATCH floor). \
-                     A regression back to DFAST_MIN_MATCH_LEN = 6 would skip \
+                     A regression back to DFAST_MIN_MATCH_LEN > 4 would skip \
                      this emission entirely and the test would fail with 0 seqs."
                 );
             }
diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs
index 37b565fd..befc304a 100644
--- a/zstd/src/encoding/frame_compressor.rs
+++ b/zstd/src/encoding/frame_compressor.rs
@@ -76,11 +76,20 @@ impl PreviousFseTable {
 }
 
 pub(crate) struct FseTables {
-    pub(crate) ll_default: FSETable,
+    /// The three predefined LL/ML/OF tables are functions of
+    /// compile-time-constant distributions; the cached helpers in
+    /// `fse_encoder` build them once per process and hand back
+    /// `&'static FSETable`. Storing the reference here (rather than
+    /// an owned `FSETable`) collapses `FrameCompressor::new` to a
+    /// 3-pointer copy on the cache-hot path — clone takes ~4 µs on
+    /// x86_64, the bare pointer copy is sub-nanosecond — and removes
+    /// the same per-clone cost from `clone_fse_tables` on the
+    /// block-split estimator path.
+    pub(crate) ll_default: &'static FSETable,
     pub(crate) ll_previous: Option<PreviousFseTable>,
-    pub(crate) ml_default: FSETable,
+    pub(crate) ml_default: &'static FSETable,
     pub(crate) ml_previous: Option<PreviousFseTable>,
-    pub(crate) of_default: FSETable,
+    pub(crate) of_default: &'static FSETable,
     pub(crate) of_previous: Option<PreviousFseTable>,
 }
 
@@ -438,7 +447,8 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
     /// To avoid endlessly encoding from a potentially endless source (like a network socket) you can use the
     /// [Read::take] function
     pub fn compress(&mut self) {
-        let source_size_hint_known = self.source_size_hint.is_some();
+        let initial_size_hint = self.source_size_hint;
+        let source_size_hint_known = initial_size_hint.is_some();
         let use_dictionary_state =
             !matches!(self.compression_level, CompressionLevel::Uncompressed)
                 && self.state.matcher.supports_dictionary_priming()
@@ -518,9 +528,36 @@ impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
             window_size != 0,
             "matcher reported window_size == 0, which is invalid"
         );
-        // Accumulate all compressed blocks; the frame header is written after
-        // all input has been read so that Frame_Content_Size is known.
-        let mut all_blocks: Vec<u8> = Vec::with_capacity(1024 * 130);
+        // Accumulate all compressed blocks; the frame header is written
+        // after all input has been read so that Frame_Content_Size is
+        // known. The default seed is one donor block; smaller seeds for
+        // small payloads avoid pinning a full block worth of bytes when
+        // the compressed output fits in a few hundred bytes. For larger
+        // inputs the default seed amortises the first few `Vec::extend`
+        // doublings cheaply and the `peak - default_seed` residue is
+        // dominated by internal `compress_block_encoded` buffers anyway,
+        // so changing it produces no measurable savings.
+        //
+        // Seed-size tiers (mirrors donor `ZSTD_CStreamOutSize` naming):
+        //
+        // * `ALL_BLOCKS_TINY_CAP` — payload ≤ this size, seed equals
+        //   payload bound; ≥ everything compressed output could need
+        //   for a tiny input.
+        // * `ALL_BLOCKS_SMALL_CAP` — small-input seed picked to absorb
+        //   one or two doublings without over-allocating.
+        // * `ALL_BLOCKS_DEFAULT_CAP` — one donor block; the value the
+        //   rest of the encoder is sized around.
+        const ALL_BLOCKS_TINY_THRESHOLD: u64 = 4 * 1024;
+        const ALL_BLOCKS_SMALL_THRESHOLD: u64 = 64 * 1024;
+        const ALL_BLOCKS_TINY_CAP: usize = 4 * 1024;
+        const ALL_BLOCKS_SMALL_CAP: usize = 16 * 1024;
+        const ALL_BLOCKS_DEFAULT_CAP: usize = 130 * 1024;
+        let initial_all_blocks_cap = match initial_size_hint {
+            Some(h) if h <= ALL_BLOCKS_TINY_THRESHOLD => ALL_BLOCKS_TINY_CAP,
+            Some(h) if h <= ALL_BLOCKS_SMALL_THRESHOLD => ALL_BLOCKS_SMALL_CAP,
+            _ => ALL_BLOCKS_DEFAULT_CAP,
+        };
+        let mut all_blocks: Vec<u8> = Vec::with_capacity(initial_all_blocks_cap);
         let mut total_uncompressed: u64 = 0;
         let mut pending_input: Vec<u8> = Vec::new();
         let mut reached_eof = false;
diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs
index 0ff88a62..050adcba 100644
--- a/zstd/src/encoding/match_generator.rs
+++ b/zstd/src/encoding/match_generator.rs
@@ -50,33 +50,45 @@ use std::arch::is_aarch64_feature_detected;
 #[cfg(all(test, feature = "std", target_arch = "x86_64"))]
 use std::arch::is_x86_feature_detected;
 
-pub(crate) const DFAST_MIN_MATCH_LEN: usize = 6;
+pub(crate) const DFAST_MIN_MATCH_LEN: usize = 5;
 pub(crate) const DFAST_SHORT_HASH_LOOKAHEAD: usize = 4;
 pub(crate) const ROW_MIN_MATCH_LEN: usize = 6;
 pub(crate) const DFAST_TARGET_LEN: usize = 48;
-// Donor `clevels.h:31` at level 3 large-input bucket sets `hashLog = 17`.
-// Our table layout stores `[u32; DFAST_SEARCH_DEPTH]` (4 u32 slots) per
-// bucket, so a `1 << 17`-bucket table is `128 K × 16 B = 2 MiB per
-// table`, `4 MiB` for the (short, long) pair. We previously held a
-// 20-bit ceiling (1 M buckets × 32 B per `[usize; 4]` = 32 MiB per
-// table, 64 MiB for the pair) which inflated peak memory ~85× over
-// donor at default level — the bench matrix added in PR #143 surfaced
-// 64 MB Rust vs 768 KB donor for the two-table footprint on
-// `decodecorpus-z000033`. Donor stores a single `U32` per slot plus a
-// `chainTable` for older positions, so their pair is still smaller
-// than ours per-level; aligning the bucket count via this constant
-// already eats the dominant factor, the storage-width change ate the
-// rest. `dfast_hash_bits_for_window` still clamps the runtime value
-// to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the upper
-// bound rather than a fixed default.
+// Donor `clevels.h:31` at level 3 large-input bucket sets
+// `hashLog = 17` (the long-hash table) and `chainLog = 16` (the
+// short-hash table — donor names this `chainTable` even though for
+// dfast it's used as a plain single-slot hash). Each table holds one
+// `U32` per slot; the donor overwrites on collision and recovers
+// compression quality via the inline `_search_next_long` retry
+// (after a short-hash hit, probes `hashLong[hl1]` at `ip + 1` and
+// keeps the longer match).
+//
+// We mirror that storage layout: single `u32` per bucket (no
+// `[u32; N]` array), `long_hash` sized `1 << DFAST_HASH_BITS` and
+// `short_hash` one bit smaller via `DFAST_SHORT_HASH_BITS_DELTA`.
+// Two-table footprint at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`,
+// exact upstream parity. The `_search_next_long` retry lives in
+// `DfastMatchGenerator::hash_candidate` (called via
+// `best_match`). Earlier revisions kept a
+// 4-slot bucket per hash position; that paid 4× the donor memory
+// without measurable ratio gain once the retry was in place.
+//
+// `dfast_hash_bits_for_window` still clamps the runtime long-hash
+// value to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the
+// upper bound rather than a fixed default.
 pub(crate) const DFAST_HASH_BITS: usize = 17;
-pub(crate) const DFAST_SEARCH_DEPTH: usize = 4;
-/// Sentinel value for an empty slot in the `[u32; DFAST_SEARCH_DEPTH]`
-/// hash buckets. Real positions are stored as `(abs_pos -
-/// position_base + 1) as u32`, so `0` is reserved as the "empty"
-/// marker and a true relative offset of `0` never appears in the
-/// table. Mirrors the LDM table's `LdmEntry.offset == 0` convention
-/// (see `encoding/ldm/table.rs`) so both rebasing structures share
+/// Difference between `long_hash_bits` and `short_hash_bits` —
+/// donor `hashLog - chainLog` is 1 at every dfast level (`clevels.h`
+/// level 2: 16-15=1; level 3: 17-16=1). The short hash is one bit
+/// smaller than the long hash so the per-bucket footprint matches
+/// donor sizing exactly.
+pub(crate) const DFAST_SHORT_HASH_BITS_DELTA: usize = 1;
+/// Sentinel value for an empty slot in the dfast hash tables. Real
+/// positions are stored as `(abs_pos - position_base + 1) as u32`, so
+/// `0` is reserved as the "empty" marker and a true relative offset
+/// of `0` never appears in the table. Mirrors the LDM table's
+/// `LdmEntry.offset == 0` convention (see `encoding/ldm/table.rs`)
+/// so both rebasing structures share
 /// one sentinel scheme.
 pub(crate) const DFAST_EMPTY_SLOT: u32 = 0;
 
@@ -666,15 +678,18 @@ impl MatchGeneratorDriver {
                     });
                 }
                 super::strategy::BackendTag::Dfast => {
-                    let mut retired = Vec::new();
-                    self.dfast_matcher_mut().trim_to_window(|data| {
-                        evicted_bytes += data.len();
-                        retired.push(data);
-                    });
-                    for mut data in retired {
-                        data.resize(data.capacity(), 0);
-                        self.vec_pool.push(data);
-                    }
+                    // Dfast doesn't retain input Vecs — `history` is the
+                    // only byte store, so there is no per-block buffer
+                    // to push back through a callback. Eviction byte
+                    // count is derived from the `window_size` delta
+                    // before/after; the Dfast variant of
+                    // `trim_to_window` takes no closure, sidestepping
+                    // an unused-`impl FnMut` monomorphization that
+                    // would otherwise contractually never fire.
+                    let dfast = self.dfast_matcher_mut();
+                    let pre = dfast.window_size;
+                    dfast.trim_to_window();
+                    evicted_bytes += pre - dfast.window_size;
                 }
                 super::strategy::BackendTag::Row => {
                     let mut retired = Vec::new();
@@ -784,11 +799,7 @@ impl Matcher for MatchGeneratorDriver {
                     // before constructing the replacement variant).
                     m.short_hash = Vec::new();
                     m.long_hash = Vec::new();
-                    let vec_pool = &mut self.vec_pool;
-                    m.reset(|mut data| {
-                        data.resize(data.capacity(), 0);
-                        vec_pool.push(data);
-                    });
+                    m.reset();
                 }
                 MatcherStorage::Row(m) => {
                     m.row_heads = Vec::new();
@@ -873,11 +884,10 @@ impl Matcher for MatchGeneratorDriver {
                 } else {
                     DFAST_HASH_BITS
                 });
-                let vec_pool = &mut self.vec_pool;
-                dfast.reset(|mut data| {
-                    data.resize(data.capacity(), 0);
-                    vec_pool.push(data);
-                });
+                // Dfast holds no per-block input Vecs (history owns the
+                // bytes and `add_data` returns each Vec eagerly), so
+                // `reset` takes no `reuse_space` callback.
+                dfast.reset();
             }
             MatcherStorage::Row(row) => {
                 row.max_window_size = max_window_size;
@@ -3826,6 +3836,67 @@ fn dfast_matches_roundtrip_multi_block_pattern() {
     assert_eq!(&history[prefix_len..], second_block.as_slice());
 }
 
+/// Regression for the `DFAST_MIN_MATCH_LEN: 6 -> 5` drop. The fixture
+/// is built so the longest available match is EXACTLY 5 bytes — a
+/// matcher that still effectively requires a 6-byte floor would emit
+/// only literals here and the assertion would catch the silent
+/// 5-byte miss.
+///
+/// Fixture layout (34 B):
+///   bytes 0..5    `"ABCDE"`  — match source
+///   bytes 5..28   `'!'` × 23 — filler that does NOT start with 'A'
+///   bytes 28..33  `"ABCDE"`  — match site (repeats the prefix)
+///   byte  33      `'F'`      — terminator: differs from byte 5 (`'!'`),
+///                              so the forward extension at the match
+///                              site stops at exactly length 5.
+///
+/// A 5-byte match at offset 28 must be emitted; a 6-byte+ match at the
+/// same offset must NOT.
+#[test]
+fn dfast_accepts_exact_five_byte_match() {
+    // Layout the input so that:
+    //   bytes 0..5   = "ABCDE"        (the match source)
+    //   bytes 5..28  = 23 filler bytes that do NOT start with 'A'
+    //   bytes 28..33 = "ABCDE"        (the 5-byte match site)
+    //   byte  33     = 'F'            (differs from byte 5 = '!')
+    // The longest available copy at position 28 is exactly 5 bytes:
+    // the byte at position 33 ('F') differs from the byte at position 5
+    // ('!'), so the forward extension stops at length 5.
+    let mut data = Vec::new();
+    data.extend_from_slice(b"ABCDE"); // 0..5
+    data.extend_from_slice(b"!!!!!!!!!!!!!!!!!!!!!!!"); // 5..28 (23 bytes)
+    data.extend_from_slice(b"ABCDE"); // 28..33
+    data.push(b'F'); // 33: forces forward extension to stop at length 5
+    assert_eq!(data.len(), 34);
+
+    let mut matcher = DfastMatchGenerator::new(1 << 22);
+    matcher.add_data(data.clone(), |_| {});
+
+    let mut saw_five_byte_match = false;
+    let mut saw_longer_match = false;
+    matcher.start_matching(|seq| {
+        if let Sequence::Triple {
+            offset, match_len, ..
+        } = seq
+        {
+            if offset == 28 && match_len == 5 {
+                saw_five_byte_match = true;
+            } else if offset == 28 && match_len > 5 {
+                saw_longer_match = true;
+            }
+        }
+    });
+
+    assert!(
+        saw_five_byte_match,
+        "dfast must accept the exact-5-byte match — a 6-byte floor would skip it"
+    );
+    assert!(
+        !saw_longer_match,
+        "fixture pinned to length 5 — byte 33 ('F') must terminate the extension"
+    );
+}
+
 #[test]
 fn driver_switches_backends_and_initializes_dfast_via_reset() {
     let mut driver = MatchGeneratorDriver::new(32, 2);
@@ -4116,7 +4187,7 @@ fn dictionary_entropy_seed_initializes_opt_state_from_tables() {
     let ll = crate::fse::fse_encoder::default_ll_table();
     let ml = crate::fse::fse_encoder::default_ml_table();
     let of = crate::fse::fse_encoder::default_of_table();
-    hc.seed_dictionary_entropy(Some(&huff), Some(&ll), Some(&ml), Some(&of));
+    hc.seed_dictionary_entropy(Some(&huff), Some(ll), Some(ml), Some(of));
 
     hc.backend.bt_mut().opt_state.rescale_freqs(
         b"abcd",
@@ -4161,7 +4232,7 @@ fn dictionary_fse_seed_applies_without_huffman_seed() {
     let ll = crate::fse::fse_encoder::default_ll_table();
     let ml = crate::fse::fse_encoder::default_ml_table();
     let of = crate::fse::fse_encoder::default_of_table();
-    hc.seed_dictionary_entropy(None, Some(&ll), Some(&ml), Some(&of));
+    hc.seed_dictionary_entropy(None, Some(ll), Some(ml), Some(of));
     hc.backend.bt_mut().opt_state.rescale_freqs(
         b"abcd",
         HcOptimalCostProfile::const_for_strategy::<super::strategy::BtUltra2>(),
@@ -4204,7 +4275,7 @@ fn dictionary_seed_overrides_predef_price_mode_on_tiny_input() {
     let ll = crate::fse::fse_encoder::default_ll_table();
     let ml = crate::fse::fse_encoder::default_ml_table();
     let of = crate::fse::fse_encoder::default_of_table();
-    hc.seed_dictionary_entropy(None, Some(&ll), Some(&ml), Some(&of));
+    hc.seed_dictionary_entropy(None, Some(ll), Some(ml), Some(of));
     hc.backend.bt_mut().opt_state.rescale_freqs(
         b"abc",
         HcOptimalCostProfile::const_for_strategy::<super::strategy::BtUltra2>(),
@@ -4264,7 +4335,7 @@ fn btultra2_seed_pass_disabled_when_dictionary_entropy_seed_present() {
     let ll = crate::fse::fse_encoder::default_ll_table();
     let ml = crate::fse::fse_encoder::default_ml_table();
     let of = crate::fse::fse_encoder::default_of_table();
-    hc.seed_dictionary_entropy(None, Some(&ll), Some(&ml), Some(&of));
+    hc.seed_dictionary_entropy(None, Some(ll), Some(ml), Some(of));
     assert!(
         !hc.should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1),
         "dictionary-seeded first block should skip btultra2 warmup pass"
@@ -4410,7 +4481,7 @@ fn dictionary_huffman_seed_ignored_when_literals_uncompressed() {
     let ll = crate::fse::fse_encoder::default_ll_table();
     let ml = crate::fse::fse_encoder::default_ml_table();
     let of = crate::fse::fse_encoder::default_of_table();
-    stats.seed_dictionary_entropy(Some(&huff), Some(&ll), Some(&ml), Some(&of));
+    stats.seed_dictionary_entropy(Some(&huff), Some(ll), Some(ml), Some(of));
     stats.rescale_freqs(
         b"abcd",
         HcOptimalCostProfile::const_for_strategy::<super::strategy::BtUltra2>(),
@@ -4822,8 +4893,15 @@ fn driver_small_source_hint_shrinks_dfast_hash_tables() {
     space.truncate(12);
     driver.commit_space(space);
     driver.skip_matching_with_hint(None);
-    let full_tables = driver.dfast_matcher().short_hash.len();
-    assert_eq!(full_tables, 1 << DFAST_HASH_BITS);
+    // Donor-parity split sizes: long-hash = DFAST_HASH_BITS,
+    // short-hash = DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA.
+    let full_long = driver.dfast_matcher().long_hash.len();
+    let full_short = driver.dfast_matcher().short_hash.len();
+    assert_eq!(full_long, 1 << DFAST_HASH_BITS);
+    assert_eq!(
+        full_short,
+        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA)
+    );
 
     driver.set_source_size_hint(1024);
     driver.reset(CompressionLevel::Level(2));
@@ -4832,13 +4910,30 @@ fn driver_small_source_hint_shrinks_dfast_hash_tables() {
     space.truncate(12);
     driver.commit_space(space);
     driver.skip_matching_with_hint(None);
-    let hinted_tables = driver.dfast_matcher().short_hash.len();
+    let hinted_long = driver.dfast_matcher().long_hash.len();
+    let hinted_short = driver.dfast_matcher().short_hash.len();
 
     assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
-    assert_eq!(hinted_tables, 1 << MIN_HINTED_WINDOW_LOG);
+    // At the hinted floor `MIN_HINTED_WINDOW_LOG`, the long table
+    // matches the hinted size; the short table sits one
+    // `DFAST_SHORT_HASH_BITS_DELTA` step below it, clamped at its own
+    // `MIN_WINDOW_LOG` floor. The one-bit split between the two tables
+    // is preserved — the short table is NOT pulled up to equal the
+    // long table at this floor.
+    assert_eq!(hinted_long, 1 << MIN_HINTED_WINDOW_LOG);
+    let expected_hinted_short_bits = (MIN_HINTED_WINDOW_LOG as usize)
+        .saturating_sub(DFAST_SHORT_HASH_BITS_DELTA)
+        .max(MIN_WINDOW_LOG as usize);
+    assert_eq!(
+        hinted_short,
+        1 << expected_hinted_short_bits,
+        "short table must sit one DFAST_SHORT_HASH_BITS_DELTA below the long table \
+         (clamped at MIN_WINDOW_LOG) — a regression that pulls it up to the long-table \
+         floor would still satisfy the `< full_short` bound below and slip through"
+    );
     assert!(
-        hinted_tables < full_tables,
-        "tiny source hint should reduce dfast table footprint"
+        hinted_long < full_long && hinted_short < full_short,
+        "tiny source hint should reduce both dfast tables"
     );
 }
 
@@ -5069,11 +5164,20 @@ fn driver_unhinted_level2_keeps_default_dfast_hash_table_size() {
     driver.commit_space(space);
     driver.skip_matching_with_hint(None);
 
-    let table_len = driver.dfast_matcher().short_hash.len();
+    // Donor-parity split: long-hash at DFAST_HASH_BITS, short-hash one
+    // bit smaller (DFAST_SHORT_HASH_BITS_DELTA = 1, matching donor
+    // `chainLog = hashLog - 1` for dfast levels).
+    let long_len = driver.dfast_matcher().long_hash.len();
+    let short_len = driver.dfast_matcher().short_hash.len();
     assert_eq!(
-        table_len,
+        long_len,
         1 << DFAST_HASH_BITS,
-        "unhinted Level(2) should keep default dfast table size"
+        "unhinted Level(2) should keep default long-hash table size"
+    );
+    assert_eq!(
+        short_len,
+        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA),
+        "unhinted Level(2) short-hash should be one bit smaller than long-hash"
     );
 }
 
@@ -6856,7 +6960,15 @@ fn dfast_add_data_callback_reports_evicted_len_not_capacity() {
 }
 
 #[test]
-fn dfast_trim_to_window_callback_reports_evicted_len_not_capacity() {
+fn dfast_trim_to_window_evicts_oldest_block_by_length() {
+    // After the history-only storage refactor (#111 Phase 7c step 3),
+    // Dfast no longer retains input `Vec<u8>`s — the `history`
+    // contiguous buffer is the sole byte store, and `add_data`
+    // returns the input Vec to the caller's pool eagerly. So
+    // `trim_to_window` doesn't have anything to hand back to the
+    // closure (no Vec exists to give). The eviction is observable
+    // instead through `window_size` shrinking by the per-block
+    // length recorded in `window_blocks`.
     let mut matcher = DfastMatchGenerator::new(16);
 
     let mut first = Vec::with_capacity(64);
@@ -6867,18 +6979,26 @@ fn dfast_trim_to_window_callback_reports_evicted_len_not_capacity() {
     second.extend_from_slice(b"ijklmnop");
     matcher.add_data(second, |_| {});
 
+    assert_eq!(matcher.window_size, 16);
+    assert_eq!(matcher.window_blocks.len(), 2);
+
     matcher.max_window_size = 8;
 
-    let mut observed_evicted_len = None;
-    matcher.trim_to_window(|data| {
-        observed_evicted_len = Some(data.len());
-    });
+    matcher.trim_to_window();
 
+    // No callback signature to assert on: the Dfast variant of
+    // `trim_to_window` takes none. That signature shape (vs HC/Row
+    // which accept `impl FnMut(Vec<u8>)`) is the property locking in
+    // the contract — there is no closure to invoke or skip, so no
+    // future change can "start invoking the callback" without a
+    // compile-time signature break that the dispatcher and this test
+    // would force the author to address.
     assert_eq!(
-        observed_evicted_len,
-        Some(8),
-        "trim callback must report evicted byte length, not backing capacity"
+        matcher.window_size, 8,
+        "exactly one 8-byte block must remain"
     );
+    assert_eq!(matcher.window_blocks.len(), 1);
+    assert_eq!(matcher.history_abs_start, 8);
 }
 
 #[test]
@@ -7036,18 +7156,16 @@ fn dfast_skip_matching_dense_backfills_newly_hashable_long_tail_positions() {
         target_rel + 8 <= live.len(),
         "fixture must make the boundary start long-hashable"
     );
-    let long_hash = matcher.hash8(&live[target_rel..]);
+    let long_hash = matcher.long_hash_index(&live[target_rel..]);
     let target_slot = matcher.pack_slot(target_abs_pos);
-    // Guard against the membership check turning vacuous if a future
-    // `pack_slot` regression returned `DFAST_EMPTY_SLOT`: empty buckets
-    // are pre-filled with the sentinel, so `contains(&empty)` would
-    // pass without proving the real position was seeded.
+    // Single-slot tables (donor parity): the bucket holds at most one
+    // u32; the assertion below is a direct equality (no `.contains`).
     assert_ne!(
         target_slot, DFAST_EMPTY_SLOT,
         "pack_slot must never return the empty-slot sentinel for a real position"
     );
-    assert!(
-        matcher.long_hash[long_hash].contains(&target_slot),
+    assert_eq!(
+        matcher.long_hash[long_hash], target_slot,
         "dense skip must seed long-hash entry for newly hashable boundary start"
     );
 }
@@ -7059,7 +7177,7 @@ fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() {
     matcher.add_data(block, |_| {});
     matcher.ensure_hash_tables();
 
-    let current_len = matcher.window.back().unwrap().len();
+    let current_len = matcher.window_blocks.back().copied().unwrap_or(0);
     let current_abs_start = matcher.history_abs_start + matcher.window_size - current_len;
     let seed_start = current_len - DFAST_MIN_MATCH_LEN;
     matcher.seed_remaining_hashable_starts(current_abs_start, current_len, seed_start);
@@ -7071,18 +7189,14 @@ fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() {
         target_rel + 4 <= live.len(),
         "fixture must leave the last short-hash start valid"
     );
-    let short_hash = matcher.hash4(&live[target_rel..]);
+    let short_hash = matcher.short_hash_index(&live[target_rel..]);
     let target_slot = matcher.pack_slot(target_abs_pos);
-    // Guard against the membership check turning vacuous if a future
-    // `pack_slot` regression returned `DFAST_EMPTY_SLOT`: empty buckets
-    // are pre-filled with the sentinel, so `contains(&empty)` would
-    // pass without proving the real position was seeded.
     assert_ne!(
         target_slot, DFAST_EMPTY_SLOT,
         "pack_slot must never return the empty-slot sentinel for a real position"
     );
-    assert!(
-        matcher.short_hash[short_hash].contains(&target_slot),
+    assert_eq!(
+        matcher.short_hash[short_hash], target_slot,
         "tail seeding must include the last 4-byte-hashable start"
     );
 }
@@ -7094,7 +7208,7 @@ fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() {
     matcher.add_data(block, |_| {});
     matcher.ensure_hash_tables();
 
-    let current_len = matcher.window.back().unwrap().len();
+    let current_len = matcher.window_blocks.back().copied().unwrap_or(0);
     let current_abs_start = matcher.history_abs_start + matcher.window_size - current_len;
     matcher.seed_remaining_hashable_starts(current_abs_start, current_len, current_len);
 
@@ -7105,18 +7219,14 @@ fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() {
         target_rel + 4 <= live.len(),
         "fixture must leave the last short-hash start valid"
     );
-    let short_hash = matcher.hash4(&live[target_rel..]);
+    let short_hash = matcher.short_hash_index(&live[target_rel..]);
     let target_slot = matcher.pack_slot(target_abs_pos);
-    // Guard against the membership check turning vacuous if a future
-    // `pack_slot` regression returned `DFAST_EMPTY_SLOT`: empty buckets
-    // are pre-filled with the sentinel, so `contains(&empty)` would
-    // pass without proving the real position was seeded.
     assert_ne!(
         target_slot, DFAST_EMPTY_SLOT,
         "pack_slot must never return the empty-slot sentinel for a real position"
     );
-    assert!(
-        matcher.short_hash[short_hash].contains(&target_slot),
+    assert_eq!(
+        matcher.short_hash[short_hash], target_slot,
         "tail seeding must still include the last 4-byte-hashable start when pos is at block end"
     );
 }
@@ -7152,8 +7262,8 @@ fn dfast_ensure_room_for_rebases_above_guard_band() {
     let early_abs = 1024usize;
     let early_packed = dfast.pack_slot(early_abs);
     assert_ne!(early_packed, DFAST_EMPTY_SLOT);
-    dfast.short_hash[0][0] = early_packed;
-    dfast.long_hash[0][0] = early_packed;
+    dfast.short_hash[0] = early_packed;
+    dfast.long_hash[0] = early_packed;
 
     // Pick a trigger position that forces the first rebase. With
     // `position_base = 0`, the smallest `abs_pos` that fails the
@@ -7174,11 +7284,11 @@ fn dfast_ensure_room_for_rebases_above_guard_band() {
     // donor parity for `ZSTD_window_reduce`'s clamp-at-zero rule.
     // Verify BOTH tables — `reduce()` walks them in sequence.
     assert_eq!(
-        dfast.short_hash[0][0], DFAST_EMPTY_SLOT,
+        dfast.short_hash[0], DFAST_EMPTY_SLOT,
         "pre-rebase short-hash entries below the reducer must become empty"
     );
     assert_eq!(
-        dfast.long_hash[0][0], DFAST_EMPTY_SLOT,
+        dfast.long_hash[0], DFAST_EMPTY_SLOT,
         "pre-rebase long-hash entries below the reducer must become empty"
     );
 
diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs
index b5b89815..64a3dda6 100644
--- a/zstd/src/encoding/mod.rs
+++ b/zstd/src/encoding/mod.rs
@@ -92,6 +92,55 @@ use alloc::vec::Vec;
 
 pub(crate) const BETTER_WINDOW_LOG: u8 = 23;
 
+/// Worst-case compressed-size bound for `src_size` uncompressed
+/// bytes. Mirrors the upstream `ZSTD_COMPRESSBOUND(srcSize)` macro
+/// from `lib/zstd.h` (zstd 1.5.7) — the C *macro*, NOT the public
+/// `ZSTD_compressBound()` function. The function adds frame/block-
+/// header headroom in some code paths; the macro is a tighter
+/// expression suitable for sizing an internal output Vec, which is
+/// the only call site here. Callers that need a hard upper bound on
+/// the wire-format compressed size should NOT reuse this — use the
+/// `zstd-sys` function call directly. If the macro changes in a
+/// future upstream revision, treat the upstream header as
+/// authoritative and re-derive this function — do not trust the
+/// inline copy without cross-checking.
+///
+/// Inline approximation (this revision):
+///
+/// ```text
+/// srcSize
+///   + (srcSize >> 8)
+///   + (srcSize < 128 KiB ? ((128 KiB - srcSize) >> 11) : 0)
+/// ```
+///
+/// Consulted by [`compress_slice_to_vec`] for the small-input branch
+/// of its output-`Vec` seed: the seed is
+/// `min(compress_bound(src), OUTPUT_BLOCK_CAP = 128 KiB)`, so for
+/// inputs that fit within one donor block (`≤ ~128 KiB`) the
+/// destination never reallocates inside the measured window. Without
+/// the bound, `Vec::push` growth doubles in powers of two and pins
+/// `~2 × final_compressed_size` resident at the last realloc, which
+/// shows up as ~1 MiB of peak-RSS noise on 1 MiB inputs and prevents
+/// FFI-parity on the memory bench. For inputs larger than the cap the
+/// `Vec` still grows by amortized doubling — see
+/// [`compress_slice_to_vec`] for the trade-off rationale.
+#[inline]
+pub(crate) const fn compress_bound(src_size: usize) -> usize {
+    const SMALL_INPUT_THRESHOLD: usize = 128 * 1024;
+    let tail = if src_size < SMALL_INPUT_THRESHOLD {
+        (SMALL_INPUT_THRESHOLD - src_size) >> 11
+    } else {
+        0
+    };
+    // `saturating_add` guards against the corner case where `src_size`
+    // is close to `usize::MAX`. The current sole caller
+    // (`compress_slice_to_vec`) clamps the result with `.min(OUTPUT_BLOCK_CAP)`
+    // so an overflow there would still produce a sensible cap, but
+    // future `pub(crate)` callers may use the raw bound — keep the
+    // arithmetic itself overflow-safe.
+    src_size.saturating_add(src_size >> 8).saturating_add(tail)
+}
+
 /// Convenience function to compress some source into a target without reusing any resources of the compressor
 /// ```rust
 /// use structured_zstd::encoding::{compress, CompressionLevel};
@@ -112,6 +161,50 @@ pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel
 /// can provide a source-size hint to the one-shot encoder path. Peak memory can
 /// therefore be roughly `input_size + output_size`. For very large payloads or
 /// tighter memory budgets, prefer streaming APIs such as [`StreamingEncoder`].
+///
+/// **Peak-memory shape change in this revision.** The implementation
+/// delegates to [`compress_slice_to_vec`], which seeds the output
+/// `Vec` with `min(compress_bound(input.len()), OUTPUT_BLOCK_CAP =
+/// 128 KiB)` instead of the previous `Vec::new()` (zero-capacity +
+/// power-of-two growth). For inputs in the few-KiB to ~128 KiB range
+/// this is a strict improvement (no doubling spikes inside the
+/// measured window). For inputs significantly larger than 128 KiB the
+/// allocation curve still grows by amortized doubling but starts from
+/// a 128 KiB floor rather than 0. Downstream consumers that measure
+/// peak RSS on this entry point will see a different curve than
+/// pre-revision; bench shape, not steady-state, is what changed.
+///
+/// **This is NOT a streaming API.** The source is fully buffered
+/// into a `Vec<u8>` before any compression work begins, so peak input
+/// memory is bounded by `source.len()` (not "constant regardless of
+/// payload size" as a stream-shaped encoder would offer). The RSS
+/// notes below apply to the materialization-then-compress shape; if
+/// the source is large enough that holding it in memory is not
+/// acceptable, use [`StreamingEncoder`] which consumes chunks
+/// incrementally without the up-front Vec build.
+///
+/// The other side of the peak shape is the input buffering: this
+/// helper drives `read_to_end` to materialize the full source into a
+/// `Vec<u8>` before forwarding the slice to [`compress_slice_to_vec`].
+/// For a `Read` whose size is unknown ahead of time, `read_to_end`
+/// grows that input `Vec` via power-of-two doubling — peak input
+/// allocation can be up to 2× the final source length transiently.
+/// At the moment that input buffer crosses ~128 KiB the output Vec
+/// seed kicks in concurrently. The total live working set on this
+/// entry point is approximately
+/// `input.capacity() + output_vec_seed + internal_accumulators`,
+/// where `output_vec_seed` is `min(compress_bound(input.len()), 128
+/// KiB)` and `internal_accumulators` covers
+/// `FrameCompressor::all_blocks` (pre-reserved at frame start, up to
+/// ~130 KiB at default block cap) plus per-block scratch (hash tables,
+/// literal/sequence staging). Round the helper's RSS peak to
+/// `input.capacity() + output_vec_seed + ~130 KiB internal +
+/// per-block scratch` rather than the bare `input.capacity() + 128
+/// KiB` figure quoted in earlier revisions, which only accounted for
+/// the output seed. [`StreamingEncoder`] avoids the input
+/// materialization step entirely and is the right entry point when
+/// the source is large or unbounded.
+///
 /// ```rust
 /// use structured_zstd::encoding::{compress_to_vec, CompressionLevel};
 /// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
@@ -121,11 +214,73 @@ pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
     let mut source = source;
     let mut input = Vec::new();
     source.read_to_end(&mut input).unwrap();
+    compress_slice_to_vec(input.as_slice(), level)
+}
 
-    let mut vec = Vec::new();
+/// Compress a contiguous byte slice into a fresh `Vec<u8>` without
+/// the input-buffering step that [`compress_to_vec`] performs to
+/// adapt a `Read` source. Donor-parity peak-memory shape: the input
+/// is read by reference, and the output `Vec` is seeded with
+/// `min(compress_bound(src), OUTPUT_BLOCK_CAP = 128 KiB)`.
+///
+/// The seed is intentionally capped at one donor block
+/// (`OUTPUT_BLOCK_CAP = 128 KiB`). Three regimes:
+///
+/// * Inputs up to ~127 KiB where `compress_bound(src) <
+///   OUTPUT_BLOCK_CAP` — the `min` picks the tighter bound and the
+///   `Vec` never reallocates inside the measured window (cheap, no
+///   doubling spikes, no over-allocation).
+/// * Inputs around 128 KiB where `compress_bound(src) >=
+///   OUTPUT_BLOCK_CAP` — the `min` clamps to the cap, so the "no
+///   reallocation" property holds only as long as the actual
+///   compressed output also fits within `OUTPUT_BLOCK_CAP`. For
+///   high-entropy inputs near the cap boundary the `Vec` may grow
+///   by one doubling step before the next block lands.
+/// * Larger inputs — the `Vec` grows via amortized doubling, with
+///   peak transient memory ≈ 2× current compressed size at the last
+///   realloc. Deliberate trade-off against pinning
+///   `compress_bound(src)` upfront, which on the 100 MiB
+///   `large-log-stream` scenario pinned 100.4 MiB even though the
+///   actual compressed output was ≪ 1 MiB.
+///
+/// # Panics
+///
+/// Panics on encoder error (matches the failure surface of
+/// [`compress_to_vec`], which this function backs). The internal
+/// [`FrameCompressor::compress`] call propagates `io::Error` from the
+/// output [`Vec`] writer; under the in-tree default writer that
+/// channel is infallible and any error becomes a `panic`. Out-of-
+/// memory during `Vec::with_capacity` or the encoder's per-block
+/// scratch allocations is handled by the global allocator's abort
+/// policy. Migrating to a fallible variant requires plumbing
+/// `Result` through `FrameCompressor::compress` — out of scope for
+/// the slice/Vec entry points which mirror the donor `ZSTD_compress`
+/// shape (no error return on the bulk path).
+///
+/// ```rust
+/// use structured_zstd::encoding::{compress_slice_to_vec, CompressionLevel};
+/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
+/// let compressed = compress_slice_to_vec(data, CompressionLevel::Fastest);
+/// ```
+pub fn compress_slice_to_vec(source: &[u8], level: CompressionLevel) -> Vec<u8> {
+    // Initial capacity sized to a single output block, matching donor
+    // `ZSTD_CStreamOutSize()` (≈ `ZSTD_BLOCKSIZE_MAX` + headroom). For
+    // small inputs `compress_bound(src)` is smaller than this block —
+    // use the tighter bound to avoid over-allocation. For larger inputs
+    // the `Vec` grows via amortized doubling, with peak ≈ 2×current
+    // compressed size at the last realloc (donor streaming shape: the
+    // caller buffer is one block, not `ZSTD_compressBound(srcSize)`).
+    // Pre-allocating `compress_bound(src)` here would charge worst-case
+    // expansion against peak even on highly compressible inputs — on
+    // the 100 MiB `large-log-stream` scenario that single allocation
+    // dominated the encoder's measured footprint (`compress_bound` ≈
+    // 100.4 MiB pinned, actual compressed output ≪ 1 MiB).
+    const OUTPUT_BLOCK_CAP: usize = 128 * 1024;
+    let cap = compress_bound(source.len()).min(OUTPUT_BLOCK_CAP);
+    let mut vec = Vec::with_capacity(cap);
     let mut frame_enc = FrameCompressor::new(level);
-    frame_enc.set_source_size_hint(input.len() as u64);
-    frame_enc.set_source(input.as_slice());
+    frame_enc.set_source_size_hint(source.len() as u64);
+    frame_enc.set_source(source);
     frame_enc.set_drain(&mut vec);
     frame_enc.compress();
     vec
diff --git a/zstd/src/fse/fse_encoder.rs b/zstd/src/fse/fse_encoder.rs
index 40253516..ee420723 100644
--- a/zstd/src/fse/fse_encoder.rs
+++ b/zstd/src/fse/fse_encoder.rs
@@ -699,14 +699,105 @@ const OF_DIST: &[i32] = &[
     1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
 ];
 
-pub(crate) fn default_ml_table() -> FSETable {
-    build_table_from_probabilities(ML_DIST, 6)
+// The three predefined LL/ML/OF distribution tables are pure functions
+// of compile-time constants (`LL_DIST`, `ML_DIST`, `OF_DIST` and fixed
+// `acc_log` values). Each `build_table_from_probabilities` call costs
+// ~12 µs on a modern x86_64 — multiplied by three, that's ~38 µs of
+// pure setup paid on every `FrameCompressor::new`. On a 1 KiB frame
+// the actual compression work is ~1-5 µs, so the predefined-table
+// build dominates the per-frame cost for tiny inputs.
+//
+// Cache each predefined table once per process via a dedicated
+// `AtomicPtr<FSETable>` and hand callers a clone. The per-distribution
+// cache slot is encoded by the static identity of the cache itself
+// (each helper has its own `static`), NOT by pointer comparison on
+// the distribution slice — `LL_DIST` / `ML_DIST` / `OF_DIST` are
+// `const`, and `core::ptr::eq` on the materialized slice pointer of
+// a `const` is only stable as long as rustc keeps lowering it to a
+// single anonymous static. A future rustc change that duplicates the
+// const at each use site would silently sink every call through the
+// "unknown slice" branch and bypass the cache forever. Owning the
+// `static AtomicPtr` per-helper means the cache slot is selected at
+// compile time without consulting the slice pointer at all.
+//
+// `AtomicPtr<FSETable>` lock-free init works in both `std` and
+// `no_std` builds — only `alloc` is required (always available in
+// this crate via `extern crate alloc`), which keeps the cache on a
+// single code path instead of a `cfg(feature = "std")` branch with
+// an eager rebuild fallback. Memory ordering: `Acquire` on the load
+// so the consumer sees a fully-published `FSETable`; `AcqRel` on the
+// compare-exchange so the publishing side both reads any concurrent
+// winner (`Acquire`) and publishes its store (`Release`). The first
+// writer leaks `Box<FSETable>` intentionally — the cache lives for
+// the entire program lifetime, exactly like a `LazyLock`.
+fn get_or_init_cached_table(
+    cache: &core::sync::atomic::AtomicPtr<FSETable>,
+    probs: &[i32],
+    acc_log: u8,
+) -> &'static FSETable {
+    use core::sync::atomic::Ordering;
+
+    let cur = cache.load(Ordering::Acquire);
+    if !cur.is_null() {
+        // SAFETY: a non-null entry in this cache was published by a
+        // previous winner of the `compare_exchange` below, which
+        // leaked the `Box<FSETable>` (the cache never frees, mirroring
+        // a `LazyLock` lifetime). The pointed-to allocation is
+        // therefore valid for `'static` and immutable for the rest
+        // of the program, so handing out a `&'static FSETable` to
+        // multiple threads is sound.
+        return unsafe { &*cur };
+    }
+
+    let built = alloc::boxed::Box::new(build_table_from_probabilities(probs, acc_log));
+    let raw = alloc::boxed::Box::into_raw(built);
+    // `AcqRel` on success rather than the minimal `Release`: the
+    // success path doesn't actually need Acquire (no prior loads to
+    // synchronise with at the publication point), but matching the
+    // failure Acquire ordering keeps the success and failure
+    // branches symmetric on rustc's atomic surface — both arms then
+    // see the same memory-fence shape for whatever follows. The
+    // marginal cost is one extra fence instruction on x86/aarch64;
+    // this path runs at most three times per process so it never
+    // shows up in a flamegraph.
+    match cache.compare_exchange(
+        core::ptr::null_mut(),
+        raw,
+        Ordering::AcqRel,
+        Ordering::Acquire,
+    ) {
+        Ok(_) => {
+            // We installed the singleton. SAFETY: `raw` is the
+            // pointer we just published; no other thread can free it.
+            unsafe { &*raw }
+        }
+        Err(existing) => {
+            // Another thread beat us to the publish — reclaim our
+            // throwaway allocation and use the winner. SAFETY: we own
+            // `raw` here (compare_exchange did NOT store it), so
+            // reconstituting the `Box` is sound. `existing` was
+            // published by the winning thread and is leaked for
+            // `'static`, mirroring the `cur` path.
+            drop(unsafe { alloc::boxed::Box::from_raw(raw) });
+            unsafe { &*existing }
+        }
+    }
+}
+
+pub(crate) fn default_ml_table() -> &'static FSETable {
+    static CACHE: core::sync::atomic::AtomicPtr<FSETable> =
+        core::sync::atomic::AtomicPtr::new(core::ptr::null_mut());
+    get_or_init_cached_table(&CACHE, ML_DIST, 6)
 }
 
-pub(crate) fn default_ll_table() -> FSETable {
-    build_table_from_probabilities(LL_DIST, 6)
+pub(crate) fn default_ll_table() -> &'static FSETable {
+    static CACHE: core::sync::atomic::AtomicPtr<FSETable> =
+        core::sync::atomic::AtomicPtr::new(core::ptr::null_mut());
+    get_or_init_cached_table(&CACHE, LL_DIST, 6)
 }
 
-pub(crate) fn default_of_table() -> FSETable {
-    build_table_from_probabilities(OF_DIST, 5)
+pub(crate) fn default_of_table() -> &'static FSETable {
+    static CACHE: core::sync::atomic::AtomicPtr<FSETable> =
+        core::sync::atomic::AtomicPtr::new(core::ptr::null_mut());
+    get_or_init_cached_table(&CACHE, OF_DIST, 5)
 }
diff --git a/zstd/src/fse/mod.rs b/zstd/src/fse/mod.rs
index 5e410322..170c34da 100644
--- a/zstd/src/fse/mod.rs
+++ b/zstd/src/fse/mod.rs
@@ -95,9 +95,9 @@ fn table_header_bits_exact() {
     };
 
     // Predefined tables
-    check(&default_ll_table());
-    check(&default_ml_table());
-    check(&default_of_table());
+    check(default_ll_table());
+    check(default_ml_table());
+    check(default_of_table());
 
     // Tables built from synthetic data
     let data: alloc::vec::Vec<u8> = (0u8..32).cycle().take(1000).collect();
diff --git a/zstd/src/tests/fuzz_regressions.rs b/zstd/src/tests/fuzz_regressions.rs
index a9fa5c95..b1b25494 100644
--- a/zstd/src/tests/fuzz_regressions.rs
+++ b/zstd/src/tests/fuzz_regressions.rs
@@ -25,3 +25,72 @@ fn test_all_artifacts() {
             .and_then(|()| frame_dec.decode_blocks(&mut f, BlockDecodingStrategy::All));
     }
 }
+
+/// Regression for the `interop` fuzz target: a 7-byte input crashed the
+/// level 3 dfast hot loop because `start_matching_fast_loop` guarded the
+/// loop with `pos + DFAST_MIN_MATCH_LEN <= current_len` (`MIN_MATCH = 5`)
+/// but unconditionally issued 8-byte `u64` loads via raw-pointer
+/// `read_unaligned` for the long-hash probe. On any block whose tail
+/// landed within `[current_len - 8, current_len - 5]` the load read past
+/// `history.len()`, which is UB on `*const u64::read_unaligned` even if
+/// the underlying `Vec`'s spare capacity covers the bytes.
+///
+/// The fix tightens every fast-loop guard to `+ HASH_READ_SIZE = 8` so
+/// the load is always in-bounds for the live history, matching donor
+/// `ilimit = iend - HASH_READ_SIZE` in `zstd_double_fast.c`.
+///
+/// Artifact: `zstd/fuzz/artifacts/interop/crash-01be...0dc7`. Base64
+/// `BGAuICAKIA==` → bytes `04 60 2e 20 20 0a 20`. CI fuzz run that
+/// produced this artifact:
+/// https://github.com/structured-world/structured-zstd/actions/runs/25974756307
+///
+/// SIGNAL CAVEAT: a plain `cargo nextest run` of this test may pass
+/// against the pre-fix code because the OOB `*const u64::read_unaligned`
+/// usually lands inside the live `Vec`'s spare capacity — the bytes are
+/// well-defined for the allocator even though the read is UB. The
+/// regression reliably fires only under a sanitizer that tracks valid
+/// length (CI fuzz job runs ASan; `cargo +nightly miri test` also
+/// catches it). Treat a green `cargo test` here as a smoke check, not
+/// proof that the fast-loop guards are correct; the authoritative
+/// signal for this fixture is the Linux fuzz CI job.
+#[test]
+fn interop_7_byte_input_does_not_oob_in_dfast_fast_loop() {
+    extern crate std;
+    use crate::decoding::{BlockDecodingStrategy, FrameDecoder};
+    use crate::encoding::{CompressionLevel, compress_to_vec};
+
+    // Bytes inline; the original libFuzzer artifact file was
+    // content-hashed (`crash-01be...0dc7`) so any future fuzz run
+    // that re-discovers the same root cause via a different input
+    // would land in a different filename anyway — the literal is
+    // the canonical regression vector, not the artifact path.
+    // Base64 `BGAuICAKIA==`.
+    let data: &[u8] = &[0x04, 0x60, 0x2e, 0x20, 0x20, 0x0a, 0x20];
+
+    // Pin to `Level(3)` rather than `Default` so this regression keeps
+    // covering the dfast fast loop specifically — the original UB
+    // surfaced through level 3 dfast, and pinning here means a future
+    // retune of the `Default` alias cannot accidentally route this
+    // test off the dfast path and let the regression pass without
+    // exercising the fixed code. Pre-fix this panicked / produced a
+    // garbage frame on Linux fuzz (ASan caught the UB).
+    let compressed = compress_to_vec(data, CompressionLevel::Level(3));
+
+    // Roundtrip through the in-tree decoder — matches the convention
+    // used by `test_all_artifacts` above and avoids coupling this
+    // regression to the donor `zstd` crate. The OOB load shows up as
+    // a panic / decode error before this point under ASan; if we get
+    // here with a parseable frame the bytes must match the input.
+    let mut frame_dec = FrameDecoder::new();
+    let mut cursor = compressed.as_slice();
+    frame_dec.reset(&mut cursor).unwrap();
+    frame_dec
+        .decode_blocks(&mut cursor, BlockDecodingStrategy::All)
+        .unwrap();
+    // `expect` over `unwrap_or_default`: a real decoder failure must
+    // surface as a "decoder returned None" panic, not as an empty
+    // `decoded` that then fails `assert_eq!` with a misleading
+    // "left: [] right: [04 60 ...]" diff that hides the real cause.
+    let decoded = frame_dec.collect().expect("decoder returned no payload");
+    assert_eq!(decoded.as_slice(), data);
+}