diff --git a/.gitignore b/.gitignore index d86432bc..fe785b7e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,8 +13,15 @@ benchmark-delta.md benchmark-relative.json fuzz/corpus zstd/fuzz/corpus +# Fresh fuzz crashes from local `cargo fuzz run` sessions are CI/local +# scratch — they live in the runner. Pre-existing tracked artifacts +# under `decode/`, `fse/`, `huff0/` keep working because git tracks +# them by index regardless of this pattern; new untracked crashes +# stop polluting `git status`. +zstd/fuzz/artifacts/** .idea /= AGENTS.md .claude/ +.local/ **/__pycache__/ diff --git a/Cargo.toml b/Cargo.toml index fbc1467f..ca0ac7cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,16 @@ [workspace] resolver = "3" members = ["zstd", "cli"] + +# Bench profile inherits from release but overrides LTO/codegen-units so +# the perf measurement reflects full cross-crate inlining. We deliberately +# scope this to `[profile.bench]` instead of `[profile.release]`: the +# release profile is shared by the `cli` binary and every downstream +# consumer of this workspace, and fat LTO + codegen-units = 1 multiplies +# release build times ~5× and significantly increases peak compile RAM. +# Bench builds run on CI with the wall clock budget for it; everyday +# `cargo build --release` (and downstream consumers) should not pay that +# cost without explicitly opting in. +[profile.bench] +lto = "fat" +codegen-units = 1 diff --git a/zstd/benches/compare_ffi.rs b/zstd/benches/compare_ffi.rs index 3c11dd28..37ac8162 100644 --- a/zstd/benches/compare_ffi.rs +++ b/zstd/benches/compare_ffi.rs @@ -213,7 +213,7 @@ fn bench_compress(c: &mut Criterion) { for scenario in benchmark_scenarios_cached().iter() { for level in supported_levels_filtered() { if emit_reports { - let rust_compressed = structured_zstd::encoding::compress_to_vec( + let rust_compressed = structured_zstd::encoding::compress_slice_to_vec( &scenario.bytes[..], level.rust_level, ); @@ -230,7 +230,7 @@ fn bench_compress(c: &mut Criterion) { group.bench_function("pure_rust", |b| { b.iter(|| { - black_box(structured_zstd::encoding::compress_to_vec( + black_box(structured_zstd::encoding::compress_slice_to_vec( &scenario.bytes[..], level.rust_level, )) @@ -250,8 +250,10 @@ fn bench_decompress(c: &mut Criterion) { let emit_reports = emit_reports_enabled(); for scenario in benchmark_scenarios_cached().iter() { for level in supported_levels_filtered() { - let rust_compressed = - structured_zstd::encoding::compress_to_vec(&scenario.bytes[..], level.rust_level); + let rust_compressed = structured_zstd::encoding::compress_slice_to_vec( + &scenario.bytes[..], + level.rust_level, + ); let ffi_compressed = ffi_encode_to_vec(&scenario.bytes[..], level.ffi_level); let expected_len = scenario.len(); bench_decompress_source( diff --git a/zstd/benches/compare_ffi_memory.rs b/zstd/benches/compare_ffi_memory.rs index d519dbc3..8f543369 100644 --- a/zstd/benches/compare_ffi_memory.rs +++ b/zstd/benches/compare_ffi_memory.rs @@ -421,7 +421,10 @@ fn main() { for level in supported_levels_filtered() { // Compress let (rust_compressed, rust_peak) = measure_peak(|| { - structured_zstd::encoding::compress_to_vec(&scenario.bytes[..], level.rust_level) + structured_zstd::encoding::compress_slice_to_vec( + &scenario.bytes[..], + level.rust_level, + ) }); let (ffi_compressed, ffi_peak) = measure_peak(|| ffi_encode(&scenario.bytes[..], level.ffi_level)); diff --git a/zstd/src/encoding/blocks/compressed.rs b/zstd/src/encoding/blocks/compressed.rs index deb8fc3b..4920bcba 100644 --- a/zstd/src/encoding/blocks/compressed.rs +++ b/zstd/src/encoding/blocks/compressed.rs @@ -330,19 +330,19 @@ fn encode_block_parts_with_sequence_scratch( // Choose the tables let ll_mode = choose_table( state.fse_tables.ll_previous.as_ref(), - &state.fse_tables.ll_default, + state.fse_tables.ll_default, sequences.iter().map(|seq| encode_literal_length(seq.ll).0), 9, ); let ml_mode = choose_table( state.fse_tables.ml_previous.as_ref(), - &state.fse_tables.ml_default, + state.fse_tables.ml_default, sequences.iter().map(|seq| encode_match_len(seq.ml).0), 9, ); let of_mode = choose_table( state.fse_tables.of_previous.as_ref(), - &state.fse_tables.of_default, + state.fse_tables.of_default, sequences.iter().map(|seq| encode_offset(seq.of).0), 8, ); @@ -540,26 +540,26 @@ fn estimate_sequences_section_bytes( // internally, identical decision path. let ll_mode = choose_table( fse_tables.ll_previous.as_ref(), - &fse_tables.ll_default, + fse_tables.ll_default, sequences.iter().map(|seq| encode_literal_length(seq.ll).0), 9, ); let ml_mode = choose_table( fse_tables.ml_previous.as_ref(), - &fse_tables.ml_default, + fse_tables.ml_default, sequences.iter().map(|seq| encode_match_len(seq.ml).0), 9, ); let of_mode = choose_table( fse_tables.of_previous.as_ref(), - &fse_tables.of_default, + fse_tables.of_default, sequences.iter().map(|seq| encode_offset(seq.of).0), 8, ); - let ll_bits_chosen = fse_section_bits_for_mode(&ll_mode, ll_counts, &fse_tables.ll_default); - let ml_bits_chosen = fse_section_bits_for_mode(&ml_mode, ml_counts, &fse_tables.ml_default); - let of_bits_chosen = fse_section_bits_for_mode(&of_mode, of_counts, &fse_tables.of_default); + let ll_bits_chosen = fse_section_bits_for_mode(&ll_mode, ll_counts, fse_tables.ll_default); + let ml_bits_chosen = fse_section_bits_for_mode(&ml_mode, ml_counts, fse_tables.ml_default); + let of_bits_chosen = fse_section_bits_for_mode(&of_mode, of_counts, fse_tables.of_default); let ll_table_desc_bytes = mode_table_description_bytes(&ll_mode); let ml_table_desc_bytes = mode_table_description_bytes(&ml_mode); @@ -822,12 +822,15 @@ fn encode_raw_sequences_into( } fn clone_fse_tables(fse_tables: &FseTables) -> FseTables { + // `*_default` fields are `&'static FSETable` (pointer-sized + // copies, cache-backed); only the `*_previous` per-frame tables + // need a deep clone. FseTables { - ll_default: fse_tables.ll_default.clone(), + ll_default: fse_tables.ll_default, ll_previous: fse_tables.ll_previous.clone(), - ml_default: fse_tables.ml_default.clone(), + ml_default: fse_tables.ml_default, ml_previous: fse_tables.ml_previous.clone(), - of_default: fse_tables.of_default.clone(), + of_default: fse_tables.of_default, of_previous: fse_tables.of_previous.clone(), } } @@ -1252,9 +1255,9 @@ fn encode_sequences( let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll); let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of); let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml); - let ll_table = mode_table(ll_mode, &defaults.ll_default); - let ml_table = mode_table(ml_mode, &defaults.ml_default); - let of_table = mode_table(of_mode, &defaults.of_default); + let ll_table = mode_table(ll_mode, defaults.ll_default); + let ml_table = mode_table(ml_mode, defaults.ml_default); + let of_table = mode_table(of_mode, defaults.of_default); let mut ll_state = ll_table.map(|table| table.start_state(ll_code)); let mut ml_state = ml_table.map(|table| table.start_state(ml_code)); let mut of_state = of_table.map(|table| table.start_state(of_code)); @@ -1707,34 +1710,34 @@ mod tests { ); assert!(tables_match( - previous_table(fse_tables.ll_previous.as_ref(), &fse_tables.ll_default).unwrap(), - &fse_tables.ll_default + previous_table(fse_tables.ll_previous.as_ref(), fse_tables.ll_default).unwrap(), + fse_tables.ll_default )); assert!(tables_match( - previous_table(fse_tables.ml_previous.as_ref(), &fse_tables.ml_default).unwrap(), - &fse_tables.ml_default + previous_table(fse_tables.ml_previous.as_ref(), fse_tables.ml_default).unwrap(), + fse_tables.ml_default )); assert!(tables_match( - previous_table(fse_tables.of_previous.as_ref(), &fse_tables.of_default).unwrap(), - &fse_tables.of_default + previous_table(fse_tables.of_previous.as_ref(), fse_tables.of_default).unwrap(), + fse_tables.of_default )); let sample_codes = [0u8, 1u8]; let ll_repeat = choose_table( fse_tables.ll_previous.as_ref(), - &fse_tables.ll_default, + fse_tables.ll_default, sample_codes.iter().copied(), 9, ); let ml_repeat = choose_table( fse_tables.ml_previous.as_ref(), - &fse_tables.ml_default, + fse_tables.ml_default, sample_codes.iter().copied(), 9, ); let of_repeat = choose_table( fse_tables.of_previous.as_ref(), - &fse_tables.of_default, + fse_tables.of_default, sample_codes.iter().copied(), 8, ); @@ -1751,7 +1754,7 @@ mod tests { fse_tables.ll_previous = Some(PreviousFseTable::Custom(Box::new(custom))); let before = core::ptr::from_ref( - previous_table(fse_tables.ll_previous.as_ref(), &fse_tables.ll_default).unwrap(), + previous_table(fse_tables.ll_previous.as_ref(), fse_tables.ll_default).unwrap(), ); remember_last_used_tables( @@ -1762,7 +1765,7 @@ mod tests { ); let after = core::ptr::from_ref( - previous_table(fse_tables.ll_previous.as_ref(), &fse_tables.ll_default).unwrap(), + previous_table(fse_tables.ll_previous.as_ref(), fse_tables.ll_default).unwrap(), ); assert_eq!(before, after); @@ -1777,7 +1780,7 @@ mod tests { let fse_tables = FseTables::new(); let mode = choose_table( None, - &fse_tables.ll_default, + fse_tables.ll_default, core::iter::repeat_n(0u8, 32), 9, ); diff --git a/zstd/src/encoding/dfast/mod.rs b/zstd/src/encoding/dfast/mod.rs index 1485afd3..fb6d56c9 100644 --- a/zstd/src/encoding/dfast/mod.rs +++ b/zstd/src/encoding/dfast/mod.rs @@ -16,10 +16,10 @@ use core::convert::TryInto; use super::Sequence; use super::blocks::encode_offset_with_history; -use super::incompressible::{block_looks_incompressible, block_looks_incompressible_strict}; +use super::incompressible::block_looks_incompressible; use super::match_generator::{ DFAST_EMPTY_SLOT, DFAST_HASH_BITS, DFAST_INCOMPRESSIBLE_SKIP_STEP, DFAST_LOCAL_SKIP_TRIGGER, - DFAST_MAX_SKIP_STEP, DFAST_MIN_MATCH_LEN, DFAST_REBASE_GUARD_BAND, DFAST_SEARCH_DEPTH, + DFAST_MAX_SKIP_STEP, DFAST_MIN_MATCH_LEN, DFAST_REBASE_GUARD_BAND, DFAST_SHORT_HASH_BITS_DELTA, DFAST_SHORT_HASH_LOOKAHEAD, DFAST_SKIP_STEP_GROWTH_INTERVAL, DFAST_TARGET_LEN, MIN_WINDOW_LOG, }; use super::match_table::helpers::{ @@ -29,9 +29,38 @@ use super::match_table::helpers::{ use super::match_table::storage::check_stream_abs_headroom; use super::opt::types::MatchCandidate; +/// Donor `HASH_READ_SIZE` (`zstd_compress_internal.h`): the largest probe +/// width any hash / equality check in the dfast hot path reads at once. +/// Loop guards must stop scanning when fewer than `HASH_READ_SIZE` bytes +/// remain ahead of the probe cursor, matching donor `ilimit = iend - +/// HASH_READ_SIZE`. The `DFAST_MIN_MATCH_LEN = 5` floor is the match +/// acceptance threshold, NOT a safe loop bound — using it as the loop +/// guard reads up to 3 bytes past the live history end and is UB on a +/// raw pointer load. +const HASH_READ_SIZE: usize = 8; + +/// Rep-extension minimum match length. The upstream zstd 1.5.7 +/// reference (`zstd_double_fast.c:191`, rep1 emit +/// `ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4`) accepts 4-byte +/// rep hits even though hash-search matches require 5 bytes +/// (`DFAST_MIN_MATCH_LEN`) — rep coding has no on-wire offset cost, +/// so a 4-byte rep is a net win over re-running the hash probe. Both +/// the fast-loop inline rep1 peek and the post-match +/// `extend_with_repcode_after_match` chain must gate on this floor; +/// using the hash-search floor on either site silently drops 4-byte +/// rep emissions that upstream produces. +const DFAST_REP_MIN_MATCH_LEN: usize = 4; + pub(crate) struct DfastMatchGenerator { pub(crate) max_window_size: usize, - pub(crate) window: VecDeque>, + /// Per-block length queue. Previously held the raw input + /// `VecDeque>` for each appended block — that duplicated + /// every byte in `history`, doubling the input footprint relative + /// to donor on the hot path. Now stores only the lengths so the + /// matcher can still pop blocks block-by-block on eviction + /// (advancing `history_start` by each pop) while the actual byte + /// storage lives once, in `history`. + pub(crate) window_blocks: VecDeque, pub(crate) window_size: usize, // We keep a contiguous searchable history to avoid rebuilding and reseeding // the matcher state from disjoint block buffers on every block. @@ -39,17 +68,20 @@ pub(crate) struct DfastMatchGenerator { pub(crate) history_start: usize, pub(crate) history_abs_start: usize, pub(crate) offset_hist: [u32; 3], - // Storage: `[u32; DFAST_SEARCH_DEPTH]` per bucket. Each slot holds - // a +1-biased relative position (`(abs_pos - position_base + 1) as - // u32`); the empty-slot sentinel `DFAST_EMPTY_SLOT = 0` is - // therefore never a real value. Trades 4 bytes per slot (vs the - // previous `usize`) — at `DFAST_SEARCH_DEPTH = 4` that's 16 bytes - // per bucket vs the prior 32 bytes, halving the hash-table - // footprint. Combined with the donor-parity `hashLog = 17` ceiling - // this brings the two-table memory cost from 64 MiB to ~4 MiB on - // 1 MiB inputs — close to donor's 768 KiB for the same level. - pub(crate) short_hash: Vec<[u32; DFAST_SEARCH_DEPTH]>, - pub(crate) long_hash: Vec<[u32; DFAST_SEARCH_DEPTH]>, + // Storage: single `u32` per bucket — donor-parity overwrite-on- + // collision. Each slot holds a +1-biased relative position + // (`(abs_pos - position_base + 1) as u32`); `DFAST_EMPTY_SLOT = 0` + // is therefore never a real value. The two tables are sized + // independently: `long_hash` (8-byte hash) uses `long_hash_bits` + // (donor `hashTable`); `short_hash` (4-byte hash) uses + // `short_hash_bits` = `long - 1` (donor `chainTable` for dfast). + // Donor parity at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`. The + // ratio loss from single-slot is compensated by donor's + // `_search_next_long` retry — after a short-hash hit, the search + // probes long_hash at `ip + 1` and picks the longer of the two + // (see `hash_candidate`, invoked via `best_match`, for the retry). + pub(crate) short_hash: Vec, + pub(crate) long_hash: Vec, /// Absolute position whose `(abs_pos - position_base + 1)` slot /// encoding evaluates to `1`. Advances only via [`Self::reduce`] /// when an insert is about to overflow the u32 window — the @@ -60,11 +92,19 @@ pub(crate) struct DfastMatchGenerator { /// a single matcher instance. Donor parity: `ZSTD_window_reduce` /// (`zstd_compress_internal.h`). pub(crate) position_base: usize, - pub(crate) hash_bits: usize, - /// Cached fastpath kernel for `hash_mix_u64`. Resolved once at `new()` - /// and reused on every `hash_index` call so we skip the per-call - /// `OnceLock` atomic load that `dispatch_hash_mix_u64` would pay. - pub(crate) hash_kernel: crate::encoding::fastpath::FastpathKernel, + /// Long-hash table bit-width — `long_hash.len() == 1 << + /// long_hash_bits`. Donor parity with `cParams.hashLog` (17 for + /// Level 3 large input, 16 for Level 2; see `clevels.h`). + pub(crate) long_hash_bits: usize, + /// Short-hash table bit-width — `short_hash.len() == 1 << + /// short_hash_bits`. Default is `long_hash_bits - + /// DFAST_SHORT_HASH_BITS_DELTA`, donor parity with + /// `cParams.chainLog` for dfast levels (one bit smaller than the + /// long hash). Halves the short-table footprint without losing + /// measurable ratio — the 4-byte short hash overwrites less + /// frequently than the 8-byte long hash on average, so the + /// smaller bucket count is the donor-correct sizing. + pub(crate) short_hash_bits: usize, pub(crate) use_fast_loop: bool, // Lazy match lookahead depth (internal tuning parameter). pub(crate) lazy_depth: u8, @@ -82,7 +122,7 @@ impl DfastMatchGenerator { pub(crate) fn new(max_window_size: usize) -> Self { Self { max_window_size, - window: VecDeque::new(), + window_blocks: VecDeque::new(), window_size: 0, history: Vec::new(), history_start: 0, @@ -91,20 +131,32 @@ impl DfastMatchGenerator { short_hash: Vec::new(), long_hash: Vec::new(), position_base: 0, - hash_bits: DFAST_HASH_BITS, - hash_kernel: crate::encoding::fastpath::select_kernel(), + long_hash_bits: DFAST_HASH_BITS, + short_hash_bits: DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA, use_fast_loop: false, lazy_depth: 1, } } + /// Set both hash table sizes. `bits` is the long-hash bit count + /// (donor `cParams.hashLog`); the short hash is derived as + /// `bits - DFAST_SHORT_HASH_BITS_DELTA`, donor-correct for dfast + /// levels. Both clamps stay above `MIN_WINDOW_LOG` so very small + /// windows don't underflow. pub(crate) fn set_hash_bits(&mut self, bits: usize) { - let clamped = bits.clamp(MIN_WINDOW_LOG as usize, DFAST_HASH_BITS); - if self.hash_bits != clamped { - self.hash_bits = clamped; - self.short_hash = Vec::new(); + let min_bits = MIN_WINDOW_LOG as usize; + let long_clamped = bits.clamp(min_bits, DFAST_HASH_BITS); + let short_clamped = long_clamped + .saturating_sub(DFAST_SHORT_HASH_BITS_DELTA) + .max(min_bits); + if self.long_hash_bits != long_clamped { + self.long_hash_bits = long_clamped; self.long_hash = Vec::new(); } + if self.short_hash_bits != short_clamped { + self.short_hash_bits = short_clamped; + self.short_hash = Vec::new(); + } } /// Encode an absolute position into a u32 slot value @@ -161,23 +213,21 @@ impl DfastMatchGenerator { /// Advance `position_base` by the same amount so future inserts /// continue from the rebased origin. fn reduce(&mut self, reducer: u32) { - let shift_buckets = |buckets: &mut Vec<[u32; DFAST_SEARCH_DEPTH]>| { - for bucket in buckets.iter_mut() { - for slot in bucket.iter_mut() { - *slot = if *slot <= reducer { - DFAST_EMPTY_SLOT - } else { - *slot - reducer - }; - } + let shift_slots = |slots: &mut [u32]| { + for slot in slots.iter_mut() { + *slot = if *slot <= reducer { + DFAST_EMPTY_SLOT + } else { + *slot - reducer + }; } }; - shift_buckets(&mut self.short_hash); - shift_buckets(&mut self.long_hash); + shift_slots(&mut self.short_hash); + shift_slots(&mut self.long_hash); self.position_base += reducer as usize; } - pub(crate) fn reset(&mut self, mut reuse_space: impl FnMut(Vec)) { + pub(crate) fn reset(&mut self) { self.window_size = 0; self.history.clear(); self.history_start = 0; @@ -185,48 +235,175 @@ impl DfastMatchGenerator { self.position_base = 0; self.offset_hist = [1, 4, 8]; if !self.short_hash.is_empty() { - self.short_hash.fill([DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]); - self.long_hash.fill([DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]); - } - for mut data in self.window.drain(..) { - data.resize(data.capacity(), 0); - reuse_space(data); + self.short_hash.fill(DFAST_EMPTY_SLOT); + self.long_hash.fill(DFAST_EMPTY_SLOT); } + // No Vec blocks to recycle: `add_data` returns each input + // Vec to the caller eagerly via its own `reuse_space`, and the + // history Vec is owned solely by the matcher. There is nothing + // for an outer pool helper to do at reset time, so the dfast + // signature does not take one (HC / Row do because they hold + // per-block input Vecs internally; the dispatcher in + // `match_generator.rs` resolves the per-backend shape). + self.window_blocks.clear(); } + /// Slice of bytes from the most recently appended block. Returns + /// the trailing `last_block_len` bytes of `history`, or an empty + /// slice if no block has been ingested yet. + /// + /// Mirrors the inline gate pattern used by `skip_matching` / + /// `skip_matching_dense` / `start_matching` / `emit_candidate` / + /// `emit_trailing_literals`: read + /// `window_blocks.back().copied().unwrap_or(0)` and slice the + /// trailing `last_len` bytes (which is empty when `last_len == 0`). + /// All current external callers — streaming encoder, block + /// compressor, per-level helpers — invoke this only after at + /// least one `add_data`, but returning an empty slice on the + /// empty case keeps the trait surface aligned with the internal + /// usage and avoids a panic-vs-gate divergence that would + /// surprise a future refactor consolidating the call sites. pub(crate) fn get_last_space(&self) -> &[u8] { - self.window.back().unwrap().as_slice() + let last_len = self.window_blocks.back().copied().unwrap_or(0); + &self.history[self.history.len() - last_len..] } pub(crate) fn add_data(&mut self, data: Vec, mut reuse_space: impl FnMut(Vec)) { assert!(data.len() <= self.max_window_size); + // Run the headroom check first so the safety invariant + // (`history_abs_start + window_size + len + STREAM_ABS_HEADROOM + // <= usize::MAX`) is enforced at the function boundary, not + // hidden behind the empty-chunk short-circuit below. With + // `data.len() == 0` the check is a cheap no-op on the cumulative + // state today, but keeping the call here means the invariant + // doesn't depend on "empty data implies nothing changes" + // reasoning if a future change ever attaches side effects to + // the empty path. check_stream_abs_headroom(self.history_abs_start, self.window_size, data.len()); + // Empty chunks have nothing to record: pushing a `0` into + // `window_blocks` would let a streaming caller that flushes + // empty chunks grow the deque without bound (`window_size` + // stays unchanged so `trim_to_window` never has cause to + // evict the zero-length entries). Hand the Vec straight back + // to the pool and short-circuit. + // + // Side effect: this short-circuits BEFORE the eviction `while` + // loop below. If a caller shrinks `max_window_size` and then + // calls `add_data(vec![])` hoping to trigger trim, the trim + // won't fire here. Use `trim_to_window` directly for that + // case — it's the dedicated path for shedding retained bytes + // and now actually frees the prefix (via `split_off`) instead + // of leaving it pinned in the `history` allocation. + // + // In-tree caller audit (`grep -rn '\.commit_space(' src/encoding/`): + // every production path that reaches the driver's + // `commit_space` → `add_data` chain originates in + // `levels/fastest.rs`'s block emitter, which produces + // non-empty blocks gated by `should_emit_raw_fast_path` / + // RLE-detect on the source bytes — none of them pass + // `Vec::new()`. The streaming encoder's block-sourcing loop + // also filters empty reads before forwarding to the matcher. + // Tests are the only callers that exercise the empty path + // explicitly, and the regression covers eviction-driven trim + // semantics through `trim_to_window` directly. So this + // behaviour change is observable only by a hypothetical + // future caller that relies on `add_data(empty)` as a + // side-effecting trim trigger — and we deliberately want + // such a caller to use `trim_to_window` instead. + if data.is_empty() { + reuse_space(data); + return; + } while self.window_size + data.len() > self.max_window_size { - let removed = self.window.pop_front().unwrap(); - self.window_size -= removed.len(); - self.history_start += removed.len(); - self.history_abs_start += removed.len(); - reuse_space(removed); + let removed_len = self.window_blocks.pop_front().unwrap(); + self.window_size -= removed_len; + self.history_start += removed_len; + self.history_abs_start += removed_len; } self.compact_history(); self.history.extend_from_slice(&data); self.window_size += data.len(); - self.window.push_back(data); + self.window_blocks.push_back(data.len()); + // Eager Vec recycle: the only purpose of holding the input Vec + // was to return it to the caller's pool on eviction. Now that + // `history` owns the bytes, hand the Vec back immediately so + // the pool grows on first add instead of waiting for window + // overflow. + reuse_space(data); } - pub(crate) fn trim_to_window(&mut self, mut reuse_space: impl FnMut(Vec)) { + /// Trim retained blocks until the window fits `max_window_size`. + /// + /// Unlike `MatchGenerator::trim_to_window`, + /// `RowMatchGenerator::trim_to_window`, and + /// `HcMatchGenerator::trim_to_window`, this backend does NOT take a + /// `reuse_space` callback because it doesn't retain per-block + /// `Vec` storage to recycle (history is the sole byte buffer + /// and `add_data` returns each input Vec eagerly). The dispatcher + /// in `match_generator.rs` knows the variant and threads the right + /// signature; callers needing the eviction byte count derive it + /// from the `window_size` delta before/after this call. + /// + /// The explicit-trim-before-idle path is the reason this helper + /// exists: a caller that trims to shed memory before a long + /// quiescent period must see the resident size drop immediately, + /// not "eventually, on the next ingest". + /// + /// `compact_history` is NOT the right tool for that — it uses + /// `Vec::drain(..history_start)`, which moves elements down in + /// the existing allocation and leaves capacity untouched (per + /// `Vec` docs, only `shrink_to_fit` releases capacity). So even + /// when compact ran, the original buffer stayed alive. Instead, + /// rebuild `history` via `split_off`: it allocates a fresh + /// buffer sized to the retained suffix, and the assignment + /// drops the original (full-capacity) buffer. On a normal block + /// loop `add_data` will compact again the next iter — the cost + /// is one extra realloc on the trim boundary in exchange for + /// actually shedding the prefix to the system allocator. + /// + /// Unlike `HashChainTable::trim_to_window` / + /// `RowMatcher::trim_to_window` this signature deliberately takes + /// no `reuse_space` callback — Dfast stores its raw bytes in the + /// single contiguous `history` buffer (no per-block `Vec` to + /// recycle), releases the dead prefix via `split_off`, and + /// surfaces eviction byte count to the dispatcher via the + /// `window_size` delta rather than a callback. A uniform-signature + /// trim helper would force every call site to monomorphize an + /// `impl FnMut(Vec)` that is documented to never fire, paying + /// codegen + inlining cost on the cold path for zero behaviour + /// difference; the dispatcher in `match_generator.rs` already + /// branches per backend (matchers diverge on `reset` and a few + /// other lifecycle calls) so adding one more per-backend arm is + /// free. + pub(crate) fn trim_to_window(&mut self) { while self.window_size > self.max_window_size { - let removed = self.window.pop_front().unwrap(); - self.window_size -= removed.len(); - self.history_start += removed.len(); - self.history_abs_start += removed.len(); - reuse_space(removed); + let removed_len = self.window_blocks.pop_front().unwrap(); + self.window_size -= removed_len; + self.history_start += removed_len; + self.history_abs_start += removed_len; + } + if self.history_start != 0 { + // `split_off` returns the suffix in a fresh allocation; + // the original Vec (still owning [..history_start]) is + // dropped on the assignment below, releasing the dead + // prefix back to the allocator. + self.history = self.history.split_off(self.history_start); + self.history_start = 0; } } pub(crate) fn skip_matching(&mut self, incompressible_hint: Option) { self.ensure_hash_tables(); - let current_len = self.window.back().unwrap().len(); + let current_len = self.window_blocks.back().copied().unwrap_or(0); + if current_len == 0 { + // `add_data` short-circuits on empty input and does NOT push + // a zero-length entry onto `window_blocks`. A caller that + // invokes skip-matching after a streaming flush of an empty + // chunk would otherwise re-seed the previous block's + // retained tail on every empty write. Mirror the gate that + // `start_matching` already uses. + return; + } let current_abs_start = self.history_abs_start + self.window_size - current_len; let current_abs_end = current_abs_start + current_len; let tail_start = current_abs_start.saturating_sub(Self::BOUNDARY_DENSE_TAIL_LEN); @@ -260,7 +437,15 @@ impl DfastMatchGenerator { pub(crate) fn skip_matching_dense(&mut self) { self.ensure_hash_tables(); - let current_len = self.window.back().unwrap().len(); + let current_len = self.window_blocks.back().copied().unwrap_or(0); + if current_len == 0 { + // Same gate as `skip_matching` and `start_matching`: empty + // chunks fed through `add_data` no longer push a block + // entry, so a streaming caller that flushes empty chunks + // would otherwise re-seed the retained tail on every + // empty write. + return; + } let current_abs_start = self.history_abs_start + self.window_size - current_len; let current_abs_end = current_abs_start + current_len; let backfill_start = current_abs_start @@ -275,7 +460,7 @@ impl DfastMatchGenerator { pub(crate) fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) { self.ensure_hash_tables(); - let current_len = self.window.back().unwrap().len(); + let current_len = self.window_blocks.back().copied().unwrap_or(0); if current_len == 0 { return; } @@ -308,10 +493,17 @@ impl DfastMatchGenerator { // the dynamic bound is `current_len` itself — the `while // pos + DFAST_MIN_MATCH_LEN <= current_len` guard keeps // every offset within that bound regardless of how the - // caller sized `max_window_size`. In production this also - // happens to be `≤ HC_BLOCKSIZE_MAX (128 KiB)` because the - // frame compressor never hands out larger blocks, but the - // safety argument above does not rely on that limit. + // caller sized `max_window_size`. The general path's + // `best_match` → `hash_candidate` chain uses SAFE slicing + // (`data[..8].try_into()`) which panics on a short slice + // rather than reading uninitialised bytes — UB-free even + // when this guard lets `pos` land within 3 bytes of the + // block tail. The fast loop has stricter `+ HASH_READ_SIZE` + // guards because it does raw-pointer `read_unaligned` for + // speed; see `start_matching_fast_loop`. In production this + // also happens to be `≤ HC_BLOCKSIZE_MAX (128 KiB)` because + // the frame compressor never hands out larger blocks, but + // the safety argument above does not rely on that limit. // 2. Absolute-position arithmetic (`current_abs_start + pos`): // `current_abs_start` is the frame-lifetime cursor and // advances with total bytes processed, NOT with the @@ -382,62 +574,149 @@ impl DfastMatchGenerator { current_len: usize, handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>), ) { - let block_is_strict_incompressible = self - .block_looks_incompressible_strict(current_abs_start, current_abs_start + current_len); + // Behaviour change vs the pre-refactor `start_matching_general`: + // this fast loop deliberately drops the strict-incompressible + // early-skip path (the `block_looks_incompressible_strict` short + // circuit + `miss_run` / `DFAST_LOCAL_SKIP_TRIGGER` thresholding). + // The step ramp is now driven purely by distance traveled + // (`DFAST_SKIP_STEP_GROWTH_INTERVAL = 64`, donor parity except + // donor uses 256 — see "Donor-deviation audit" in the PR body), + // so blocks the strict gate used to bail out of early now scan + // through the standard ramp. `block_looks_incompressible_strict` + // is still used by `levels/fastest.rs` for the Fastest preset + // and by `incompressible.rs` unit tests, so the helper itself + // stays. + // + // Donor outer/inner structure (`zstd_double_fast.c:167-322`): + // * outer `while(1)` runs once per match-found-and-stored; + // * inner `do { ... } while (ip1 <= ilimit)` carries `hl0`, + // `idxl0` between iterations, and precomputes `hl1` so the + // next iter's long hash is reused (one long-hash compute per + // two positions scanned, not per position). + // We mirror that here. Per-frame invariants are hoisted before + // the outer loop; mutable state (`position_base`, `history_abs_start`) + // is re-snapshotted inside the outer loop because `emit_candidate` + // → `insert_positions` → `ensure_room_for` can advance the rebase + // base mid-frame. + // + // Rebase BEFORE any inner-loop slot pack. The hot loop computes + // `packed_curr = (abs_ip0 - position_base) as u32 + 1` and writes + // it straight into the hash tables, bypassing `insert_position` + // (which is where `ensure_room_for` normally fires). On a long + // stream of all-miss / non-hashable blocks the matcher can advance + // `current_abs_start` arbitrarily far without any per-byte insert, + // so `position_base` may be stale by `> u32::MAX`. The first + // fast-loop block after that would silently truncate `packed_curr` + // and poison both hash tables. The guard band in `ensure_room_for` + // (`DFAST_REBASE_GUARD_BAND`) covers the entire current block's + // worth of positions, so a single call at function entry suffices. + // + // Raw-pointer aliasing invariant. The inner loop caches + // `short_hash_ptr = self.short_hash.as_mut_ptr()` and + // `long_hash_ptr = self.long_hash.as_mut_ptr()` (and a + // `history_base_ptr` for the byte-buffer reads), then over + // the rest of the function does: + // + // * raw writes / reads via `*short_hash_ptr.add(...) = + // packed_curr`, `*long_hash_ptr.add(...) = packed_curr`, + // and `*long_hash_ptr.add(hl1_idx)`; + // * `&self`-shared reads of `self.offset_hist[0]` for the + // rep1 peek; + // * `&self`-shared slice reads of `self.history` (`let + // concat = &self.history[history_start_offset..]`) for + // `extend_backwards_shared` invocations. + // + // This is sound under both Stacked Borrows and Tree Borrows + // because the three fields touched (`short_hash`, `long_hash`, + // `history`, `offset_hist`) are physically disjoint + // allocations — `Vec`/`Vec` each own their own heap + // buffer, and `offset_hist` is an inline `[u32; 3]` inside + // the struct. A raw pointer derived from one field's Vec data + // has provenance over that field only, so the subsequent + // `&self` reads of sibling fields don't reborrow through the + // raw pointer's provenance tree. + // + // CRITICAL: this invariant must be preserved by any future + // refactor that adds method calls inside the loop body. A + // call that takes `&mut self` (e.g., + // `self.ensure_hash_tables()`, `self.insert_position(...)`) + // would reborrow the tables and invalidate the cached raw + // pointers — every such call must happen OUTSIDE the + // `'outer: loop` (as `ensure_room_for` does, hoisted to the + // function preamble above) or be followed by a fresh + // `as_mut_ptr()` reload of both `short_hash_ptr` / + // `long_hash_ptr`. The outer-loop body already re-snapshots + // `history_base_ptr` per iteration for exactly this reason + // (`emit_candidate` → `insert_positions` may grow `history` + // and trigger a realloc), so the same re-snapshot discipline + // applies to the hash-table pointers. + // + // `current_len > 0` is a precondition of this helper: every + // caller (`start_matching`, `skip_matching`, `skip_matching_dense`) + // returns early at `current_len == 0`. Encode it as a hard + // assert so a future caller that forgets the gate fails loudly + // in debug rather than wrap-underflowing the `- 1` below; the + // rebase is unconditional in release builds since the + // precondition holds. + debug_assert!(current_len > 0, "fast_loop precondition: current_len > 0"); + self.ensure_room_for(current_abs_start + current_len - 1); + const PRIME: u64 = 0xCF1BBCDCB7A56463_u64; + let short_shift = 64 - self.short_hash_bits; + let long_shift = 64 - self.long_hash_bits; let mut pos = 1usize; let mut literals_start = 0usize; - let mut skip_step = 1usize; - let mut next_skip_growth_pos = DFAST_SKIP_STEP_GROWTH_INTERVAL; - let mut miss_run = 0usize; - // Loop invariants (two distinct bounds): - // - // 1. Block-local arithmetic (`ip0..ip3 = pos + N` for small - // `N`, `pos + skip_step` with `skip_step <= - // DFAST_MAX_SKIP_STEP`): the dynamic bound is `current_len` - // itself — the `while pos + DFAST_MIN_MATCH_LEN <= - // current_len` guard keeps every offset within that bound - // regardless of how the caller sized `max_window_size`. - // In production this also happens to be `≤ - // HC_BLOCKSIZE_MAX (128 KiB)` because the frame compressor - // never hands out larger blocks, but the safety argument - // above does not rely on that limit. - // 2. Absolute-position arithmetic (`current_abs_start + ip0`, - // `current_abs_start + ip2`, etc.): `current_abs_start` is - // the frame-lifetime cursor and advances with total bytes - // processed, NOT with the retained window size. A long - // streaming encode on i686 can push `current_abs_start + - // ipN` past `usize::MAX` even though memory stays bounded - // by `window_size`. The runtime enforcement lives in - // `DfastMatchGenerator::add_data` via - // `check_stream_abs_headroom`: every ingest fails fast if - // cumulative input would advance the cursor within - // `STREAM_ABS_HEADROOM` of `usize::MAX`, which keeps the - // raw `current_abs_start + ipN` arithmetic below - // `usize::MAX` for every position - // this loop ever observes. - while pos + DFAST_MIN_MATCH_LEN <= current_len { - let ip0 = pos; - let ip1 = ip0 + 1; - let ip2 = ip0 + 2; - let ip3 = ip0 + 3; - - let abs_ip0 = current_abs_start + ip0; - let lit_len_ip0 = ip0 - literals_start; - - if ip2 + DFAST_MIN_MATCH_LEN <= current_len { - let abs_ip2 = current_abs_start + ip2; - let lit_len_ip2 = ip2 - literals_start; - if let Some(rep) = self.repcode_candidate(abs_ip2, lit_len_ip2) - && rep.start >= current_abs_start + literals_start - && rep.start <= abs_ip2 + + 'outer: loop { + // Outer-iter precondition: at least `HASH_READ_SIZE = 8` bytes + // ahead of `pos` so the unconditional 8-byte `u64` load below + // is in-bounds for the live history buffer. `DFAST_MIN_MATCH_LEN + // = 5` is the match acceptance threshold and is NOT a safe + // load bound — using it here read up to 3 bytes past + // `history.len()` on tiny blocks (CI fuzz `interop` crash, + // 7-byte input). + // NOTE: when this guard fires on the very first outer-iter + // (tiny block, `current_len < 9`), we `break 'outer` BEFORE + // `tail_seed_anchor` is even declared. The post-loop + // `seed_remaining_hashable_starts(.., pos)` then runs with + // `pos` at its initial value (1 on first frame, or the + // post-match cursor from a previous outer iter); that's the + // correct tail seed for tiny blocks — `seed_pos = pos.min( + // current_len).min(boundary_tail_start)` plus the + // `seed_pos + DFAST_SHORT_HASH_LOOKAHEAD <= current_len` + // guard inside the seeder keeps every insert in-bounds. + if pos + HASH_READ_SIZE > current_len { + break 'outer; + } + let mut step = 1usize; + let mut next_step_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL; + let mut ip0 = pos; + let mut ip1 = ip0 + step; + // Same `HASH_READ_SIZE` rationale for `ip1`: the inner loop + // pre-loads 8 bytes at `concat_idx1` for the `hl1` precompute + // and the `_search_next_long` retry, so `ip1 + 8 <= current_len` + // must hold before we enter. If `ip0` is still hashable but + // `ip1` is not (boundary case `pos == current_len - 8`), we + // still want to probe `ip0` — skipping it would leave the + // last hashable position to `seed_remaining_hashable_starts`, + // which inserts but does NOT search, and that drops real + // matches in the tail window vs the upstream reference + // (whose single-cursor loop probes every position p with + // `p + HASH_READ_SIZE <= iend`). Handle the boundary inline + // before exiting: a single-cursor probe at `ip0` (rep peek + // and `_search_next_long` retry both depend on `ip1` so + // they're skipped — donor accepts that exact tradeoff at + // the iend boundary). + if ip1 + HASH_READ_SIZE > current_len { + if let Some(committed) = + self.probe_tail_ip0_only(current_abs_start, current_len, ip0, literals_start) { let start = self.emit_candidate( current_abs_start, &mut literals_start, - rep, + committed, handle_sequence, ); - pos = start + rep.match_len; + pos = start + committed.match_len; pos = self.extend_with_repcode_after_match( current_abs_start, current_len, @@ -445,55 +724,396 @@ impl DfastMatchGenerator { &mut literals_start, handle_sequence, ); - skip_step = 1; - next_skip_growth_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL; - miss_run = 0; - continue; + continue 'outer; } + break 'outer; } - let best = self.best_match(abs_ip0, lit_len_ip0); - if let Some(candidate) = best { - let start = self.emit_candidate( - current_abs_start, - &mut literals_start, - candidate, - handle_sequence, - ); - pos = start + candidate.match_len; - pos = self.extend_with_repcode_after_match( - current_abs_start, - current_len, - pos, - &mut literals_start, - handle_sequence, + // Re-read every per-frame-mutable cursor here — `emit_candidate` + // in the previous outer iteration may have triggered a rebase. + let history_abs_start = self.history_abs_start; + let position_base = self.position_base; + let history_start_offset = self.history_start; + let history_base_ptr = self.history.as_ptr(); + let short_hash_ptr = self.short_hash.as_mut_ptr(); + let long_hash_ptr = self.long_hash.as_mut_ptr(); + let concat_len = self.history.len() - history_start_offset; + + // Pre-compute long hash at ip0 ONCE per outer iter. + // `concat_idx = (current_abs_start + ip0) - history_abs_start` + // is the byte offset within `live_history`. + let mut hl0_idx; + let mut idxl0; + // SAFETY: `current_abs_start + ip0 >= history_abs_start` + // (`ip` is inside the current block, which is part of live + // history). The 8-byte unaligned load on + // `concat[concat_idx..]` is safe because the outer loop + // guards above enforce `ip0 + HASH_READ_SIZE <= current_len`, + // and `current_abs_start + current_len - history_abs_start + // <= concat_len` (live history contains the full current + // block plus any retained earlier blocks), giving + // `concat_idx + 8 <= concat_len`. The `debug_assert!` below + // makes that invariant explicit so a future refactor that + // touches eviction / `compact_history` / `trim_to_window` + // semantics catches a violation in tests instead of leaking + // through to ASan UB. + unsafe { + let concat_idx = (current_abs_start + ip0) - history_abs_start; + debug_assert!( + concat_idx + HASH_READ_SIZE <= concat_len, + "fast-loop 8-byte load OOB: concat_idx={} HASH_READ_SIZE={} concat_len={}", + concat_idx, + HASH_READ_SIZE, + concat_len, ); - skip_step = 1; - next_skip_growth_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL; - miss_run = 0; - } else { - self.insert_position(abs_ip0); - if ip1 + 4 <= current_len { - self.insert_position(current_abs_start + ip1); + let v8 = (history_base_ptr.add(history_start_offset + concat_idx) as *const u64) + .read_unaligned(); + hl0_idx = (v8.wrapping_mul(PRIME) >> long_shift) as usize; + idxl0 = *long_hash_ptr.add(hl0_idx); + } + + // Inner-loop exit shape. Every `break 'inner` produces a + // value of this type, so the type system enforces the + // previously implicit pairing between "did we commit a + // match?" and "where does the tail seeder pick up?": + // + // * `Committed(c)` — rep1 / long / short(+next_long retry) + // paths chose `c`; outer arm runs emit + rep-extension. + // * `Tail(seed)` — inner ran out of safe scan room at + // `seed = ip0` (first un-scanned position); outer arm + // hands `seed` to `seed_remaining_hashable_starts` so + // the tail seeder does not redundantly re-pack + // positions the fast loop already wrote (which is what + // restarting from outer-entry `pos` would do on a + // miss-only block — throwing away the skip-step win on + // incompressible data). + // + // Adding a new `break 'inner` variant now forces the + // author to pick a `InnerExit::…` variant explicitly; the + // previous `Option` + sibling `usize` + // pairing relied on a comment-block to flag the coupling. + enum InnerExit { + Committed(MatchCandidate), + Tail(usize), + } + + let inner_exit: InnerExit = 'inner: loop { + let abs_ip0 = current_abs_start + ip0; + let abs_ip1 = current_abs_start + ip1; + let lit_len_ip0 = ip0 - literals_start; + let lit_len_ip1 = ip1 - literals_start; + let packed_curr = ((abs_ip0 - position_base) as u32) + 1; + + let concat_idx0 = abs_ip0 - history_abs_start; + let concat_idx1 = abs_ip1 - history_abs_start; + + // Load 8 bytes at ip0 for both short (low 4) and long + // probe equality checks. We already used `v8_at_ip0` to + // compute hl0/idxl0 in the outer init / previous iter's + // carry; reload now (cheap unaligned read) so the + // `read_unaligned` is from the same offset the + // probe-eq below will use. + let v8_0 = unsafe { + (history_base_ptr.add(history_start_offset + concat_idx0) as *const u64) + .read_unaligned() + }; + let v4_0 = v8_0 & 0xFFFF_FFFF; + let hs0_idx = (v4_0.wrapping_mul(PRIME) >> short_shift) as usize; + let idxs0 = unsafe { *short_hash_ptr.add(hs0_idx) }; + + // Donor parity (`zstd_double_fast.c:187`): update BOTH + // tables at curr BEFORE checking matches. The benefit is + // for hash-table consumers, NOT the rep peek (rep at ip+1 + // reads `offset_hist[0]`, never the hash tables). The + // donor rationale is the long-hash retry path: the next + // inner iter's `idxl1` lookup (`hashLong[hl1_idx]`) can + // collide with this iter's `hl0_idx`, and writing curr + // first means a self-collision still resolves to a real + // match instead of the previous occupant. The short + // probe of the same iter and the `_search_next_long` + // retry at ip+1 are the other consumers that see the + // fresh write. + unsafe { + *long_hash_ptr.add(hl0_idx) = packed_curr; + *short_hash_ptr.add(hs0_idx) = packed_curr; } - if ip2 + 4 <= current_len { - self.insert_position(current_abs_start + ip2); + + // Donor parity (`zstd_double_fast.c:190`): inline rep1 + // peek at ip+1, 4-byte gate. Donor's hot path checks ONLY + // `offset_1` here (full 3-rep walk lives in lazy/btopt). + // Since the peek is at `ip+1` with `pos >= literals_start`, + // `lit_len_ip1 >= 1`, so `offset_hist[0]` is the donor's + // `offset_1`. The `repcode_candidate_shared` helper we used + // before walked all three offsets + did a full SIMD + // `common_prefix_len` per probe, paying ~3× the work for + // rep2/rep3 hits that the dfast fast path never benefits + // from (those wins live in the lazy/btopt strategies). + let rep1 = self.offset_hist[0] as usize; + if rep1 != 0 && rep1 <= abs_ip1 { + let cand_pos_r = abs_ip1 - rep1; + if cand_pos_r >= history_abs_start + && cand_pos_r >= current_abs_start + literals_start + { + let cand_idx_r = cand_pos_r - history_abs_start; + // 4-byte gate; full forward count only if it passes. + let cand4 = unsafe { + (history_base_ptr.add(history_start_offset + cand_idx_r) as *const u32) + .read_unaligned() + }; + let cur4 = unsafe { + (history_base_ptr.add(history_start_offset + concat_idx1) as *const u32) + .read_unaligned() + }; + if cand4 == cur4 { + let mut match_len = 4usize; + let max_fwd = concat_len.saturating_sub(concat_idx1 + 4); + unsafe { + let lhs = + history_base_ptr.add(history_start_offset + cand_idx_r + 4); + let rhs = + history_base_ptr.add(history_start_offset + concat_idx1 + 4); + let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr( + lhs, rhs, max_fwd, + ); + match_len += ext; + } + // Rep extensions use the 4-byte + // `DFAST_REP_MIN_MATCH_LEN` floor, NOT the + // hash-search `DFAST_MIN_MATCH_LEN = 5`. Rep + // coding has no on-wire offset cost, so the + // upstream reference accepts 4-byte rep + // hits; gating at 5 here would silently drop + // every 4-byte rep1 the upstream encoder + // produces, and leave the post-match + // `extend_with_repcode_after_match` chain + // (which uses 4) inconsistent with the peek. + if match_len >= DFAST_REP_MIN_MATCH_LEN { + let concat = &self.history[history_start_offset..]; + let rep_cand = extend_backwards_shared( + concat, + history_abs_start, + cand_pos_r, + abs_ip1, + match_len, + lit_len_ip1, + ); + break 'inner InnerExit::Committed(rep_cand); + } + } + } } - if ip3 + 4 <= current_len { - self.insert_position(current_abs_start + ip3); + + // Precompute hl1 (donor `_search_next_long` carry, line 197). + let v8_1 = unsafe { + (history_base_ptr.add(history_start_offset + concat_idx1) as *const u64) + .read_unaligned() + }; + let hl1_idx = (v8_1.wrapping_mul(PRIME) >> long_shift) as usize; + + // Long match check at ip0 with idxl0. 8-byte equality + // gate (`MEM_read64`) — if it passes, candidate is real. + if idxl0 != DFAST_EMPTY_SLOT { + let cand_pos = position_base + (idxl0 as usize) - 1; + if cand_pos >= history_abs_start && cand_pos < abs_ip0 { + let cand_idx = cand_pos - history_abs_start; + // SAFETY: same buffer/length bounds as v8_0 above. + let cand_v8 = unsafe { + (history_base_ptr.add(history_start_offset + cand_idx) as *const u64) + .read_unaligned() + }; + if cand_v8 == v8_0 { + // 8 bytes match; count forward + extend back. + let mut match_len = 8usize; + let max_fwd = concat_len.saturating_sub(concat_idx0 + 8); + // SAFETY: both ptrs at the same buffer; offsets + // verified above. `max_fwd` caps the scan to + // the live region. + unsafe { + let lhs = history_base_ptr.add(history_start_offset + cand_idx + 8); + let rhs = + history_base_ptr.add(history_start_offset + concat_idx0 + 8); + let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr( + lhs, rhs, max_fwd, + ); + match_len += ext; + } + let concat = &self.history[history_start_offset..]; + let cand = extend_backwards_shared( + concat, + history_abs_start, + cand_pos, + abs_ip0, + match_len, + lit_len_ip0, + ); + break 'inner InnerExit::Committed(cand); + } + } } - miss_run += 1; - if block_is_strict_incompressible || miss_run >= DFAST_LOCAL_SKIP_TRIGGER { - let skip_cap = DFAST_MAX_SKIP_STEP; - if pos >= next_skip_growth_pos { - skip_step = (skip_step + 1).min(skip_cap); - next_skip_growth_pos += DFAST_SKIP_STEP_GROWTH_INTERVAL; + + let idxl1 = unsafe { *long_hash_ptr.add(hl1_idx) }; + + // Short match check at ip0 with idxs0 — 4-byte gate + // ONLY (donor `zstd_double_fast.c:220`). Forward count + // and `_search_next_long` retry happen ONLY on hit. + if idxs0 != DFAST_EMPTY_SLOT { + let cand_pos_s = position_base + (idxs0 as usize) - 1; + if cand_pos_s >= history_abs_start && cand_pos_s < abs_ip0 { + let cand_idx_s = cand_pos_s - history_abs_start; + let cand4 = unsafe { + (history_base_ptr.add(history_start_offset + cand_idx_s) as *const u32) + .read_unaligned() + }; + if cand4 == v4_0 as u32 { + // Short hit: count forward from byte 4 onwards. + let mut s_match_len = 4usize; + let max_fwd = concat_len.saturating_sub(concat_idx0 + 4); + unsafe { + let lhs = + history_base_ptr.add(history_start_offset + cand_idx_s + 4); + let rhs = + history_base_ptr.add(history_start_offset + concat_idx0 + 4); + let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr( + lhs, rhs, max_fwd, + ); + s_match_len += ext; + } + let concat = &self.history[history_start_offset..]; + let short_cand = extend_backwards_shared( + concat, + history_abs_start, + cand_pos_s, + abs_ip0, + s_match_len, + lit_len_ip0, + ); + + // Enforce the hash-search floor BEFORE + // committing this short hit. The 4-byte + // equality gate plus forward-count extension + // can produce `short_cand.match_len < 5` + // (the literal 4-byte hit, no extension). + // `probe_slot_match` rejects below-floor + // long-hash hits at the same gate; the + // short-hash path must do the same so the + // fast loop never emits a sub-floor non-rep + // match. The retry below can still upgrade + // to a long hit (which has its own 8-byte + // floor, comfortably above `DFAST_MIN_MATCH_LEN`). + let short_hit_valid = short_cand.match_len >= DFAST_MIN_MATCH_LEN; + + // Donor `_search_next_long` retry (line 260): + // try long match at ip1 with precomputed idxl1. + // If it produces a strictly longer match, use it. + let mut chosen = short_cand; + let mut retry_upgraded = false; + if idxl1 != DFAST_EMPTY_SLOT { + let cand_pos_l1 = position_base + (idxl1 as usize) - 1; + if cand_pos_l1 >= history_abs_start && cand_pos_l1 < abs_ip1 { + let cand_idx_l1 = cand_pos_l1 - history_abs_start; + let cand_v8_l1 = unsafe { + (history_base_ptr.add(history_start_offset + cand_idx_l1) + as *const u64) + .read_unaligned() + }; + if cand_v8_l1 == v8_1 { + let mut l1_match_len = 8usize; + let max_fwd_l1 = concat_len.saturating_sub(concat_idx1 + 8); + unsafe { + let lhs = history_base_ptr + .add(history_start_offset + cand_idx_l1 + 8); + let rhs = history_base_ptr + .add(history_start_offset + concat_idx1 + 8); + let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr( + lhs, rhs, max_fwd_l1, + ); + l1_match_len += ext; + } + if l1_match_len > short_cand.match_len { + let concat = &self.history[history_start_offset..]; + chosen = extend_backwards_shared( + concat, + history_abs_start, + cand_pos_l1, + abs_ip1, + l1_match_len, + lit_len_ip1, + ); + // Long-hash hits start at 8 + // bytes (`MEM_read64` gate + // above), well above + // `DFAST_MIN_MATCH_LEN = 5`, + // so the retry upgrade is + // always valid even when + // the raw short hit was + // below the floor. + retry_upgraded = true; + } + } + } + } + if short_hit_valid || retry_upgraded { + break 'inner InnerExit::Committed(chosen); + } + // Below-floor short hit with no retry + // upgrade — fall through to the step bump + // and keep scanning. Discarding here is + // correctness-relevant: emitting a 4-byte + // non-rep match would mint an offset on + // wire that costs more than the 4-byte + // payload buys (offsets at level 2/3 dfast + // are 13–17 bits depending on offset class + // and prior offset_hist state — strictly + // worse than emitting the 4 bytes as + // literals). + } } - pos += skip_step; - } else { - skip_step = 1; - next_skip_growth_pos = pos + DFAST_SKIP_STEP_GROWTH_INTERVAL; - pos += 1; + } + + // Step bump on distance (donor `zstd_double_fast.c:224-228`). + if ip1 >= next_step_pos { + step = (step + 1).min(DFAST_MAX_SKIP_STEP); + next_step_pos += DFAST_SKIP_STEP_GROWTH_INTERVAL; + } + + // Advance: ip0 = ip1; ip1 += step; carry hl1 → hl0 / idxl1 → idxl0. + ip0 = ip1; + ip1 += step; + hl0_idx = hl1_idx; + idxl0 = idxl1; + if ip1 + HASH_READ_SIZE > current_len { + // First position the fast loop did NOT pack into the + // hash tables. `seed_remaining_hashable_starts` will + // pick up from `ip0` instead of restarting at the + // outer-entry `pos`. + break 'inner InnerExit::Tail(ip0); + } + }; + + match inner_exit { + InnerExit::Committed(candidate) => { + let start = self.emit_candidate( + current_abs_start, + &mut literals_start, + candidate, + handle_sequence, + ); + pos = start + candidate.match_len; + pos = self.extend_with_repcode_after_match( + current_abs_start, + current_len, + pos, + &mut literals_start, + handle_sequence, + ); + } + InnerExit::Tail(seed) => { + // Inner loop ran out of safe scan room without + // committing. Hand the tail seeder the first + // un-scanned position so it does not redundantly + // re-pack everything the fast loop already wrote. + pos = seed; + break 'outer; } } } @@ -502,6 +1122,160 @@ impl DfastMatchGenerator { self.emit_trailing_literals(literals_start, handle_sequence); } + /// Single-cursor probe at the last hashable position in the + /// current block. Called from `start_matching_fast_loop` only when + /// the outer iteration sees `ip0` still hashable but `ip1` past + /// the end — the upstream reference's single-cursor loop probes + /// every position with `p + HASH_READ_SIZE <= iend`, and skipping + /// `ip0` here would leave a real match unsearched. + /// + /// Mirror the inner loop's long+short probe and the table update + /// at `ip0`, but skip the rep1 peek (reads at `ip1`) and the + /// `_search_next_long` retry (reads at `ip1`) — both depend on a + /// hashable `ip1`. Upstream accepts this exact tradeoff at the + /// `iend` boundary: the rep peek and retry only ever fire when a + /// second hashable position exists. + /// + /// Returns `Some(MatchCandidate)` if a long or short hit at `ip0` + /// meets the `DFAST_MIN_MATCH_LEN` floor, `None` otherwise (caller + /// then breaks the outer loop). On a hit, the hash tables are NOT + /// updated by this helper — the caller routes through + /// `emit_candidate` which inserts via `insert_positions` over the + /// emitted range, exactly like the inner loop's hit path. + fn probe_tail_ip0_only( + &self, + current_abs_start: usize, + current_len: usize, + ip0: usize, + literals_start: usize, + ) -> Option { + debug_assert!(ip0 + HASH_READ_SIZE <= current_len); + const PRIME: u64 = 0xCF1BBCDCB7A56463_u64; + let short_shift = 64 - self.short_hash_bits; + let long_shift = 64 - self.long_hash_bits; + + let abs_ip0 = current_abs_start + ip0; + let lit_len_ip0 = ip0 - literals_start; + let history_abs_start = self.history_abs_start; + let position_base = self.position_base; + let history_start_offset = self.history_start; + let concat_len = self.history.len() - history_start_offset; + let history_base_ptr = self.history.as_ptr(); + + let concat_idx0 = abs_ip0 - history_abs_start; + + // SAFETY: `concat_idx0 + 8 <= concat_len` follows from the + // caller's `ip0 + HASH_READ_SIZE <= current_len` precondition + // (the live history contains the full current block). + let v8_0 = unsafe { + (history_base_ptr.add(history_start_offset + concat_idx0) as *const u64) + .read_unaligned() + }; + let v4_0 = v8_0 & 0xFFFF_FFFF; + let hl0_idx = (v8_0.wrapping_mul(PRIME) >> long_shift) as usize; + let hs0_idx = (v4_0.wrapping_mul(PRIME) >> short_shift) as usize; + + // Read-only on the hash tables here — unlike the inner loop's + // "update-before-check" pattern, the writes at `hl0_idx` / + // `hs0_idx` would be dead in this helper: + // + // * On a hit, the caller routes through `emit_candidate` + // which insert-positions the entire emitted range, then + // either advances `pos` past `current_len - HASH_READ_SIZE` + // and the outer guard breaks (so a future iter never + // re-uses these slots), or `continue 'outer` re-enters + // with `pos = start + match_len ≥ current_len - 3`, which + // fails the outer-entry `pos + HASH_READ_SIZE > current_len` + // guard immediately. Either way, no second probe sees the + // write. + // * On no hit, the caller `break 'outer`s directly. Same + // conclusion. + // * `seed_remaining_hashable_starts` inserts `ip0` itself + // during the post-loop tail seed pass, so even the "fresh + // entry for next block" rationale doesn't justify writing + // here — the seeder does that. + // + // Skipping the writes also removes a small amount of cache + // dirtying on the tail boundary and keeps `probe_tail_ip0_only` + // strictly cheaper than a full inner-loop iter. + let idxl0 = unsafe { *self.long_hash.as_ptr().add(hl0_idx) }; + let idxs0 = unsafe { *self.short_hash.as_ptr().add(hs0_idx) }; + + // Long-hash probe first (upstream priority: an 8-byte hit + // beats a 4-byte hit even before extension). + if idxl0 != DFAST_EMPTY_SLOT { + let cand_pos = position_base + (idxl0 as usize) - 1; + if cand_pos >= history_abs_start && cand_pos < abs_ip0 { + let cand_idx = cand_pos - history_abs_start; + let cand_v8 = unsafe { + (history_base_ptr.add(history_start_offset + cand_idx) as *const u64) + .read_unaligned() + }; + if cand_v8 == v8_0 { + let mut match_len = 8usize; + let max_fwd = concat_len.saturating_sub(concat_idx0 + 8); + unsafe { + let lhs = history_base_ptr.add(history_start_offset + cand_idx + 8); + let rhs = history_base_ptr.add(history_start_offset + concat_idx0 + 8); + let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr( + lhs, rhs, max_fwd, + ); + match_len += ext; + } + let concat = &self.history[history_start_offset..]; + return Some(extend_backwards_shared( + concat, + history_abs_start, + cand_pos, + abs_ip0, + match_len, + lit_len_ip0, + )); + } + } + } + + // Short-hash probe (4-byte gate, forward extension, same + // floor-enforcement as the inner loop's short path — see + // comment there). + if idxs0 != DFAST_EMPTY_SLOT { + let cand_pos_s = position_base + (idxs0 as usize) - 1; + if cand_pos_s >= history_abs_start && cand_pos_s < abs_ip0 { + let cand_idx_s = cand_pos_s - history_abs_start; + let cand4 = unsafe { + (history_base_ptr.add(history_start_offset + cand_idx_s) as *const u32) + .read_unaligned() + }; + if cand4 == v4_0 as u32 { + let mut s_match_len = 4usize; + let max_fwd = concat_len.saturating_sub(concat_idx0 + 4); + unsafe { + let lhs = history_base_ptr.add(history_start_offset + cand_idx_s + 4); + let rhs = history_base_ptr.add(history_start_offset + concat_idx0 + 4); + let ext = crate::encoding::fastpath::dispatch_common_prefix_len_ptr( + lhs, rhs, max_fwd, + ); + s_match_len += ext; + } + let concat = &self.history[history_start_offset..]; + let short_cand = extend_backwards_shared( + concat, + history_abs_start, + cand_pos_s, + abs_ip0, + s_match_len, + lit_len_ip0, + ); + if short_cand.match_len >= DFAST_MIN_MATCH_LEN { + return Some(short_cand); + } + } + } + } + + None + } + /// Donor `zstd_double_fast.c` post-match rep-0 extension. After the /// primary match has been emitted and `pos` advanced past it, donor /// opportunistically chains additional `rep_2`-coded matches at the @@ -517,7 +1291,7 @@ impl DfastMatchGenerator { /// * skips the hash-table probe entirely on every extra match. /// /// Critically uses donor's `MINMATCH = 4` here rather than the - /// stricter `DFAST_MIN_MATCH_LEN = 6` enforced on the main search + /// `DFAST_MIN_MATCH_LEN = 5` enforced on the main search /// loop. The donor accepts any 4-byte rep extension; we mirror that /// because the rep emission carries no offset cost — even a 4-byte /// rep is a net win over re-running the full hash search. Returns @@ -531,10 +1305,9 @@ impl DfastMatchGenerator { literals_start: &mut usize, handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>), ) -> usize { - const DONOR_REP_MIN_MATCH_LEN: usize = 4; loop { - // Need at least DONOR_REP_MIN_MATCH_LEN bytes of room past `pos`. - if pos + DONOR_REP_MIN_MATCH_LEN > current_len { + // Need at least DFAST_REP_MIN_MATCH_LEN bytes of room past `pos`. + if pos + DFAST_REP_MIN_MATCH_LEN > current_len { break; } // After a primary emit `literals_start == pos`, so `lit_len` @@ -561,7 +1334,7 @@ impl DfastMatchGenerator { None => break, }; let concat = &self.history[self.history_start..]; - if cur_idx + DONOR_REP_MIN_MATCH_LEN > concat.len() { + if cur_idx + DFAST_REP_MIN_MATCH_LEN > concat.len() { break; } // Cheap 4-byte gate before the SIMD `common_prefix_len`. @@ -569,12 +1342,61 @@ impl DfastMatchGenerator { break; } let match_len = common_prefix_len(&concat[cand_idx..], &concat[cur_idx..]); - if match_len < DONOR_REP_MIN_MATCH_LEN { + if match_len < DFAST_REP_MIN_MATCH_LEN { break; } - // Insert the rep range into hash tables so future positions - // hashing into this area find these candidates. - self.insert_positions(abs_pos, abs_pos + match_len); + // Sparse complementary insertion (upstream parity, + // `zstd_double_fast.c:300-304`): upstream inserts ONLY at + // `curr+2`, `ip-2`, `ip-1` after a match — three specific + // positions, not the whole match range. The previous + // `insert_positions(abs_pos, abs_pos + match_len)` made + // sense only under the 4-slot bucket; with single-slot + // donor parity it would just overwrite every bucket along + // the match span and discard whichever positions the + // producer was about to re-probe. + // + // At the floor `match_len == DFAST_REP_MIN_MATCH_LEN (= 4)` + // the three targets collapse to two distinct positions. + // With `post_match_end = abs_pos + 4` the three offsets + // resolve to: + // `curr + 2` = abs_pos + 2 + // `ip - 2` = abs_pos + 4 - 2 = abs_pos + 2 ← same as curr+2 + // `ip - 1` = abs_pos + 4 - 1 = abs_pos + 3 ← distinct + // So `curr+2` and `ip-2` write the same slot twice; + // single-slot overwrite is idempotent, so the duplicate + // write is correctness-neutral. It's one wasted store on + // the shortest rep extension and not worth a branch to + // dedup. For `match_len >= 5` all three offsets are + // distinct. + // + // Why `abs_pos` itself is NOT in the insert set, despite + // not being written by the fast loop's pre-check insert + // (the previous range form `insert_positions(abs_pos, + // post_match_end)` did include it): upstream + // `ZSTD_compressBlock_doubleFast_*` likewise does not + // insert at the rep-extension start. After a rep match, + // upstream just advances `ip` and reruns the outer fast + // loop, which writes the new `curr` (the post-rep cursor) + // — never the rep's own start. The three offsets above + // are upstream's primary-match-emit insertion pattern, + // mirrored here to preserve hit rate on the chains that + // follow a rep extension. The hash-state delta vs the + // prior range-insert behavior is intentional and tracked + // by the sequence-stream comparator harness (the + // per-sequence diff vs FFI mentioned in the PR body's + // deferred section); the audit signed off on the current + // sparse set as the closest faithful mirror of upstream. + let post_match_end = abs_pos + match_len; + let insert_targets = [ + abs_pos + 2, // curr + 2 + post_match_end.saturating_sub(2), // ip - 2 (post-match cursor) + post_match_end.saturating_sub(1), // ip - 1 + ]; + for &target in &insert_targets { + if target > abs_pos && target < post_match_end { + self.insert_position(target); + } + } // Emit zero-literal rep sequence. handle_sequence(Sequence::Triple { literals: &[], @@ -613,7 +1435,14 @@ impl DfastMatchGenerator { current_abs_start + *literals_start, candidate.start + candidate.match_len, ); - let current = self.window.back().unwrap().as_slice(); + // Inline the trailing-block slice rather than calling + // `get_last_space()` so this matches the gate pattern used by + // `skip_matching` / `start_matching` (read `window_blocks.back()` + // with `unwrap_or(0)`). `emit_candidate` runs only after a + // successful match was found in the active block, so + // `last_len > 0` is a structural precondition. + let last_len = self.window_blocks.back().copied().unwrap_or(0); + let current = &self.history[self.history.len() - last_len..]; let start = candidate.start - current_abs_start; let literals = ¤t[*literals_start..start]; handle_sequence(Sequence::Triple { @@ -635,8 +1464,13 @@ impl DfastMatchGenerator { literals_start: usize, handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>), ) { - if literals_start < self.window.back().unwrap().len() { - let current = self.window.back().unwrap().as_slice(); + let last_len = self.window_blocks.back().copied().unwrap_or(0); + if literals_start < last_len { + // Inline rather than calling `get_last_space()` so the gate + // pattern matches `skip_matching` / `start_matching` / + // `emit_candidate`. `last_len > 0` is already established by + // the `literals_start < last_len` check above. + let current = &self.history[self.history.len() - last_len..]; handle_sequence(Sequence::Literals { literals: ¤t[literals_start..], }); @@ -644,13 +1478,16 @@ impl DfastMatchGenerator { } pub(crate) fn ensure_hash_tables(&mut self) { - let table_len = 1usize << self.hash_bits; - if self.short_hash.len() != table_len { - // This is intentionally lazy so Fastest/Uncompressed never pay the - // ~dfast-level memory cost. The current size tracks the issue's - // zstd level-3 style parameters rather than a generic low-memory preset. - self.short_hash = alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; table_len]; - self.long_hash = alloc::vec![[DFAST_EMPTY_SLOT; DFAST_SEARCH_DEPTH]; table_len]; + // Independent sizing per donor `clevels.h`: long-hash = + // `hashLog`, short-hash = `chainLog`. Lazy allocation so + // Fastest/Uncompressed never pay the dfast-level memory cost. + let long_len = 1usize << self.long_hash_bits; + let short_len = 1usize << self.short_hash_bits; + if self.long_hash.len() != long_len { + self.long_hash = alloc::vec![DFAST_EMPTY_SLOT; long_len]; + } + if self.short_hash.len() != short_len { + self.short_hash = alloc::vec![DFAST_EMPTY_SLOT; short_len]; } } @@ -674,6 +1511,7 @@ impl DfastMatchGenerator { self.history_abs_start + self.live_history().len() } + #[inline(always)] pub(crate) fn best_match(&self, abs_pos: usize, lit_len: usize) -> Option { let rep = self.repcode_candidate(abs_pos, lit_len); let hash = self.hash_candidate(abs_pos, lit_len); @@ -700,6 +1538,7 @@ impl DfastMatchGenerator { ) } + #[inline(always)] pub(crate) fn repcode_candidate( &self, abs_pos: usize, @@ -715,6 +1554,7 @@ impl DfastMatchGenerator { ) } + #[inline(always)] pub(crate) fn hash_candidate(&self, abs_pos: usize, lit_len: usize) -> Option { // Hoist all the per-loop invariants out of the combinator chains. // `short_candidates`/`long_candidates` each re-fetch `live_history` @@ -732,56 +1572,82 @@ impl DfastMatchGenerator { let position_base = self.position_base; let mut best = None; - // Long-hash probes first (8-byte hash → longer matches more likely). - if current_idx + 8 <= concat.len() { - let long_hash = self.hash8(&concat[current_idx..]); - // SAFETY: `hash_index` masks to `hash_bits` and `long_hash.len() - // == 1 << hash_bits` (`ensure_hash_tables`). + // Long-hash probe first (8-byte hash → longer matches more + // likely). Single-slot per bucket — donor parity, no chain + // walking. The retry policy below mirrors donor + // `_search_next_long`: if the long-hash misses but the + // short-hash hits, peek the long-hash at `abs_pos + 1` and + // pick the longer of the two matches. + let long_hit = if current_idx + 8 <= concat.len() { + let long_hash = self.long_hash_index(&concat[current_idx..]); + // SAFETY: `long_hash_index` masks to `long_hash_bits` and + // `long_hash.len() == 1 << long_hash_bits` (`ensure_hash_tables`). debug_assert!(long_hash < self.long_hash.len()); - let bucket = unsafe { self.long_hash.get_unchecked(long_hash) }; - for &slot in bucket { - // Inline unpack: slot 0 = empty sentinel, otherwise - // `slot - 1 + position_base` is the absolute position. - if slot == DFAST_EMPTY_SLOT { - continue; - } - let candidate_pos = position_base + (slot as usize) - 1; - if candidate_pos < history_abs_start || candidate_pos >= abs_pos { - continue; - } - let candidate_idx = candidate_pos - history_abs_start; - let match_len = common_prefix_len(&concat[candidate_idx..], &concat[current_idx..]); - if match_len >= DFAST_MIN_MATCH_LEN { - let candidate = - self.extend_backwards(candidate_pos, abs_pos, match_len, lit_len); - best = best_len_offset_candidate(best, Some(candidate)); - if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) { - return best; - } - } + let slot = unsafe { *self.long_hash.get_unchecked(long_hash) }; + self.probe_slot_match( + slot, + position_base, + history_abs_start, + abs_pos, + current_idx, + concat, + lit_len, + 8, // long-hash equality gate + ) + } else { + None + }; + if let Some(cand) = long_hit { + best = best_len_offset_candidate(best, Some(cand)); + if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) { + return best; } } if current_idx + 4 <= concat.len() { - let short_hash = self.hash4(&concat[current_idx..]); + let short_hash = self.short_hash_index(&concat[current_idx..]); debug_assert!(short_hash < self.short_hash.len()); - let bucket = unsafe { self.short_hash.get_unchecked(short_hash) }; - for &slot in bucket { - if slot == DFAST_EMPTY_SLOT { - continue; - } - let candidate_pos = position_base + (slot as usize) - 1; - if candidate_pos < history_abs_start || candidate_pos >= abs_pos { - continue; + let slot = unsafe { *self.short_hash.get_unchecked(short_hash) }; + if let Some(short_cand) = self.probe_slot_match( + slot, + position_base, + history_abs_start, + abs_pos, + current_idx, + concat, + lit_len, + 4, // short-hash equality gate + ) { + best = best_len_offset_candidate(best, Some(short_cand)); + if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) { + return best; } - let candidate_idx = candidate_pos - history_abs_start; - let match_len = common_prefix_len(&concat[candidate_idx..], &concat[current_idx..]); - if match_len >= DFAST_MIN_MATCH_LEN { - let candidate = - self.extend_backwards(candidate_pos, abs_pos, match_len, lit_len); - best = best_len_offset_candidate(best, Some(candidate)); - if best.is_some_and(|b| b.match_len >= DFAST_TARGET_LEN) { - return best; + // Donor `_search_next_long` retry: short hit landed but + // a long hit at `abs_pos + 1` could be even longer. The + // donor inner loop precomputes `hashLong[hl1]` for + // exactly this case (line 213 in `zstd_double_fast.c`); + // we lift it inline here so the single-slot table + // retains the compression-quality donor gets from its + // overlapping probe pattern. + let next_idx = current_idx + 1; + if best.is_none_or(|b| b.match_len < DFAST_TARGET_LEN) + && next_idx + 8 <= concat.len() + { + let next_long_hash = self.long_hash_index(&concat[next_idx..]); + debug_assert!(next_long_hash < self.long_hash.len()); + let next_slot = unsafe { *self.long_hash.get_unchecked(next_long_hash) }; + if let Some(retry) = self.probe_slot_match( + next_slot, + position_base, + history_abs_start, + abs_pos + 1, + next_idx, + concat, + lit_len.saturating_add(1), + 8, // long-hash equality gate, `_search_next_long` retry + ) && retry.match_len > short_cand.match_len + { + best = best_len_offset_candidate(best, Some(retry)); } } } @@ -789,6 +1655,60 @@ impl DfastMatchGenerator { best } + /// Resolve a single packed-slot value against the live history and + /// return a backward-extended `MatchCandidate` if the bucket holds + /// a valid in-range position whose forward extension reaches at + /// least `DFAST_MIN_MATCH_LEN` bytes. Shared between the long-hash + /// primary probe, the short-hash primary probe, and the + /// `_search_next_long` retry — keeps the bounds-checking logic in + /// one place so the three call sites can't drift. + #[inline(always)] + #[allow(clippy::too_many_arguments)] + fn probe_slot_match( + &self, + slot: u32, + position_base: usize, + history_abs_start: usize, + abs_pos: usize, + current_idx: usize, + concat: &[u8], + lit_len: usize, + gate_len: usize, + ) -> Option { + if slot == DFAST_EMPTY_SLOT { + return None; + } + let candidate_pos = position_base + (slot as usize) - 1; + if candidate_pos < history_abs_start || candidate_pos >= abs_pos { + return None; + } + let candidate_idx = candidate_pos - history_abs_start; + // Probe-width equality gate before the SIMD walk. The long-hash + // callers pass `gate_len = 8` (matches an `MEM_read64`-style + // probe in the upstream reference); the short-hash caller + // passes `gate_len = 4` (`MEM_read32`-style). A 1-byte + // precheck would accept hash collisions whose actual common + // prefix runs from 1 byte up to less than the probe width, + // paying the full `common_prefix_len` walk for them on every + // iteration. The wider gate rejects those collisions early + // and matches what the upstream `_search_next_long` path does + // after a hit. + if candidate_idx + gate_len > concat.len() || current_idx + gate_len > concat.len() { + return None; + } + if concat[candidate_idx..candidate_idx + gate_len] + != concat[current_idx..current_idx + gate_len] + { + return None; + } + let match_len = common_prefix_len(&concat[candidate_idx..], &concat[current_idx..]); + if match_len < DFAST_MIN_MATCH_LEN { + return None; + } + Some(self.extend_backwards(candidate_pos, abs_pos, match_len, lit_len)) + } + + #[inline(always)] fn extend_backwards( &self, candidate_pos: usize, @@ -809,8 +1729,91 @@ impl DfastMatchGenerator { pub(crate) fn insert_positions(&mut self, start: usize, end: usize) { let start = start.max(self.history_abs_start); let end = end.min(self.history_abs_end()); - for pos in start..end { - self.insert_position(pos); + if start >= end { + return; + } + // Hoist the rebase trigger out of the inner loop: a single + // `ensure_room_for(end - 1)` covers every `pack_slot` in the + // range. The per-position `ensure_room_for` call inside + // `insert_position` would re-check on every byte and the + // compiler cannot prove the call is idempotent through + // `&mut self`. + self.ensure_room_for(end - 1); + + // Snapshot every per-call invariant. `&mut self` blocks the + // optimiser from hoisting these loads across the inner loop — + // the bodies of `short_hash` / `long_hash` writes mutate + // through `self`, so each iteration would otherwise reload + // `history.len()`, `history_start`, `position_base`, + // `*_hash_bits`. With ~1 input byte per + // call on the dfast hot path that re-load shape was the + // dominant cost in the per-position cluster. + let history_abs_start = self.history_abs_start; + let position_base = self.position_base; + let history_start = self.history_start; + let concat_len = self.history.len() - history_start; + let history_base_ptr = self.history.as_ptr(); + let short_hash_bits = self.short_hash_bits; + let long_hash_bits = self.long_hash_bits; + let short_hash_ptr = self.short_hash.as_mut_ptr(); + let long_hash_ptr = self.long_hash.as_mut_ptr(); + let short_shift = 64 - short_hash_bits; + let long_shift = 64 - long_hash_bits; + + // Two contiguous regions in the input range: + // * `[start .. long_safe_end)` — every position has at least 8 + // bytes of lookahead, so both short and long hashes get + // inserted from a single 8-byte unaligned load. + // * `[long_safe_end .. short_safe_end)` — only 4..7 bytes + // remain, so only the short hash gets inserted (4-byte + // load). + // Past `short_safe_end` neither hash has enough lookahead and + // donor parity is "no insert" — skip entirely. + let abs_concat_end = history_abs_start + concat_len; + let long_safe_end = abs_concat_end.saturating_sub(7).min(end); + let short_safe_end = abs_concat_end.saturating_sub(3).min(end); + + // SAFETY: `history_base_ptr.add(history_start + idx)` is + // in-bounds for `idx + 8 <= concat_len`, which the two + // `*_safe_end` cutoffs enforce. `short_hash_ptr.add(k)` / + // `long_hash_ptr.add(k)` are in-bounds because + // `ensure_hash_tables` sizes the two tables to `1 << + // *_hash_bits` and `k = mixed >> (64 - bits)` has at most + // `bits` bits set. `position_base` and `history_abs_start` + // are constant across the loop after the single `ensure_room_for` + // call above. `packed` fits in `u32` by that same gate. + let mut pos = start; + while pos < long_safe_end { + unsafe { + let idx = pos - history_abs_start; + let packed = ((pos - position_base) as u32) + 1; + let load_ptr = history_base_ptr.add(history_start + idx); + let v8 = (load_ptr as *const u64).read_unaligned(); + let v4 = v8 & 0xFFFF_FFFF; + // Donor parity (`zstd_compress_internal.h:923-924`): + // scalar `* prime8bytes` then shift to high bits. Drops + // the CRC32d-based kernel dispatch (3-4 instructions) for + // a single mul on the per-byte insert path. + let mixed_short = v4.wrapping_mul(0xCF1BBCDCB7A56463_u64); + let mixed_long = v8.wrapping_mul(0xCF1BBCDCB7A56463_u64); + let short_idx = (mixed_short >> short_shift) as usize; + let long_idx = (mixed_long >> long_shift) as usize; + *short_hash_ptr.add(short_idx) = packed; + *long_hash_ptr.add(long_idx) = packed; + } + pos += 1; + } + while pos < short_safe_end { + unsafe { + let idx = pos - history_abs_start; + let packed = ((pos - position_base) as u32) + 1; + let load_ptr = history_base_ptr.add(history_start + idx); + let v4 = (load_ptr as *const u32).read_unaligned() as u64; + let mixed_short = v4.wrapping_mul(0xCF1BBCDCB7A56463_u64); + let short_idx = (mixed_short >> short_shift) as usize; + *short_hash_ptr.add(short_idx) = packed; + } + pos += 1; } } @@ -860,42 +1863,44 @@ impl DfastMatchGenerator { // rebase is needed. self.ensure_room_for(pos); let packed = self.pack_slot(pos); - // SAFETY: `hash_index` masks the mixed hash to `hash_bits` bits and - // both tables are sized to `1 << hash_bits` in `ensure_hash_tables`, - // so every index produced here is provably below the table length. - // Eliding the bounds check on this per-byte hot path saves ~4 - // instructions and one branch per call. + // SAFETY: the `*_hash_index` helpers mask the mixed hash to + // `long_hash_bits` / `short_hash_bits`, and `ensure_hash_tables` + // sizes the two tables to `1 << long_hash_bits` / + // `1 << short_hash_bits` respectively, so every produced index + // is provably below the table length. Eliding the bounds check + // on this per-byte hot path saves ~4 instructions per call. + // + // Single-slot overwrite (upstream parity): the previous 4-slot + // bucket shift (`copy_within(..)`) is gone — upstream + // `ZSTD_compressBlock_doubleFast_*` writes a single `U32` per + // hash position and relies on the dense `_search_next_long` + // retry in `hash_candidate` (via `best_match`) to preserve + // compression ratio. if idx + 4 <= concat_len { let concat = &self.history[self.history_start..]; - let short = self.hash4(&concat[idx..]); + let short = self.short_hash_index(&concat[idx..]); debug_assert!(short < self.short_hash.len()); - let bucket = unsafe { self.short_hash.get_unchecked_mut(short) }; - if bucket[0] != packed { - bucket.copy_within(0..DFAST_SEARCH_DEPTH - 1, 1); - bucket[0] = packed; - } + unsafe { *self.short_hash.get_unchecked_mut(short) = packed }; } if idx + 8 <= concat_len { let concat = &self.history[self.history_start..]; - let long = self.hash8(&concat[idx..]); + let long = self.long_hash_index(&concat[idx..]); debug_assert!(long < self.long_hash.len()); - let bucket = unsafe { self.long_hash.get_unchecked_mut(long) }; - if bucket[0] != packed { - bucket.copy_within(0..DFAST_SEARCH_DEPTH - 1, 1); - bucket[0] = packed; - } + unsafe { *self.long_hash.get_unchecked_mut(long) = packed }; } } - pub(crate) fn hash4(&self, data: &[u8]) -> usize { + #[inline(always)] + pub(crate) fn short_hash_index(&self, data: &[u8]) -> usize { let value = u32::from_le_bytes(data[..4].try_into().unwrap()) as u64; - self.hash_index(value) + self.hash_index_with_bits(value, self.short_hash_bits) } - pub(crate) fn hash8(&self, data: &[u8]) -> usize { + #[inline(always)] + pub(crate) fn long_hash_index(&self, data: &[u8]) -> usize { let value = u64::from_le_bytes(data[..8].try_into().unwrap()); - self.hash_index(value) + self.hash_index_with_bits(value, self.long_hash_bits) } fn block_looks_incompressible(&self, start: usize, end: usize) -> bool { @@ -912,23 +1917,16 @@ impl DfastMatchGenerator { block_looks_incompressible(block) } - fn block_looks_incompressible_strict(&self, start: usize, end: usize) -> bool { - let live = self.live_history(); - if start >= end || start < self.history_abs_start { - return false; - } - let start_idx = start - self.history_abs_start; - let end_idx = end - self.history_abs_start; - if end_idx > live.len() { - return false; - } - let block = &live[start_idx..end_idx]; - block_looks_incompressible_strict(block) - } - - fn hash_index(&self, value: u64) -> usize { - let mixed = crate::encoding::fastpath::hash_mix_u64_with_kernel(self.hash_kernel, value); - (mixed >> (64 - self.hash_bits)) as usize + #[inline(always)] + fn hash_index_with_bits(&self, value: u64, bits: usize) -> usize { + // Donor parity (`zstd_compress_internal.h:923-924`, `ZSTD_hash8`): + // a single 64-bit multiply by `prime8bytes` followed by a high-bits + // shift. Drops the CRC32d + rotate + mul kernel dispatch the rest + // of the crate uses — for dfast the donor's scalar hash is + // distribution-equivalent and one instruction shorter on the hot + // path. + let mixed = value.wrapping_mul(0xCF1BBCDCB7A56463_u64); + (mixed >> (64 - bits)) as usize } } @@ -1125,8 +2123,8 @@ mod extend_with_repcode_tests { } /// The helper accepts 4-byte rep extensions (donor `MINMATCH = 4`), - /// not the main-loop `DFAST_MIN_MATCH_LEN = 6` floor. A regression - /// back to 6 would still pass the constant-run / cross-block tests + /// not the main-loop `DFAST_MIN_MATCH_LEN = 5` floor. A regression + /// back above 4 would still pass the constant-run / cross-block tests /// above (their rep matches extend much further), so this fixture /// is built so the rep matches EXACTLY 4 bytes before terminating: /// the byte at `pos + 4` differs from the byte at `pos + 4 - rep`. @@ -1193,7 +2191,7 @@ mod extend_with_repcode_tests { assert_eq!( *match_len, 4, "rep emit must be exactly 4 bytes (donor MINMATCH floor). \ - A regression back to DFAST_MIN_MATCH_LEN = 6 would skip \ + A regression back to DFAST_MIN_MATCH_LEN > 4 would skip \ this emission entirely and the test would fail with 0 seqs." ); } diff --git a/zstd/src/encoding/frame_compressor.rs b/zstd/src/encoding/frame_compressor.rs index 37b565fd..befc304a 100644 --- a/zstd/src/encoding/frame_compressor.rs +++ b/zstd/src/encoding/frame_compressor.rs @@ -76,11 +76,20 @@ impl PreviousFseTable { } pub(crate) struct FseTables { - pub(crate) ll_default: FSETable, + /// The three predefined LL/ML/OF tables are functions of + /// compile-time-constant distributions; the cached helpers in + /// `fse_encoder` build them once per process and hand back + /// `&'static FSETable`. Storing the reference here (rather than + /// an owned `FSETable`) collapses `FrameCompressor::new` to a + /// 3-pointer copy on the cache-hot path — clone takes ~4 µs on + /// x86_64, the bare pointer copy is sub-nanosecond — and removes + /// the same per-clone cost from `clone_fse_tables` on the + /// block-split estimator path. + pub(crate) ll_default: &'static FSETable, pub(crate) ll_previous: Option, - pub(crate) ml_default: FSETable, + pub(crate) ml_default: &'static FSETable, pub(crate) ml_previous: Option, - pub(crate) of_default: FSETable, + pub(crate) of_default: &'static FSETable, pub(crate) of_previous: Option, } @@ -438,7 +447,8 @@ impl FrameCompressor { /// To avoid endlessly encoding from a potentially endless source (like a network socket) you can use the /// [Read::take] function pub fn compress(&mut self) { - let source_size_hint_known = self.source_size_hint.is_some(); + let initial_size_hint = self.source_size_hint; + let source_size_hint_known = initial_size_hint.is_some(); let use_dictionary_state = !matches!(self.compression_level, CompressionLevel::Uncompressed) && self.state.matcher.supports_dictionary_priming() @@ -518,9 +528,36 @@ impl FrameCompressor { window_size != 0, "matcher reported window_size == 0, which is invalid" ); - // Accumulate all compressed blocks; the frame header is written after - // all input has been read so that Frame_Content_Size is known. - let mut all_blocks: Vec = Vec::with_capacity(1024 * 130); + // Accumulate all compressed blocks; the frame header is written + // after all input has been read so that Frame_Content_Size is + // known. The default seed is one donor block; smaller seeds for + // small payloads avoid pinning a full block worth of bytes when + // the compressed output fits in a few hundred bytes. For larger + // inputs the default seed amortises the first few `Vec::extend` + // doublings cheaply and the `peak - default_seed` residue is + // dominated by internal `compress_block_encoded` buffers anyway, + // so changing it produces no measurable savings. + // + // Seed-size tiers (mirrors donor `ZSTD_CStreamOutSize` naming): + // + // * `ALL_BLOCKS_TINY_CAP` — payload ≤ this size, seed equals + // payload bound; ≥ everything compressed output could need + // for a tiny input. + // * `ALL_BLOCKS_SMALL_CAP` — small-input seed picked to absorb + // one or two doublings without over-allocating. + // * `ALL_BLOCKS_DEFAULT_CAP` — one donor block; the value the + // rest of the encoder is sized around. + const ALL_BLOCKS_TINY_THRESHOLD: u64 = 4 * 1024; + const ALL_BLOCKS_SMALL_THRESHOLD: u64 = 64 * 1024; + const ALL_BLOCKS_TINY_CAP: usize = 4 * 1024; + const ALL_BLOCKS_SMALL_CAP: usize = 16 * 1024; + const ALL_BLOCKS_DEFAULT_CAP: usize = 130 * 1024; + let initial_all_blocks_cap = match initial_size_hint { + Some(h) if h <= ALL_BLOCKS_TINY_THRESHOLD => ALL_BLOCKS_TINY_CAP, + Some(h) if h <= ALL_BLOCKS_SMALL_THRESHOLD => ALL_BLOCKS_SMALL_CAP, + _ => ALL_BLOCKS_DEFAULT_CAP, + }; + let mut all_blocks: Vec = Vec::with_capacity(initial_all_blocks_cap); let mut total_uncompressed: u64 = 0; let mut pending_input: Vec = Vec::new(); let mut reached_eof = false; diff --git a/zstd/src/encoding/match_generator.rs b/zstd/src/encoding/match_generator.rs index 0ff88a62..050adcba 100644 --- a/zstd/src/encoding/match_generator.rs +++ b/zstd/src/encoding/match_generator.rs @@ -50,33 +50,45 @@ use std::arch::is_aarch64_feature_detected; #[cfg(all(test, feature = "std", target_arch = "x86_64"))] use std::arch::is_x86_feature_detected; -pub(crate) const DFAST_MIN_MATCH_LEN: usize = 6; +pub(crate) const DFAST_MIN_MATCH_LEN: usize = 5; pub(crate) const DFAST_SHORT_HASH_LOOKAHEAD: usize = 4; pub(crate) const ROW_MIN_MATCH_LEN: usize = 6; pub(crate) const DFAST_TARGET_LEN: usize = 48; -// Donor `clevels.h:31` at level 3 large-input bucket sets `hashLog = 17`. -// Our table layout stores `[u32; DFAST_SEARCH_DEPTH]` (4 u32 slots) per -// bucket, so a `1 << 17`-bucket table is `128 K × 16 B = 2 MiB per -// table`, `4 MiB` for the (short, long) pair. We previously held a -// 20-bit ceiling (1 M buckets × 32 B per `[usize; 4]` = 32 MiB per -// table, 64 MiB for the pair) which inflated peak memory ~85× over -// donor at default level — the bench matrix added in PR #143 surfaced -// 64 MB Rust vs 768 KB donor for the two-table footprint on -// `decodecorpus-z000033`. Donor stores a single `U32` per slot plus a -// `chainTable` for older positions, so their pair is still smaller -// than ours per-level; aligning the bucket count via this constant -// already eats the dominant factor, the storage-width change ate the -// rest. `dfast_hash_bits_for_window` still clamps the runtime value -// to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the upper -// bound rather than a fixed default. +// Donor `clevels.h:31` at level 3 large-input bucket sets +// `hashLog = 17` (the long-hash table) and `chainLog = 16` (the +// short-hash table — donor names this `chainTable` even though for +// dfast it's used as a plain single-slot hash). Each table holds one +// `U32` per slot; the donor overwrites on collision and recovers +// compression quality via the inline `_search_next_long` retry +// (after a short-hash hit, probes `hashLong[hl1]` at `ip + 1` and +// keeps the longer match). +// +// We mirror that storage layout: single `u32` per bucket (no +// `[u32; N]` array), `long_hash` sized `1 << DFAST_HASH_BITS` and +// `short_hash` one bit smaller via `DFAST_SHORT_HASH_BITS_DELTA`. +// Two-table footprint at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`, +// exact upstream parity. The `_search_next_long` retry lives in +// `DfastMatchGenerator::hash_candidate` (called via +// `best_match`). Earlier revisions kept a +// 4-slot bucket per hash position; that paid 4× the donor memory +// without measurable ratio gain once the retry was in place. +// +// `dfast_hash_bits_for_window` still clamps the runtime long-hash +// value to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the +// upper bound rather than a fixed default. pub(crate) const DFAST_HASH_BITS: usize = 17; -pub(crate) const DFAST_SEARCH_DEPTH: usize = 4; -/// Sentinel value for an empty slot in the `[u32; DFAST_SEARCH_DEPTH]` -/// hash buckets. Real positions are stored as `(abs_pos - -/// position_base + 1) as u32`, so `0` is reserved as the "empty" -/// marker and a true relative offset of `0` never appears in the -/// table. Mirrors the LDM table's `LdmEntry.offset == 0` convention -/// (see `encoding/ldm/table.rs`) so both rebasing structures share +/// Difference between `long_hash_bits` and `short_hash_bits` — +/// donor `hashLog - chainLog` is 1 at every dfast level (`clevels.h` +/// level 2: 16-15=1; level 3: 17-16=1). The short hash is one bit +/// smaller than the long hash so the per-bucket footprint matches +/// donor sizing exactly. +pub(crate) const DFAST_SHORT_HASH_BITS_DELTA: usize = 1; +/// Sentinel value for an empty slot in the dfast hash tables. Real +/// positions are stored as `(abs_pos - position_base + 1) as u32`, so +/// `0` is reserved as the "empty" marker and a true relative offset +/// of `0` never appears in the table. Mirrors the LDM table's +/// `LdmEntry.offset == 0` convention (see `encoding/ldm/table.rs`) +/// so both rebasing structures share /// one sentinel scheme. pub(crate) const DFAST_EMPTY_SLOT: u32 = 0; @@ -666,15 +678,18 @@ impl MatchGeneratorDriver { }); } super::strategy::BackendTag::Dfast => { - let mut retired = Vec::new(); - self.dfast_matcher_mut().trim_to_window(|data| { - evicted_bytes += data.len(); - retired.push(data); - }); - for mut data in retired { - data.resize(data.capacity(), 0); - self.vec_pool.push(data); - } + // Dfast doesn't retain input Vecs — `history` is the + // only byte store, so there is no per-block buffer + // to push back through a callback. Eviction byte + // count is derived from the `window_size` delta + // before/after; the Dfast variant of + // `trim_to_window` takes no closure, sidestepping + // an unused-`impl FnMut` monomorphization that + // would otherwise contractually never fire. + let dfast = self.dfast_matcher_mut(); + let pre = dfast.window_size; + dfast.trim_to_window(); + evicted_bytes += pre - dfast.window_size; } super::strategy::BackendTag::Row => { let mut retired = Vec::new(); @@ -784,11 +799,7 @@ impl Matcher for MatchGeneratorDriver { // before constructing the replacement variant). m.short_hash = Vec::new(); m.long_hash = Vec::new(); - let vec_pool = &mut self.vec_pool; - m.reset(|mut data| { - data.resize(data.capacity(), 0); - vec_pool.push(data); - }); + m.reset(); } MatcherStorage::Row(m) => { m.row_heads = Vec::new(); @@ -873,11 +884,10 @@ impl Matcher for MatchGeneratorDriver { } else { DFAST_HASH_BITS }); - let vec_pool = &mut self.vec_pool; - dfast.reset(|mut data| { - data.resize(data.capacity(), 0); - vec_pool.push(data); - }); + // Dfast holds no per-block input Vecs (history owns the + // bytes and `add_data` returns each Vec eagerly), so + // `reset` takes no `reuse_space` callback. + dfast.reset(); } MatcherStorage::Row(row) => { row.max_window_size = max_window_size; @@ -3826,6 +3836,67 @@ fn dfast_matches_roundtrip_multi_block_pattern() { assert_eq!(&history[prefix_len..], second_block.as_slice()); } +/// Regression for the `DFAST_MIN_MATCH_LEN: 6 -> 5` drop. The fixture +/// is built so the longest available match is EXACTLY 5 bytes — a +/// matcher that still effectively requires a 6-byte floor would emit +/// only literals here and the assertion would catch the silent +/// 5-byte miss. +/// +/// Fixture layout (34 B): +/// bytes 0..5 `"ABCDE"` — match source +/// bytes 5..28 `'!'` × 23 — filler that does NOT start with 'A' +/// bytes 28..33 `"ABCDE"` — match site (repeats the prefix) +/// byte 33 `'F'` — terminator: differs from byte 5 (`'!'`), +/// so the forward extension at the match +/// site stops at exactly length 5. +/// +/// A 5-byte match at offset 28 must be emitted; a 6-byte+ match at the +/// same offset must NOT. +#[test] +fn dfast_accepts_exact_five_byte_match() { + // Layout the input so that: + // bytes 0..5 = "ABCDE" (the match source) + // bytes 5..28 = 23 filler bytes that do NOT start with 'A' + // bytes 28..33 = "ABCDE" (the 5-byte match site) + // byte 33 = 'F' (differs from byte 5 = '!') + // The longest available copy at position 28 is exactly 5 bytes: + // the byte at position 33 ('F') differs from the byte at position 5 + // ('!'), so the forward extension stops at length 5. + let mut data = Vec::new(); + data.extend_from_slice(b"ABCDE"); // 0..5 + data.extend_from_slice(b"!!!!!!!!!!!!!!!!!!!!!!!"); // 5..28 (23 bytes) + data.extend_from_slice(b"ABCDE"); // 28..33 + data.push(b'F'); // 33: forces forward extension to stop at length 5 + assert_eq!(data.len(), 34); + + let mut matcher = DfastMatchGenerator::new(1 << 22); + matcher.add_data(data.clone(), |_| {}); + + let mut saw_five_byte_match = false; + let mut saw_longer_match = false; + matcher.start_matching(|seq| { + if let Sequence::Triple { + offset, match_len, .. + } = seq + { + if offset == 28 && match_len == 5 { + saw_five_byte_match = true; + } else if offset == 28 && match_len > 5 { + saw_longer_match = true; + } + } + }); + + assert!( + saw_five_byte_match, + "dfast must accept the exact-5-byte match — a 6-byte floor would skip it" + ); + assert!( + !saw_longer_match, + "fixture pinned to length 5 — byte 33 ('F') must terminate the extension" + ); +} + #[test] fn driver_switches_backends_and_initializes_dfast_via_reset() { let mut driver = MatchGeneratorDriver::new(32, 2); @@ -4116,7 +4187,7 @@ fn dictionary_entropy_seed_initializes_opt_state_from_tables() { let ll = crate::fse::fse_encoder::default_ll_table(); let ml = crate::fse::fse_encoder::default_ml_table(); let of = crate::fse::fse_encoder::default_of_table(); - hc.seed_dictionary_entropy(Some(&huff), Some(&ll), Some(&ml), Some(&of)); + hc.seed_dictionary_entropy(Some(&huff), Some(ll), Some(ml), Some(of)); hc.backend.bt_mut().opt_state.rescale_freqs( b"abcd", @@ -4161,7 +4232,7 @@ fn dictionary_fse_seed_applies_without_huffman_seed() { let ll = crate::fse::fse_encoder::default_ll_table(); let ml = crate::fse::fse_encoder::default_ml_table(); let of = crate::fse::fse_encoder::default_of_table(); - hc.seed_dictionary_entropy(None, Some(&ll), Some(&ml), Some(&of)); + hc.seed_dictionary_entropy(None, Some(ll), Some(ml), Some(of)); hc.backend.bt_mut().opt_state.rescale_freqs( b"abcd", HcOptimalCostProfile::const_for_strategy::(), @@ -4204,7 +4275,7 @@ fn dictionary_seed_overrides_predef_price_mode_on_tiny_input() { let ll = crate::fse::fse_encoder::default_ll_table(); let ml = crate::fse::fse_encoder::default_ml_table(); let of = crate::fse::fse_encoder::default_of_table(); - hc.seed_dictionary_entropy(None, Some(&ll), Some(&ml), Some(&of)); + hc.seed_dictionary_entropy(None, Some(ll), Some(ml), Some(of)); hc.backend.bt_mut().opt_state.rescale_freqs( b"abc", HcOptimalCostProfile::const_for_strategy::(), @@ -4264,7 +4335,7 @@ fn btultra2_seed_pass_disabled_when_dictionary_entropy_seed_present() { let ll = crate::fse::fse_encoder::default_ll_table(); let ml = crate::fse::fse_encoder::default_ml_table(); let of = crate::fse::fse_encoder::default_of_table(); - hc.seed_dictionary_entropy(None, Some(&ll), Some(&ml), Some(&of)); + hc.seed_dictionary_entropy(None, Some(ll), Some(ml), Some(of)); assert!( !hc.should_run_btultra2_seed_pass::(HC_PREDEF_THRESHOLD + 1), "dictionary-seeded first block should skip btultra2 warmup pass" @@ -4410,7 +4481,7 @@ fn dictionary_huffman_seed_ignored_when_literals_uncompressed() { let ll = crate::fse::fse_encoder::default_ll_table(); let ml = crate::fse::fse_encoder::default_ml_table(); let of = crate::fse::fse_encoder::default_of_table(); - stats.seed_dictionary_entropy(Some(&huff), Some(&ll), Some(&ml), Some(&of)); + stats.seed_dictionary_entropy(Some(&huff), Some(ll), Some(ml), Some(of)); stats.rescale_freqs( b"abcd", HcOptimalCostProfile::const_for_strategy::(), @@ -4822,8 +4893,15 @@ fn driver_small_source_hint_shrinks_dfast_hash_tables() { space.truncate(12); driver.commit_space(space); driver.skip_matching_with_hint(None); - let full_tables = driver.dfast_matcher().short_hash.len(); - assert_eq!(full_tables, 1 << DFAST_HASH_BITS); + // Donor-parity split sizes: long-hash = DFAST_HASH_BITS, + // short-hash = DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA. + let full_long = driver.dfast_matcher().long_hash.len(); + let full_short = driver.dfast_matcher().short_hash.len(); + assert_eq!(full_long, 1 << DFAST_HASH_BITS); + assert_eq!( + full_short, + 1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA) + ); driver.set_source_size_hint(1024); driver.reset(CompressionLevel::Level(2)); @@ -4832,13 +4910,30 @@ fn driver_small_source_hint_shrinks_dfast_hash_tables() { space.truncate(12); driver.commit_space(space); driver.skip_matching_with_hint(None); - let hinted_tables = driver.dfast_matcher().short_hash.len(); + let hinted_long = driver.dfast_matcher().long_hash.len(); + let hinted_short = driver.dfast_matcher().short_hash.len(); assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG); - assert_eq!(hinted_tables, 1 << MIN_HINTED_WINDOW_LOG); + // At the hinted floor `MIN_HINTED_WINDOW_LOG`, the long table + // matches the hinted size; the short table sits one + // `DFAST_SHORT_HASH_BITS_DELTA` step below it, clamped at its own + // `MIN_WINDOW_LOG` floor. The one-bit split between the two tables + // is preserved — the short table is NOT pulled up to equal the + // long table at this floor. + assert_eq!(hinted_long, 1 << MIN_HINTED_WINDOW_LOG); + let expected_hinted_short_bits = (MIN_HINTED_WINDOW_LOG as usize) + .saturating_sub(DFAST_SHORT_HASH_BITS_DELTA) + .max(MIN_WINDOW_LOG as usize); + assert_eq!( + hinted_short, + 1 << expected_hinted_short_bits, + "short table must sit one DFAST_SHORT_HASH_BITS_DELTA below the long table \ + (clamped at MIN_WINDOW_LOG) — a regression that pulls it up to the long-table \ + floor would still satisfy the `< full_short` bound below and slip through" + ); assert!( - hinted_tables < full_tables, - "tiny source hint should reduce dfast table footprint" + hinted_long < full_long && hinted_short < full_short, + "tiny source hint should reduce both dfast tables" ); } @@ -5069,11 +5164,20 @@ fn driver_unhinted_level2_keeps_default_dfast_hash_table_size() { driver.commit_space(space); driver.skip_matching_with_hint(None); - let table_len = driver.dfast_matcher().short_hash.len(); + // Donor-parity split: long-hash at DFAST_HASH_BITS, short-hash one + // bit smaller (DFAST_SHORT_HASH_BITS_DELTA = 1, matching donor + // `chainLog = hashLog - 1` for dfast levels). + let long_len = driver.dfast_matcher().long_hash.len(); + let short_len = driver.dfast_matcher().short_hash.len(); assert_eq!( - table_len, + long_len, 1 << DFAST_HASH_BITS, - "unhinted Level(2) should keep default dfast table size" + "unhinted Level(2) should keep default long-hash table size" + ); + assert_eq!( + short_len, + 1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA), + "unhinted Level(2) short-hash should be one bit smaller than long-hash" ); } @@ -6856,7 +6960,15 @@ fn dfast_add_data_callback_reports_evicted_len_not_capacity() { } #[test] -fn dfast_trim_to_window_callback_reports_evicted_len_not_capacity() { +fn dfast_trim_to_window_evicts_oldest_block_by_length() { + // After the history-only storage refactor (#111 Phase 7c step 3), + // Dfast no longer retains input `Vec`s — the `history` + // contiguous buffer is the sole byte store, and `add_data` + // returns the input Vec to the caller's pool eagerly. So + // `trim_to_window` doesn't have anything to hand back to the + // closure (no Vec exists to give). The eviction is observable + // instead through `window_size` shrinking by the per-block + // length recorded in `window_blocks`. let mut matcher = DfastMatchGenerator::new(16); let mut first = Vec::with_capacity(64); @@ -6867,18 +6979,26 @@ fn dfast_trim_to_window_callback_reports_evicted_len_not_capacity() { second.extend_from_slice(b"ijklmnop"); matcher.add_data(second, |_| {}); + assert_eq!(matcher.window_size, 16); + assert_eq!(matcher.window_blocks.len(), 2); + matcher.max_window_size = 8; - let mut observed_evicted_len = None; - matcher.trim_to_window(|data| { - observed_evicted_len = Some(data.len()); - }); + matcher.trim_to_window(); + // No callback signature to assert on: the Dfast variant of + // `trim_to_window` takes none. That signature shape (vs HC/Row + // which accept `impl FnMut(Vec)`) is the property locking in + // the contract — there is no closure to invoke or skip, so no + // future change can "start invoking the callback" without a + // compile-time signature break that the dispatcher and this test + // would force the author to address. assert_eq!( - observed_evicted_len, - Some(8), - "trim callback must report evicted byte length, not backing capacity" + matcher.window_size, 8, + "exactly one 8-byte block must remain" ); + assert_eq!(matcher.window_blocks.len(), 1); + assert_eq!(matcher.history_abs_start, 8); } #[test] @@ -7036,18 +7156,16 @@ fn dfast_skip_matching_dense_backfills_newly_hashable_long_tail_positions() { target_rel + 8 <= live.len(), "fixture must make the boundary start long-hashable" ); - let long_hash = matcher.hash8(&live[target_rel..]); + let long_hash = matcher.long_hash_index(&live[target_rel..]); let target_slot = matcher.pack_slot(target_abs_pos); - // Guard against the membership check turning vacuous if a future - // `pack_slot` regression returned `DFAST_EMPTY_SLOT`: empty buckets - // are pre-filled with the sentinel, so `contains(&empty)` would - // pass without proving the real position was seeded. + // Single-slot tables (donor parity): the bucket holds at most one + // u32; the assertion below is a direct equality (no `.contains`). assert_ne!( target_slot, DFAST_EMPTY_SLOT, "pack_slot must never return the empty-slot sentinel for a real position" ); - assert!( - matcher.long_hash[long_hash].contains(&target_slot), + assert_eq!( + matcher.long_hash[long_hash], target_slot, "dense skip must seed long-hash entry for newly hashable boundary start" ); } @@ -7059,7 +7177,7 @@ fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() { matcher.add_data(block, |_| {}); matcher.ensure_hash_tables(); - let current_len = matcher.window.back().unwrap().len(); + let current_len = matcher.window_blocks.back().copied().unwrap_or(0); let current_abs_start = matcher.history_abs_start + matcher.window_size - current_len; let seed_start = current_len - DFAST_MIN_MATCH_LEN; matcher.seed_remaining_hashable_starts(current_abs_start, current_len, seed_start); @@ -7071,18 +7189,14 @@ fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() { target_rel + 4 <= live.len(), "fixture must leave the last short-hash start valid" ); - let short_hash = matcher.hash4(&live[target_rel..]); + let short_hash = matcher.short_hash_index(&live[target_rel..]); let target_slot = matcher.pack_slot(target_abs_pos); - // Guard against the membership check turning vacuous if a future - // `pack_slot` regression returned `DFAST_EMPTY_SLOT`: empty buckets - // are pre-filled with the sentinel, so `contains(&empty)` would - // pass without proving the real position was seeded. assert_ne!( target_slot, DFAST_EMPTY_SLOT, "pack_slot must never return the empty-slot sentinel for a real position" ); - assert!( - matcher.short_hash[short_hash].contains(&target_slot), + assert_eq!( + matcher.short_hash[short_hash], target_slot, "tail seeding must include the last 4-byte-hashable start" ); } @@ -7094,7 +7208,7 @@ fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() { matcher.add_data(block, |_| {}); matcher.ensure_hash_tables(); - let current_len = matcher.window.back().unwrap().len(); + let current_len = matcher.window_blocks.back().copied().unwrap_or(0); let current_abs_start = matcher.history_abs_start + matcher.window_size - current_len; matcher.seed_remaining_hashable_starts(current_abs_start, current_len, current_len); @@ -7105,18 +7219,14 @@ fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() { target_rel + 4 <= live.len(), "fixture must leave the last short-hash start valid" ); - let short_hash = matcher.hash4(&live[target_rel..]); + let short_hash = matcher.short_hash_index(&live[target_rel..]); let target_slot = matcher.pack_slot(target_abs_pos); - // Guard against the membership check turning vacuous if a future - // `pack_slot` regression returned `DFAST_EMPTY_SLOT`: empty buckets - // are pre-filled with the sentinel, so `contains(&empty)` would - // pass without proving the real position was seeded. assert_ne!( target_slot, DFAST_EMPTY_SLOT, "pack_slot must never return the empty-slot sentinel for a real position" ); - assert!( - matcher.short_hash[short_hash].contains(&target_slot), + assert_eq!( + matcher.short_hash[short_hash], target_slot, "tail seeding must still include the last 4-byte-hashable start when pos is at block end" ); } @@ -7152,8 +7262,8 @@ fn dfast_ensure_room_for_rebases_above_guard_band() { let early_abs = 1024usize; let early_packed = dfast.pack_slot(early_abs); assert_ne!(early_packed, DFAST_EMPTY_SLOT); - dfast.short_hash[0][0] = early_packed; - dfast.long_hash[0][0] = early_packed; + dfast.short_hash[0] = early_packed; + dfast.long_hash[0] = early_packed; // Pick a trigger position that forces the first rebase. With // `position_base = 0`, the smallest `abs_pos` that fails the @@ -7174,11 +7284,11 @@ fn dfast_ensure_room_for_rebases_above_guard_band() { // donor parity for `ZSTD_window_reduce`'s clamp-at-zero rule. // Verify BOTH tables — `reduce()` walks them in sequence. assert_eq!( - dfast.short_hash[0][0], DFAST_EMPTY_SLOT, + dfast.short_hash[0], DFAST_EMPTY_SLOT, "pre-rebase short-hash entries below the reducer must become empty" ); assert_eq!( - dfast.long_hash[0][0], DFAST_EMPTY_SLOT, + dfast.long_hash[0], DFAST_EMPTY_SLOT, "pre-rebase long-hash entries below the reducer must become empty" ); diff --git a/zstd/src/encoding/mod.rs b/zstd/src/encoding/mod.rs index b5b89815..64a3dda6 100644 --- a/zstd/src/encoding/mod.rs +++ b/zstd/src/encoding/mod.rs @@ -92,6 +92,55 @@ use alloc::vec::Vec; pub(crate) const BETTER_WINDOW_LOG: u8 = 23; +/// Worst-case compressed-size bound for `src_size` uncompressed +/// bytes. Mirrors the upstream `ZSTD_COMPRESSBOUND(srcSize)` macro +/// from `lib/zstd.h` (zstd 1.5.7) — the C *macro*, NOT the public +/// `ZSTD_compressBound()` function. The function adds frame/block- +/// header headroom in some code paths; the macro is a tighter +/// expression suitable for sizing an internal output Vec, which is +/// the only call site here. Callers that need a hard upper bound on +/// the wire-format compressed size should NOT reuse this — use the +/// `zstd-sys` function call directly. If the macro changes in a +/// future upstream revision, treat the upstream header as +/// authoritative and re-derive this function — do not trust the +/// inline copy without cross-checking. +/// +/// Inline approximation (this revision): +/// +/// ```text +/// srcSize +/// + (srcSize >> 8) +/// + (srcSize < 128 KiB ? ((128 KiB - srcSize) >> 11) : 0) +/// ``` +/// +/// Consulted by [`compress_slice_to_vec`] for the small-input branch +/// of its output-`Vec` seed: the seed is +/// `min(compress_bound(src), OUTPUT_BLOCK_CAP = 128 KiB)`, so for +/// inputs that fit within one donor block (`≤ ~128 KiB`) the +/// destination never reallocates inside the measured window. Without +/// the bound, `Vec::push` growth doubles in powers of two and pins +/// `~2 × final_compressed_size` resident at the last realloc, which +/// shows up as ~1 MiB of peak-RSS noise on 1 MiB inputs and prevents +/// FFI-parity on the memory bench. For inputs larger than the cap the +/// `Vec` still grows by amortized doubling — see +/// [`compress_slice_to_vec`] for the trade-off rationale. +#[inline] +pub(crate) const fn compress_bound(src_size: usize) -> usize { + const SMALL_INPUT_THRESHOLD: usize = 128 * 1024; + let tail = if src_size < SMALL_INPUT_THRESHOLD { + (SMALL_INPUT_THRESHOLD - src_size) >> 11 + } else { + 0 + }; + // `saturating_add` guards against the corner case where `src_size` + // is close to `usize::MAX`. The current sole caller + // (`compress_slice_to_vec`) clamps the result with `.min(OUTPUT_BLOCK_CAP)` + // so an overflow there would still produce a sensible cap, but + // future `pub(crate)` callers may use the raw bound — keep the + // arithmetic itself overflow-safe. + src_size.saturating_add(src_size >> 8).saturating_add(tail) +} + /// Convenience function to compress some source into a target without reusing any resources of the compressor /// ```rust /// use structured_zstd::encoding::{compress, CompressionLevel}; @@ -112,6 +161,50 @@ pub fn compress(source: R, target: W, level: CompressionLevel /// can provide a source-size hint to the one-shot encoder path. Peak memory can /// therefore be roughly `input_size + output_size`. For very large payloads or /// tighter memory budgets, prefer streaming APIs such as [`StreamingEncoder`]. +/// +/// **Peak-memory shape change in this revision.** The implementation +/// delegates to [`compress_slice_to_vec`], which seeds the output +/// `Vec` with `min(compress_bound(input.len()), OUTPUT_BLOCK_CAP = +/// 128 KiB)` instead of the previous `Vec::new()` (zero-capacity + +/// power-of-two growth). For inputs in the few-KiB to ~128 KiB range +/// this is a strict improvement (no doubling spikes inside the +/// measured window). For inputs significantly larger than 128 KiB the +/// allocation curve still grows by amortized doubling but starts from +/// a 128 KiB floor rather than 0. Downstream consumers that measure +/// peak RSS on this entry point will see a different curve than +/// pre-revision; bench shape, not steady-state, is what changed. +/// +/// **This is NOT a streaming API.** The source is fully buffered +/// into a `Vec` before any compression work begins, so peak input +/// memory is bounded by `source.len()` (not "constant regardless of +/// payload size" as a stream-shaped encoder would offer). The RSS +/// notes below apply to the materialization-then-compress shape; if +/// the source is large enough that holding it in memory is not +/// acceptable, use [`StreamingEncoder`] which consumes chunks +/// incrementally without the up-front Vec build. +/// +/// The other side of the peak shape is the input buffering: this +/// helper drives `read_to_end` to materialize the full source into a +/// `Vec` before forwarding the slice to [`compress_slice_to_vec`]. +/// For a `Read` whose size is unknown ahead of time, `read_to_end` +/// grows that input `Vec` via power-of-two doubling — peak input +/// allocation can be up to 2× the final source length transiently. +/// At the moment that input buffer crosses ~128 KiB the output Vec +/// seed kicks in concurrently. The total live working set on this +/// entry point is approximately +/// `input.capacity() + output_vec_seed + internal_accumulators`, +/// where `output_vec_seed` is `min(compress_bound(input.len()), 128 +/// KiB)` and `internal_accumulators` covers +/// `FrameCompressor::all_blocks` (pre-reserved at frame start, up to +/// ~130 KiB at default block cap) plus per-block scratch (hash tables, +/// literal/sequence staging). Round the helper's RSS peak to +/// `input.capacity() + output_vec_seed + ~130 KiB internal + +/// per-block scratch` rather than the bare `input.capacity() + 128 +/// KiB` figure quoted in earlier revisions, which only accounted for +/// the output seed. [`StreamingEncoder`] avoids the input +/// materialization step entirely and is the right entry point when +/// the source is large or unbounded. +/// /// ```rust /// use structured_zstd::encoding::{compress_to_vec, CompressionLevel}; /// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0]; @@ -121,11 +214,73 @@ pub fn compress_to_vec(source: R, level: CompressionLevel) -> Vec { let mut source = source; let mut input = Vec::new(); source.read_to_end(&mut input).unwrap(); + compress_slice_to_vec(input.as_slice(), level) +} - let mut vec = Vec::new(); +/// Compress a contiguous byte slice into a fresh `Vec` without +/// the input-buffering step that [`compress_to_vec`] performs to +/// adapt a `Read` source. Donor-parity peak-memory shape: the input +/// is read by reference, and the output `Vec` is seeded with +/// `min(compress_bound(src), OUTPUT_BLOCK_CAP = 128 KiB)`. +/// +/// The seed is intentionally capped at one donor block +/// (`OUTPUT_BLOCK_CAP = 128 KiB`). Three regimes: +/// +/// * Inputs up to ~127 KiB where `compress_bound(src) < +/// OUTPUT_BLOCK_CAP` — the `min` picks the tighter bound and the +/// `Vec` never reallocates inside the measured window (cheap, no +/// doubling spikes, no over-allocation). +/// * Inputs around 128 KiB where `compress_bound(src) >= +/// OUTPUT_BLOCK_CAP` — the `min` clamps to the cap, so the "no +/// reallocation" property holds only as long as the actual +/// compressed output also fits within `OUTPUT_BLOCK_CAP`. For +/// high-entropy inputs near the cap boundary the `Vec` may grow +/// by one doubling step before the next block lands. +/// * Larger inputs — the `Vec` grows via amortized doubling, with +/// peak transient memory ≈ 2× current compressed size at the last +/// realloc. Deliberate trade-off against pinning +/// `compress_bound(src)` upfront, which on the 100 MiB +/// `large-log-stream` scenario pinned 100.4 MiB even though the +/// actual compressed output was ≪ 1 MiB. +/// +/// # Panics +/// +/// Panics on encoder error (matches the failure surface of +/// [`compress_to_vec`], which this function backs). The internal +/// [`FrameCompressor::compress`] call propagates `io::Error` from the +/// output [`Vec`] writer; under the in-tree default writer that +/// channel is infallible and any error becomes a `panic`. Out-of- +/// memory during `Vec::with_capacity` or the encoder's per-block +/// scratch allocations is handled by the global allocator's abort +/// policy. Migrating to a fallible variant requires plumbing +/// `Result` through `FrameCompressor::compress` — out of scope for +/// the slice/Vec entry points which mirror the donor `ZSTD_compress` +/// shape (no error return on the bulk path). +/// +/// ```rust +/// use structured_zstd::encoding::{compress_slice_to_vec, CompressionLevel}; +/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0]; +/// let compressed = compress_slice_to_vec(data, CompressionLevel::Fastest); +/// ``` +pub fn compress_slice_to_vec(source: &[u8], level: CompressionLevel) -> Vec { + // Initial capacity sized to a single output block, matching donor + // `ZSTD_CStreamOutSize()` (≈ `ZSTD_BLOCKSIZE_MAX` + headroom). For + // small inputs `compress_bound(src)` is smaller than this block — + // use the tighter bound to avoid over-allocation. For larger inputs + // the `Vec` grows via amortized doubling, with peak ≈ 2×current + // compressed size at the last realloc (donor streaming shape: the + // caller buffer is one block, not `ZSTD_compressBound(srcSize)`). + // Pre-allocating `compress_bound(src)` here would charge worst-case + // expansion against peak even on highly compressible inputs — on + // the 100 MiB `large-log-stream` scenario that single allocation + // dominated the encoder's measured footprint (`compress_bound` ≈ + // 100.4 MiB pinned, actual compressed output ≪ 1 MiB). + const OUTPUT_BLOCK_CAP: usize = 128 * 1024; + let cap = compress_bound(source.len()).min(OUTPUT_BLOCK_CAP); + let mut vec = Vec::with_capacity(cap); let mut frame_enc = FrameCompressor::new(level); - frame_enc.set_source_size_hint(input.len() as u64); - frame_enc.set_source(input.as_slice()); + frame_enc.set_source_size_hint(source.len() as u64); + frame_enc.set_source(source); frame_enc.set_drain(&mut vec); frame_enc.compress(); vec diff --git a/zstd/src/fse/fse_encoder.rs b/zstd/src/fse/fse_encoder.rs index 40253516..ee420723 100644 --- a/zstd/src/fse/fse_encoder.rs +++ b/zstd/src/fse/fse_encoder.rs @@ -699,14 +699,105 @@ const OF_DIST: &[i32] = &[ 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, ]; -pub(crate) fn default_ml_table() -> FSETable { - build_table_from_probabilities(ML_DIST, 6) +// The three predefined LL/ML/OF distribution tables are pure functions +// of compile-time constants (`LL_DIST`, `ML_DIST`, `OF_DIST` and fixed +// `acc_log` values). Each `build_table_from_probabilities` call costs +// ~12 µs on a modern x86_64 — multiplied by three, that's ~38 µs of +// pure setup paid on every `FrameCompressor::new`. On a 1 KiB frame +// the actual compression work is ~1-5 µs, so the predefined-table +// build dominates the per-frame cost for tiny inputs. +// +// Cache each predefined table once per process via a dedicated +// `AtomicPtr` and hand callers a clone. The per-distribution +// cache slot is encoded by the static identity of the cache itself +// (each helper has its own `static`), NOT by pointer comparison on +// the distribution slice — `LL_DIST` / `ML_DIST` / `OF_DIST` are +// `const`, and `core::ptr::eq` on the materialized slice pointer of +// a `const` is only stable as long as rustc keeps lowering it to a +// single anonymous static. A future rustc change that duplicates the +// const at each use site would silently sink every call through the +// "unknown slice" branch and bypass the cache forever. Owning the +// `static AtomicPtr` per-helper means the cache slot is selected at +// compile time without consulting the slice pointer at all. +// +// `AtomicPtr` lock-free init works in both `std` and +// `no_std` builds — only `alloc` is required (always available in +// this crate via `extern crate alloc`), which keeps the cache on a +// single code path instead of a `cfg(feature = "std")` branch with +// an eager rebuild fallback. Memory ordering: `Acquire` on the load +// so the consumer sees a fully-published `FSETable`; `AcqRel` on the +// compare-exchange so the publishing side both reads any concurrent +// winner (`Acquire`) and publishes its store (`Release`). The first +// writer leaks `Box` intentionally — the cache lives for +// the entire program lifetime, exactly like a `LazyLock`. +fn get_or_init_cached_table( + cache: &core::sync::atomic::AtomicPtr, + probs: &[i32], + acc_log: u8, +) -> &'static FSETable { + use core::sync::atomic::Ordering; + + let cur = cache.load(Ordering::Acquire); + if !cur.is_null() { + // SAFETY: a non-null entry in this cache was published by a + // previous winner of the `compare_exchange` below, which + // leaked the `Box` (the cache never frees, mirroring + // a `LazyLock` lifetime). The pointed-to allocation is + // therefore valid for `'static` and immutable for the rest + // of the program, so handing out a `&'static FSETable` to + // multiple threads is sound. + return unsafe { &*cur }; + } + + let built = alloc::boxed::Box::new(build_table_from_probabilities(probs, acc_log)); + let raw = alloc::boxed::Box::into_raw(built); + // `AcqRel` on success rather than the minimal `Release`: the + // success path doesn't actually need Acquire (no prior loads to + // synchronise with at the publication point), but matching the + // failure Acquire ordering keeps the success and failure + // branches symmetric on rustc's atomic surface — both arms then + // see the same memory-fence shape for whatever follows. The + // marginal cost is one extra fence instruction on x86/aarch64; + // this path runs at most three times per process so it never + // shows up in a flamegraph. + match cache.compare_exchange( + core::ptr::null_mut(), + raw, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => { + // We installed the singleton. SAFETY: `raw` is the + // pointer we just published; no other thread can free it. + unsafe { &*raw } + } + Err(existing) => { + // Another thread beat us to the publish — reclaim our + // throwaway allocation and use the winner. SAFETY: we own + // `raw` here (compare_exchange did NOT store it), so + // reconstituting the `Box` is sound. `existing` was + // published by the winning thread and is leaked for + // `'static`, mirroring the `cur` path. + drop(unsafe { alloc::boxed::Box::from_raw(raw) }); + unsafe { &*existing } + } + } +} + +pub(crate) fn default_ml_table() -> &'static FSETable { + static CACHE: core::sync::atomic::AtomicPtr = + core::sync::atomic::AtomicPtr::new(core::ptr::null_mut()); + get_or_init_cached_table(&CACHE, ML_DIST, 6) } -pub(crate) fn default_ll_table() -> FSETable { - build_table_from_probabilities(LL_DIST, 6) +pub(crate) fn default_ll_table() -> &'static FSETable { + static CACHE: core::sync::atomic::AtomicPtr = + core::sync::atomic::AtomicPtr::new(core::ptr::null_mut()); + get_or_init_cached_table(&CACHE, LL_DIST, 6) } -pub(crate) fn default_of_table() -> FSETable { - build_table_from_probabilities(OF_DIST, 5) +pub(crate) fn default_of_table() -> &'static FSETable { + static CACHE: core::sync::atomic::AtomicPtr = + core::sync::atomic::AtomicPtr::new(core::ptr::null_mut()); + get_or_init_cached_table(&CACHE, OF_DIST, 5) } diff --git a/zstd/src/fse/mod.rs b/zstd/src/fse/mod.rs index 5e410322..170c34da 100644 --- a/zstd/src/fse/mod.rs +++ b/zstd/src/fse/mod.rs @@ -95,9 +95,9 @@ fn table_header_bits_exact() { }; // Predefined tables - check(&default_ll_table()); - check(&default_ml_table()); - check(&default_of_table()); + check(default_ll_table()); + check(default_ml_table()); + check(default_of_table()); // Tables built from synthetic data let data: alloc::vec::Vec = (0u8..32).cycle().take(1000).collect(); diff --git a/zstd/src/tests/fuzz_regressions.rs b/zstd/src/tests/fuzz_regressions.rs index a9fa5c95..b1b25494 100644 --- a/zstd/src/tests/fuzz_regressions.rs +++ b/zstd/src/tests/fuzz_regressions.rs @@ -25,3 +25,72 @@ fn test_all_artifacts() { .and_then(|()| frame_dec.decode_blocks(&mut f, BlockDecodingStrategy::All)); } } + +/// Regression for the `interop` fuzz target: a 7-byte input crashed the +/// level 3 dfast hot loop because `start_matching_fast_loop` guarded the +/// loop with `pos + DFAST_MIN_MATCH_LEN <= current_len` (`MIN_MATCH = 5`) +/// but unconditionally issued 8-byte `u64` loads via raw-pointer +/// `read_unaligned` for the long-hash probe. On any block whose tail +/// landed within `[current_len - 8, current_len - 5]` the load read past +/// `history.len()`, which is UB on `*const u64::read_unaligned` even if +/// the underlying `Vec`'s spare capacity covers the bytes. +/// +/// The fix tightens every fast-loop guard to `+ HASH_READ_SIZE = 8` so +/// the load is always in-bounds for the live history, matching donor +/// `ilimit = iend - HASH_READ_SIZE` in `zstd_double_fast.c`. +/// +/// Artifact: `zstd/fuzz/artifacts/interop/crash-01be...0dc7`. Base64 +/// `BGAuICAKIA==` → bytes `04 60 2e 20 20 0a 20`. CI fuzz run that +/// produced this artifact: +/// https://github.com/structured-world/structured-zstd/actions/runs/25974756307 +/// +/// SIGNAL CAVEAT: a plain `cargo nextest run` of this test may pass +/// against the pre-fix code because the OOB `*const u64::read_unaligned` +/// usually lands inside the live `Vec`'s spare capacity — the bytes are +/// well-defined for the allocator even though the read is UB. The +/// regression reliably fires only under a sanitizer that tracks valid +/// length (CI fuzz job runs ASan; `cargo +nightly miri test` also +/// catches it). Treat a green `cargo test` here as a smoke check, not +/// proof that the fast-loop guards are correct; the authoritative +/// signal for this fixture is the Linux fuzz CI job. +#[test] +fn interop_7_byte_input_does_not_oob_in_dfast_fast_loop() { + extern crate std; + use crate::decoding::{BlockDecodingStrategy, FrameDecoder}; + use crate::encoding::{CompressionLevel, compress_to_vec}; + + // Bytes inline; the original libFuzzer artifact file was + // content-hashed (`crash-01be...0dc7`) so any future fuzz run + // that re-discovers the same root cause via a different input + // would land in a different filename anyway — the literal is + // the canonical regression vector, not the artifact path. + // Base64 `BGAuICAKIA==`. + let data: &[u8] = &[0x04, 0x60, 0x2e, 0x20, 0x20, 0x0a, 0x20]; + + // Pin to `Level(3)` rather than `Default` so this regression keeps + // covering the dfast fast loop specifically — the original UB + // surfaced through level 3 dfast, and pinning here means a future + // retune of the `Default` alias cannot accidentally route this + // test off the dfast path and let the regression pass without + // exercising the fixed code. Pre-fix this panicked / produced a + // garbage frame on Linux fuzz (ASan caught the UB). + let compressed = compress_to_vec(data, CompressionLevel::Level(3)); + + // Roundtrip through the in-tree decoder — matches the convention + // used by `test_all_artifacts` above and avoids coupling this + // regression to the donor `zstd` crate. The OOB load shows up as + // a panic / decode error before this point under ASan; if we get + // here with a parseable frame the bytes must match the input. + let mut frame_dec = FrameDecoder::new(); + let mut cursor = compressed.as_slice(); + frame_dec.reset(&mut cursor).unwrap(); + frame_dec + .decode_blocks(&mut cursor, BlockDecodingStrategy::All) + .unwrap(); + // `expect` over `unwrap_or_default`: a real decoder failure must + // surface as a "decoder returned None" panic, not as an empty + // `decoded` that then fails `assert_eq!` with a misleading + // "left: [] right: [04 60 ...]" diff that hides the real cause. + let decoded = frame_dec.collect().expect("decoder returned no payload"); + assert_eq!(decoded.as_slice(), data); +}