From 437c2f930743d2aa5f3090fb90fc3b40d84be66a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 27 Apr 2026 11:36:34 -0400 Subject: [PATCH 1/6] pack-objects: pass --objects with --path-walk When 'git pack-objects' has the --path-walk option enabled, it uses a different set of revision walk parameters than normal. For once, --objects was previously assumed by the path-walk API and was not needed to be added. We also needed --boundary to allow discovering UNINTERESTING objects to use as delta bases. We will be updating the path-walk API soon to work with some filter options. However, the revision machinery will trigger a fatal error: fatal: object filtering requires --objects The fix is easy: add the --objects option as an argument. This has no effect on the path-walk API but does simplify the revision option parsing for the objects filter. We can remove the comment about "removing" the options because they were never removed and instead not added. We still need to disable using bitmaps. Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index dd2480a73d2edf..4338962904bc94 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -5190,10 +5190,7 @@ int cmd_pack_objects(int argc, } if (path_walk) { strvec_push(&rp, "--boundary"); - /* - * We must disable the bitmaps because we are removing - * the --objects / --objects-edge[-aggressive] options. - */ + strvec_push(&rp, "--objects"); use_bitmap_index = 0; } else if (thin) { use_internal_rev_list = 1; From af45e23dbc335148e2ad4b4414fa22817137bc51 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 27 Apr 2026 09:02:45 -0400 Subject: [PATCH 2/6] path-walk: support blobless filter The 'git pack-objects' command can opt-in to using the path-walk API for scanning the objects. Currently, this option is dynamically disabled if combined with '--filter=', even when using a simple filter such as 'blob:none' to signal a blobless packfile. This is a common scenario for repos at scale, so is worth integrating. Also, users can opt-in to the '--path-walk' option by default through the pack.usePathWalk=true config option. When using that in a blobless partial clone, the following warning can appear even though the user did not specify either option directly: warning: cannot use --filter with --path-walk Teach the path-walk API to handle the 'blob:none' object filter natively. When revs->filter.choice is LOFC_BLOB_NONE, the path-walk sets info->blobs to 0 (skipping all blob objects) and clears the filter from revs so that prepare_revision_walk() does not reject the configuration. This check is implemented in the static prepare_filters() method, which will simultaneously check if the input filters are compatible and will make the appropriate mutations to the path_walk_info and filters if the path_walk_info is non-NULL. This allows us to use this logic both in the API method path_walk_filter_compatible() for use in builtin/pack-objects.c and as a prep step in walk_objects_by_path(). Update the test helper (test-path-walk) to accept --filter= as a test-tool option (before '--'), applying it to revs after setup_revisions() to avoid the --objects requirement check. Also switch test-path-walk from REV_INFO_INIT with manual repo assignment to repo_init_revisions(), which properly initializes the filter_spec strbuf needed for filter parsing. Add tests for blob:none with --all and with a single branch. Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 2 +- path-walk.c | 30 ++++++++++++++++++++ path-walk.h | 7 +++++ t/helper/test-path-walk.c | 11 ++++++- t/t6601-path-walk.sh | 60 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4338962904bc94..bc9fb5b45737a3 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -5177,7 +5177,7 @@ int cmd_pack_objects(int argc, if (path_walk) { const char *option = NULL; - if (filter_options.choice) + if (!path_walk_filter_compatible(&filter_options)) option = "--filter"; else if (use_delta_islands) option = "--delta-islands"; diff --git a/path-walk.c b/path-walk.c index 6e426af4330893..a4dd197c37e624 100644 --- a/path-walk.c +++ b/path-walk.c @@ -9,6 +9,7 @@ #include "hashmap.h" #include "hex.h" #include "list-objects.h" +#include "list-objects-filter-options.h" #include "object.h" #include "oid-array.h" #include "path.h" @@ -485,6 +486,32 @@ static int setup_pending_objects(struct path_walk_info *info, return 0; } +static int prepare_filters(struct path_walk_info *info, + struct list_objects_filter_options *options) +{ + switch (options->choice) { + case LOFC_DISABLED: + return 1; + + case LOFC_BLOB_NONE: + if (info) { + info->blobs = 0; + list_objects_filter_release(options); + } + return 1; + + default: + error(_("object filter '%s' not supported by the path-walk API"), + list_objects_filter_spec(options)); + return 0; + } +} + +int path_walk_filter_compatible(struct list_objects_filter_options *options) +{ + return prepare_filters(NULL, options); +} + /** * Given the configuration of 'info', walk the commits based on 'info->revs' and * call 'info->path_fn' on each discovered path. @@ -512,6 +539,9 @@ int walk_objects_by_path(struct path_walk_info *info) trace2_region_enter("path-walk", "commit-walk", info->revs->repo); + if (!prepare_filters(info, &info->revs->filter)) + return -1; + CALLOC_ARRAY(commit_list, 1); commit_list->type = OBJ_COMMIT; diff --git a/path-walk.h b/path-walk.h index 5ef5a8440e6b5e..be8d27b39815af 100644 --- a/path-walk.h +++ b/path-walk.h @@ -85,3 +85,10 @@ void path_walk_info_clear(struct path_walk_info *info); * Returns nonzero on an error. */ int walk_objects_by_path(struct path_walk_info *info); + +struct list_objects_filter_options; +/** + * Given a set of options for filtering objects, return 1 if the options + * are compatible with the path-walk API and 0 otherwise. + */ +int path_walk_filter_compatible(struct list_objects_filter_options *options); diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c index fe63002c2be27d..88f86ae0dc1157 100644 --- a/t/helper/test-path-walk.c +++ b/t/helper/test-path-walk.c @@ -4,6 +4,7 @@ #include "dir.h" #include "environment.h" #include "hex.h" +#include "list-objects-filter-options.h" #include "object-name.h" #include "object.h" #include "pretty.h" @@ -71,6 +72,8 @@ int cmd__path_walk(int argc, const char **argv) struct rev_info revs = REV_INFO_INIT; struct path_walk_info info = PATH_WALK_INFO_INIT; struct path_walk_test_data data = { 0 }; + struct list_objects_filter_options filter_options = + LIST_OBJECTS_FILTER_INIT; struct option options[] = { OPT_BOOL(0, "blobs", &info.blobs, N_("toggle inclusion of blob objects")), @@ -86,11 +89,12 @@ int cmd__path_walk(int argc, const char **argv) N_("toggle aggressive edge walk")), OPT_BOOL(0, "stdin-pl", &stdin_pl, N_("read a pattern list over stdin")), + OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_END(), }; setup_git_directory(); - revs.repo = the_repository; + repo_init_revisions(the_repository, &revs, NULL); argc = parse_options(argc, argv, NULL, options, path_walk_usage, @@ -101,6 +105,10 @@ int cmd__path_walk(int argc, const char **argv) else usage(path_walk_usage[0]); + /* Apply the filter after setup_revisions to avoid the --objects check. */ + if (filter_options.choice) + list_objects_filter_copy(&revs.filter, &filter_options); + info.revs = &revs; info.path_fn = emit_block; info.path_fn_data = &data; @@ -129,6 +137,7 @@ int cmd__path_walk(int argc, const char **argv) free(info.pl); } + list_objects_filter_release(&filter_options); release_revisions(&revs); return res; } diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh index 56bd1e3c5bec97..94df309987b011 100755 --- a/t/t6601-path-walk.sh +++ b/t/t6601-path-walk.sh @@ -415,4 +415,64 @@ test_expect_success 'trees are reported exactly once' ' test_line_count = 1 out-filtered ' +test_expect_success 'all, blob:none filter' ' + test-tool path-walk --filter=blob:none -- --all >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse topic) + 0:commit::$(git rev-parse base) + 0:commit::$(git rev-parse base~1) + 0:commit::$(git rev-parse base~2) + 1:tag:/tags:$(git rev-parse refs/tags/first) + 1:tag:/tags:$(git rev-parse refs/tags/second.1) + 1:tag:/tags:$(git rev-parse refs/tags/second.2) + 1:tag:/tags:$(git rev-parse refs/tags/third) + 1:tag:/tags:$(git rev-parse refs/tags/fourth) + 1:tag:/tags:$(git rev-parse refs/tags/tree-tag) + 1:tag:/tags:$(git rev-parse refs/tags/blob-tag) + 2:tree::$(git rev-parse topic^{tree}) + 2:tree::$(git rev-parse base^{tree}) + 2:tree::$(git rev-parse base~1^{tree}) + 2:tree::$(git rev-parse base~2^{tree}) + 2:tree::$(git rev-parse refs/tags/tree-tag^{}) + 2:tree::$(git rev-parse refs/tags/tree-tag2^{}) + 3:tree:a/:$(git rev-parse base:a) + 4:tree:child/:$(git rev-parse refs/tags/tree-tag:child) + 5:tree:left/:$(git rev-parse base:left) + 5:tree:left/:$(git rev-parse base~2:left) + 6:tree:right/:$(git rev-parse topic:right) + 6:tree:right/:$(git rev-parse base~1:right) + 6:tree:right/:$(git rev-parse base~2:right) + blobs:0 + commits:4 + tags:7 + trees:13 + EOF + + test_cmp_sorted expect out +' + +test_expect_success 'topic only, blob:none filter' ' + test-tool path-walk --filter=blob:none -- topic >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse topic) + 0:commit::$(git rev-parse base~1) + 0:commit::$(git rev-parse base~2) + 1:tree::$(git rev-parse topic^{tree}) + 1:tree::$(git rev-parse base~1^{tree}) + 1:tree::$(git rev-parse base~2^{tree}) + 2:tree:left/:$(git rev-parse base~2:left) + 3:tree:right/:$(git rev-parse topic:right) + 3:tree:right/:$(git rev-parse base~1:right) + 3:tree:right/:$(git rev-parse base~2:right) + blobs:0 + commits:3 + tags:0 + trees:7 + EOF + + test_cmp_sorted expect out +' + test_done From 889954e1aa0435ae4b19336ff11db0fd17f43692 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 27 Apr 2026 11:00:59 -0400 Subject: [PATCH 3/6] backfill: die on incompatible filter options The 'git backfill' command uses the path-walk API in a critical way: it uses the objects output from the command to find the batches of missing objects that should be requested from the server. Unlike 'git pack-objects', we cannot fall back to another mechanism. The previous change added the path_walk_filter_compatible() method that we can reuse here. Use it during argument validation in cmd_backfill(). Signed-off-by: Derrick Stolee --- builtin/backfill.c | 2 ++ t/t5620-backfill.sh | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/builtin/backfill.c b/builtin/backfill.c index d794dd842f65ce..51eaa42169ee6f 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -144,6 +144,8 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit if (argc > 1) die(_("unrecognized argument: %s"), argv[1]); + if (!path_walk_filter_compatible(&ctx.revs.filter)) + die(_("cannot backfill with these filter options")); repo_config(repo, git_default_config, NULL); diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh index f3b5e39493677b..3580e10b9c08e3 100755 --- a/t/t5620-backfill.sh +++ b/t/t5620-backfill.sh @@ -15,6 +15,14 @@ test_expect_success 'backfill rejects unexpected arguments' ' test_grep "unrecognized argument: --unexpected-arg" err ' +test_expect_success 'backfill rejects incompatible filter options' ' + test_must_fail git backfill --objects --filter=tree:1 2>err && + test_grep "cannot backfill with these filter options" err && + + test_must_fail git backfill --objects --filter=blob:limit=10m 2>err && + test_grep "cannot backfill with these filter options" err +' + # We create objects in the 'src' repo. test_expect_success 'setup repo for object creation' ' echo "{print \$1}" >print_1.awk && From 32ef3865c25182b89c5c3b0b5f593710e3f691d6 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 27 Apr 2026 10:15:33 -0400 Subject: [PATCH 4/6] path-walk: support blob size limit filter Extend the path-walk API to handle the 'blob:limit=' object filter natively. This filter omits blobs whose size is equal to or greater than the given limit, matching the semantics used by the list-objects-filter machinery. When revs->filter.choice is LOFC_BLOB_LIMIT, the prepare_filters() method stores the limit value in info->blob_limit and clears the filter from revs. If the limit is zero, this degenerates to blob:none (all blobs excluded), so info->blobs is set to 0 instead. During walk_path(), blob batches are filtered before being delivered to the callback: each blob's size is checked via odb_read_object_info(), and only blobs strictly smaller than the limit are included. Blobs whose size cannot be determined (e.g. missing in a partial clone) are conservatively included, matching the existing filter behavior. Empty batches after filtering are skipped entirely. The check for inclusion in the path batch looks a little strange at first glance. We use odb_read_object_info() to read the object's size. Based on all of the assumptions to this point, this _should_ return OBJ_BLOB. Since we are focused on the size filter, we use a short-circuited OR (||) to skip the size check if that method returns a different object type. Notice that this inspection of object sizes requires the content to be present in the repository. The odb_read_object_info() call will download a missing blob on-demand. This means that the use of the path-walk API within 'git backfill' would not operate nicely with this filter type. The intention of that command is to download missing blobs in batches. Downloading objects one-by-one would go against the point. Update the validation in 'git backfill' to add its own compatibility check on top of path_walk_filter_compatible(). Add tests for blob:limit=0 (equivalent to blob:none) and blob:limit=3 (which exercises partial filtering within a batch where some blobs are kept and others are excluded). Signed-off-by: Derrick Stolee --- builtin/backfill.c | 2 ++ path-walk.c | 38 +++++++++++++++++++-- path-walk.h | 8 +++++ t/t5620-backfill.sh | 2 +- t/t6601-path-walk.sh | 78 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 124 insertions(+), 4 deletions(-) diff --git a/builtin/backfill.c b/builtin/backfill.c index 51eaa42169ee6f..7ef9dc305e1bc5 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -146,6 +146,8 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit die(_("unrecognized argument: %s"), argv[1]); if (!path_walk_filter_compatible(&ctx.revs.filter)) die(_("cannot backfill with these filter options")); + if (ctx.revs.filter.blob_limit_value) + die(_("cannot backfill with blob size limits")); repo_config(repo, git_default_config, NULL); diff --git a/path-walk.c b/path-walk.c index a4dd197c37e624..0e7dab7a6a68bf 100644 --- a/path-walk.c +++ b/path-walk.c @@ -10,6 +10,7 @@ #include "hex.h" #include "list-objects.h" #include "list-objects-filter-options.h" +#include "odb.h" #include "object.h" #include "oid-array.h" #include "path.h" @@ -315,9 +316,29 @@ static int walk_path(struct path_walk_context *ctx, /* Evaluate function pointer on this data, if requested. */ if ((list->type == OBJ_TREE && ctx->info->trees) || (list->type == OBJ_BLOB && ctx->info->blobs) || - (list->type == OBJ_TAG && ctx->info->tags)) - ret = ctx->info->path_fn(path, &list->oids, list->type, - ctx->info->path_fn_data); + (list->type == OBJ_TAG && ctx->info->tags)) { + struct oid_array *oids = &list->oids; + struct oid_array filtered = OID_ARRAY_INIT; + + if (list->type == OBJ_BLOB && ctx->info->blob_limit) { + for (size_t i = 0; i < list->oids.nr; i++) { + unsigned long size; + + if (odb_read_object_info(ctx->repo->objects, + &list->oids.oid[i], + &size) != OBJ_BLOB || + size < ctx->info->blob_limit) + oid_array_append(&filtered, + &list->oids.oid[i]); + } + oids = &filtered; + } + + if (oids->nr) + ret = ctx->info->path_fn(path, oids, list->type, + ctx->info->path_fn_data); + oid_array_clear(&filtered); + } /* Expand data for children. */ if (list->type == OBJ_TREE) { @@ -500,6 +521,17 @@ static int prepare_filters(struct path_walk_info *info, } return 1; + case LOFC_BLOB_LIMIT: + if (info) { + if (!options->blob_limit_value) { + info->blobs = 0; + } else { + info->blob_limit = options->blob_limit_value; + } + list_objects_filter_release(options); + } + return 1; + default: error(_("object filter '%s' not supported by the path-walk API"), list_objects_filter_spec(options)); diff --git a/path-walk.h b/path-walk.h index be8d27b39815af..bcb81b70a14272 100644 --- a/path-walk.h +++ b/path-walk.h @@ -42,6 +42,14 @@ struct path_walk_info { int blobs; int tags; + /** + * If non-zero, specifies a maximum blob size. Blobs with a + * size equal to or greater than this limit will be omitted + * from the walk. Blobs smaller than the limit (or blobs + * whose size cannot be determined) are still visited. + */ + unsigned long blob_limit; + /** * When 'prune_all_uninteresting' is set and a path has all objects * marked as UNINTERESTING, then the path-walk will not visit those diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh index 3580e10b9c08e3..3c8a75192a4791 100755 --- a/t/t5620-backfill.sh +++ b/t/t5620-backfill.sh @@ -20,7 +20,7 @@ test_expect_success 'backfill rejects incompatible filter options' ' test_grep "cannot backfill with these filter options" err && test_must_fail git backfill --objects --filter=blob:limit=10m 2>err && - test_grep "cannot backfill with these filter options" err + test_grep "cannot backfill with blob size limits" err ' # We create objects in the 'src' repo. diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh index 94df309987b011..d9be7b9cd21ee0 100755 --- a/t/t6601-path-walk.sh +++ b/t/t6601-path-walk.sh @@ -475,4 +475,82 @@ test_expect_success 'topic only, blob:none filter' ' test_cmp_sorted expect out ' +test_expect_success 'all, blob:limit=0 filter' ' + test-tool path-walk --filter=blob:limit=0 -- --all >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse topic) + 0:commit::$(git rev-parse base) + 0:commit::$(git rev-parse base~1) + 0:commit::$(git rev-parse base~2) + 1:tag:/tags:$(git rev-parse refs/tags/first) + 1:tag:/tags:$(git rev-parse refs/tags/second.1) + 1:tag:/tags:$(git rev-parse refs/tags/second.2) + 1:tag:/tags:$(git rev-parse refs/tags/third) + 1:tag:/tags:$(git rev-parse refs/tags/fourth) + 1:tag:/tags:$(git rev-parse refs/tags/tree-tag) + 1:tag:/tags:$(git rev-parse refs/tags/blob-tag) + 2:tree::$(git rev-parse topic^{tree}) + 2:tree::$(git rev-parse base^{tree}) + 2:tree::$(git rev-parse base~1^{tree}) + 2:tree::$(git rev-parse base~2^{tree}) + 2:tree::$(git rev-parse refs/tags/tree-tag^{}) + 2:tree::$(git rev-parse refs/tags/tree-tag2^{}) + 3:tree:a/:$(git rev-parse base:a) + 4:tree:child/:$(git rev-parse refs/tags/tree-tag:child) + 5:tree:left/:$(git rev-parse base:left) + 5:tree:left/:$(git rev-parse base~2:left) + 6:tree:right/:$(git rev-parse topic:right) + 6:tree:right/:$(git rev-parse base~1:right) + 6:tree:right/:$(git rev-parse base~2:right) + blobs:0 + commits:4 + tags:7 + trees:13 + EOF + + test_cmp_sorted expect out +' + +test_expect_success 'all, blob:limit=3 filter' ' + test-tool path-walk --filter=blob:limit=3 -- --all >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse topic) + 0:commit::$(git rev-parse base) + 0:commit::$(git rev-parse base~1) + 0:commit::$(git rev-parse base~2) + 1:tag:/tags:$(git rev-parse refs/tags/first) + 1:tag:/tags:$(git rev-parse refs/tags/second.1) + 1:tag:/tags:$(git rev-parse refs/tags/second.2) + 1:tag:/tags:$(git rev-parse refs/tags/third) + 1:tag:/tags:$(git rev-parse refs/tags/fourth) + 1:tag:/tags:$(git rev-parse refs/tags/tree-tag) + 1:tag:/tags:$(git rev-parse refs/tags/blob-tag) + 2:tree::$(git rev-parse topic^{tree}) + 2:tree::$(git rev-parse base^{tree}) + 2:tree::$(git rev-parse base~1^{tree}) + 2:tree::$(git rev-parse base~2^{tree}) + 2:tree::$(git rev-parse refs/tags/tree-tag^{}) + 2:tree::$(git rev-parse refs/tags/tree-tag2^{}) + 3:blob:a:$(git rev-parse base~2:a) + 4:tree:a/:$(git rev-parse base:a) + 5:tree:child/:$(git rev-parse refs/tags/tree-tag:child) + 6:tree:left/:$(git rev-parse base:left) + 6:tree:left/:$(git rev-parse base~2:left) + 7:blob:left/b:$(git rev-parse base~2:left/b) + 8:tree:right/:$(git rev-parse topic:right) + 8:tree:right/:$(git rev-parse base~1:right) + 8:tree:right/:$(git rev-parse base~2:right) + 9:blob:right/c:$(git rev-parse base~2:right/c) + 10:blob:right/d:$(git rev-parse base~1:right/d) + blobs:4 + commits:4 + tags:7 + trees:13 + EOF + + test_cmp_sorted expect out +' + test_done From 123769455ee9f3b69c9025111975f36cd2dfb046 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 27 Apr 2026 11:36:48 -0400 Subject: [PATCH 5/6] t/perf: add pack-objects filter and path-walk benchmark Add p5315-pack-objects-filter.sh to measure the performance of 'git pack-objects --revs --all' under different filter and traversal combinations: * no filter (baseline) * --filter=blob:none (blobless) * --filter=sparse:oid= (cone-mode sparse) Each filter scenario is tested both with and without --path-walk, producing paired measurements that show the impact of the path-walk traversal for each filter type. The sparse filter definition is built automatically by sampling depth-2 directories from the test repository, making the test work on any repo passed via GIT_PERF_LARGE_REPO. For repos that lack depth-2 directories, a single top-level directory is used; for flat repos, the sparse tests are skipped via prerequisite. Currently the sparse:oid filter is not yet supported by the path-walk API, so the --path-walk variant silently falls back to the standard traversal. This establishes the baseline for measuring improvement once sparse filter support is added. Signed-off-by: Derrick Stolee --- t/perf/p5315-pack-objects-filter.sh | 129 ++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100755 t/perf/p5315-pack-objects-filter.sh diff --git a/t/perf/p5315-pack-objects-filter.sh b/t/perf/p5315-pack-objects-filter.sh new file mode 100755 index 00000000000000..00226c15393006 --- /dev/null +++ b/t/perf/p5315-pack-objects-filter.sh @@ -0,0 +1,129 @@ +#!/bin/sh + +test_description='Tests pack-objects performance with filters and --path-walk' +. ./perf-lib.sh + +test_perf_large_repo + +test_expect_success 'setup filter inputs' ' + # Sample a few depth-2 directories from the test repo to build + # a cone-mode sparse-checkout definition. The sampling picks + # directories at evenly-spaced positions so the choice is stable + # and scales to repos of any shape. + + git ls-tree -d --name-only HEAD >top-dirs && + top_nr=$(wc -l depth2-dirs && + while read tdir + do + git ls-tree -d --name-only "HEAD:$tdir" 2>/dev/null | + sed "s|^|$tdir/|" >>depth2-dirs || return 1 + done sparse-patterns && + + git hash-object -w sparse-patterns >sparse-oid && + echo "Sparse cone: $first $mid" && + cat sparse-patterns && + test_set_prereq SPARSE_OID + elif test "$top_nr" -ge 1 + then + # Fallback: use a single top-level directory. + first=$(sed -n "1p" top-dirs) && + { + echo "/*" && + echo "!/*/" && + echo "/$first/" + } >sparse-patterns && + + git hash-object -w sparse-patterns >sparse-oid && + echo "Sparse cone: $first" && + cat sparse-patterns && + test_set_prereq SPARSE_OID + fi +' + +test_perf 'repack (no filter)' ' + git pack-objects --stdout --revs --all pk +' + +test_size 'repack size (no filter)' ' + test_file_size pk +' + +test_perf 'repack (no filter, --path-walk)' ' + git pack-objects --stdout --revs --all --path-walk pk +' + +test_size 'repack size (no filter, --path-walk)' ' + test_file_size pk +' + +test_perf 'repack (blob:none)' ' + git pack-objects --stdout --revs --all --filter=blob:none pk +' + +test_size 'repack size (blob:none)' ' + test_file_size pk +' + +test_perf 'repack (blob:none, --path-walk)' ' + git pack-objects --stdout --revs --all --path-walk \ + --filter=blob:none pk +' + +test_size 'repack size (blob:none, --path-walk)' ' + test_file_size pk +' + +test_perf 'repack (sparse:oid)' \ + --prereq SPARSE_OID ' + git pack-objects --stdout --revs --all \ + --filter=sparse:oid=$(cat sparse-oid) pk +' + +test_size 'repack size (sparse:oid)' \ + --prereq SPARSE_OID ' + test_file_size pk +' + +test_perf 'repack (sparse:oid, --path-walk)' \ + --prereq SPARSE_OID ' + git pack-objects --stdout --revs --all --path-walk \ + --filter=sparse:oid=$(cat sparse-oid) pk +' + +test_size 'repack size (sparse:oid, --path-walk)' \ + --prereq SPARSE_OID ' + test_file_size pk +' + +test_done From bf00b444575fd67d2fb79676bde9261ccb5949d5 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 27 Apr 2026 14:14:42 -0400 Subject: [PATCH 6/6] path-walk: support cone-mode sparse filters The --filter=sparse: option to 'git pack-objects' allows focusing an object set to a sparse-checkout definition. This reduces the set of matching blobs, but also the set of matching trees. No server currently supports fetching with this filter because it is expensive to compute and reachability bitmaps do not help without a significant effort to extend the bitmap feature to store bitmaps for each supported sparse- checkout definition. Without focusing on serving fetches and clones with these filters, there are still benefits that could be realized by making this faster. With the sparse index, it's more realistic now than ever to be able to operate a local clone that was bootstrapped by a packfile created with a sparse filter, because the missing trees are not needed to move a sparse-checkout from one commit to another or to view the history of any path in scope. Such clones could perhaps be bootstrapped by partial bundles. Previously, constructing these sparse packs has been incredibly computationally inefficient. The revision walk that explores which objects are in scope spends a lot of time checking each object to see if it matches the sparse-checkout patterns, causing quadratic behavior (number of objects times number of sparse-checkout patterns). This improves somewhat when using cone-mode sparse-checkout patterns that can use hashtables and prefix matches to determine containment. However, the check per object is still too expensive for most cases. This is where the path-walk feature comes in. We can proceed as normal by placing objects in bins by path and _then_ check a group of objects all at once. Further, we can use cone mode patterns to prune the tree walk to avoid expanding groups of tree objects that don't match the prefix-based patterns. The implementation here is focused around loading the sparse-checkout patterns from the provided object ID and checking that the patterns are indeed cone-mode patterns. We can then load the correct pattern list into the path walk context and use the logic that already exists from bff45557675 (backfill: add --sparse option, 2025-02-03), though that feature loads sparse-checkout patterns from the worktree's local settings. We use a combination of errors and warnings to signal problems during this load. The difference is that errors are likely fatal for the non-path-walk version while the warnings are probably just implementation details for the path-walk version and the 'git pack-objects' command can fall back to the revision walk version. However, there was one detail that I noticed while implementing this that does impact 'git backfill --sparse' though we haven't seen it matter in real scenarios: we could under-count objects within a sparse scope if they appear in multiple paths and those paths are within and without the sparse-checkout. Move the SEEN flag assignment in add_tree_entries() to after the sparse pattern and pathspec checks. Previously, SEEN was set immediately upon discovering an object, before checking whether its path matched the sparse patterns. When the same object ID appeared at multiple paths (e.g. sibling directories with identical contents), the first path to be visited would mark the object as SEEN. If that path was outside the sparse cone, the object would be skipped there but also never discovered at its in-cone path. By deferring the SEEN flag until after the pattern check passes, objects that are skipped due to sparse filtering remain discoverable at other paths where they may be in scope. The new tests in t5317 demonstrate this behavior and would fail without this change. The recently-added tests in p5315 demonstrate this improvement, even on the Git repository: Test HEAD~1 HEAD -------------------------------------------------------------------- 5315.10: repack (sparse:oid) 61.68 62.59 +1.5% 5315.11: repack size (sparse:oid) 285.8M 285.7M -0.0% 5315.12: repack (sparse:oid, --path-walk) 61.75 9.49 -84.6% 5315.13: repack size (sparse:oid, --path-walk) 286.0M 265.1M -7.3% This shows that the performance is relatively stable for the case without --path-walk, but now that the --path-walk option is engaged with the change we can see an 84% reduction in the end-to-end time and a 7% reduction in the packfile size. The size decrease is due to the existing benefits of the path-walk algorithm placing relevant objects next to each other in the first compression pass, followed by a second pass order by name hash and size. Signed-off-by: Derrick Stolee --- path-walk.c | 46 ++++++++- t/t5317-pack-objects-filter-objects.sh | 131 +++++++++++++++++++++++++ t/t6601-path-walk.sh | 120 ++++++++++++++++++++++ 3 files changed, 295 insertions(+), 2 deletions(-) diff --git a/path-walk.c b/path-walk.c index 0e7dab7a6a68bf..8c01a39f754146 100644 --- a/path-walk.c +++ b/path-walk.c @@ -10,6 +10,7 @@ #include "hex.h" #include "list-objects.h" #include "list-objects-filter-options.h" +#include "object-name.h" #include "odb.h" #include "object.h" #include "oid-array.h" @@ -180,10 +181,8 @@ static int add_tree_entries(struct path_walk_context *ctx, return -1; } - /* Skip this object if already seen. */ if (o->flags & SEEN) continue; - o->flags |= SEEN; strbuf_setlen(&path, base_len); strbuf_add(&path, entry.path, entry.pathlen); @@ -239,6 +238,7 @@ static int add_tree_entries(struct path_walk_context *ctx, continue; } + o->flags |= SEEN; add_path_to_list(ctx, path.buf, type, &entry.oid, !(o->flags & UNINTERESTING)); @@ -532,6 +532,48 @@ static int prepare_filters(struct path_walk_info *info, } return 1; + case LOFC_SPARSE_OID: + if (info) { + struct object_id sparse_oid; + struct repository *repo = info->revs->repo; + + if (info->pl) { + warning(_("sparse filter cannot be combined with existing sparse patterns")); + return 0; + } + + if (repo_get_oid_with_flags(repo, + options->sparse_oid_name, + &sparse_oid, + GET_OID_BLOB)) { + error(_("unable to access sparse blob in '%s'"), + options->sparse_oid_name); + return 0; + } + + CALLOC_ARRAY(info->pl, 1); + info->pl->use_cone_patterns = 1; + + if (add_patterns_from_blob_to_list(&sparse_oid, "", 0, + info->pl) < 0) { + clear_pattern_list(info->pl); + FREE_AND_NULL(info->pl); + error(_("unable to parse sparse filter data in '%s'"), + oid_to_hex(&sparse_oid)); + return 0; + } + + if (!info->pl->use_cone_patterns) { + clear_pattern_list(info->pl); + FREE_AND_NULL(info->pl); + warning(_("sparse filter is not cone-mode compatible")); + return 0; + } + + list_objects_filter_release(options); + } + return 1; + default: error(_("object filter '%s' not supported by the path-walk API"), list_objects_filter_spec(options)); diff --git a/t/t5317-pack-objects-filter-objects.sh b/t/t5317-pack-objects-filter-objects.sh index 501d715b9a16b7..bce61fc3321056 100755 --- a/t/t5317-pack-objects-filter-objects.sh +++ b/t/t5317-pack-objects-filter-objects.sh @@ -478,4 +478,135 @@ test_expect_success 'verify pack-objects w/ --missing=allow-any' ' EOF ' +# Test that --path-walk produces the same object set as standard traversal +# when using sparse:oid filters with cone-mode patterns. +# +# The standard sparse filter always includes ALL tree objects, while the +# path-walk with cone-mode patterns prunes entire subtrees that are outside +# the sparse cone. Both should produce the same set of blobs and commits. + +test_expect_success 'setup pw_sparse for path-walk comparison' ' + git init pw_sparse && + mkdir -p pw_sparse/inc/sub pw_sparse/exc/sub && + + for n in 1 2 + do + echo "inc $n" >pw_sparse/inc/file$n && + echo "inc sub $n" >pw_sparse/inc/sub/file$n && + echo "exc $n" >pw_sparse/exc/file$n && + echo "exc sub $n" >pw_sparse/exc/sub/file$n && + echo "root $n" >pw_sparse/root$n || return 1 + done && + + git -C pw_sparse add . && + git -C pw_sparse commit -m "first" && + + echo "inc 1 modified" >pw_sparse/inc/file1 && + echo "exc 1 modified" >pw_sparse/exc/file1 && + echo "root 1 modified" >pw_sparse/root1 && + git -C pw_sparse add . && + git -C pw_sparse commit -m "second" && + + # Cone-mode sparse pattern: include root + inc/ + printf "/*\n!/*/\n/inc/\n" | + git -C pw_sparse hash-object -w --stdin >sparse_oid +' + +test_expect_success 'sparse:oid with --path-walk produces same blobs' ' + oid=$(cat sparse_oid) && + + git -C pw_sparse pack-objects --revs --stdout \ + --filter=sparse:oid=$oid >standard.pack <<-EOF && + HEAD + EOF + git -C pw_sparse index-pack ../standard.pack && + git -C pw_sparse verify-pack -v ../standard.pack >standard_verify && + + git -C pw_sparse pack-objects --revs --stdout \ + --path-walk --filter=sparse:oid=$oid >pathwalk.pack <<-EOF && + HEAD + EOF + git -C pw_sparse index-pack ../pathwalk.pack && + git -C pw_sparse verify-pack -v ../pathwalk.pack >pathwalk_verify && + + # Blobs must match exactly + grep -E "^[0-9a-f]{40} blob" standard_verify | + awk "{print \$1}" | sort >standard_blobs && + grep -E "^[0-9a-f]{40} blob" pathwalk_verify | + awk "{print \$1}" | sort >pathwalk_blobs && + test_cmp standard_blobs pathwalk_blobs && + + # Commits must match exactly + grep -E "^[0-9a-f]{40} commit" standard_verify | + awk "{print \$1}" | sort >standard_commits && + grep -E "^[0-9a-f]{40} commit" pathwalk_verify | + awk "{print \$1}" | sort >pathwalk_commits && + test_cmp standard_commits pathwalk_commits +' + +test_expect_success 'sparse:oid with --path-walk prunes excluded trees' ' + # The standard filter includes ALL trees, while path-walk + # prunes trees outside the sparse cone (exc/ and exc/sub/). + grep -E "^[0-9a-f]{40} tree" standard_verify | + awk "{print \$1}" | sort >standard_trees && + grep -E "^[0-9a-f]{40} tree" pathwalk_verify | + awk "{print \$1}" | sort >pathwalk_trees && + + # path-walk should have fewer trees + test $(wc -l extra_trees && + test_must_be_empty extra_trees +' + +# Test the edge case where the same tree/blob OID appears at both an +# in-cone and out-of-cone path. When sibling directories have identical +# contents, they share a tree OID. The path-walk defers marking objects +# SEEN until after checking sparse patterns, so an object at an out-of-cone +# path can still be discovered at an in-cone path. + +test_expect_success 'setup pw_shared for shared OID across cone boundary' ' + git init pw_shared && + mkdir pw_shared/aaa pw_shared/zzz && + echo "shared content" >pw_shared/aaa/file && + echo "shared content" >pw_shared/zzz/file && + echo "root file" >pw_shared/rootfile && + git -C pw_shared add . && + git -C pw_shared commit -m "aaa and zzz share tree OID" && + + # Verify they share a tree OID + aaa_tree=$(git -C pw_shared rev-parse HEAD:aaa) && + zzz_tree=$(git -C pw_shared rev-parse HEAD:zzz) && + test "$aaa_tree" = "$zzz_tree" && + + # Cone pattern: include root + zzz/ (not aaa/) + printf "/*\n!/*/\n/zzz/\n" | + git -C pw_shared hash-object -w --stdin >shared_sparse_oid +' + +test_expect_success 'shared tree OID: --path-walk blobs match standard' ' + oid=$(cat shared_sparse_oid) && + + git -C pw_shared pack-objects --revs --stdout \ + --filter=sparse:oid=$oid >shared_std.pack <<-EOF && + HEAD + EOF + git -C pw_shared index-pack ../shared_std.pack && + git -C pw_shared verify-pack -v ../shared_std.pack >shared_std_verify && + + git -C pw_shared pack-objects --revs --stdout \ + --path-walk --filter=sparse:oid=$oid >shared_pw.pack <<-EOF && + HEAD + EOF + git -C pw_shared index-pack ../shared_pw.pack && + git -C pw_shared verify-pack -v ../shared_pw.pack >shared_pw_verify && + + grep -E "^[0-9a-f]{40} blob" shared_std_verify | + awk "{print \$1}" | sort >shared_std_blobs && + grep -E "^[0-9a-f]{40} blob" shared_pw_verify | + awk "{print \$1}" | sort >shared_pw_blobs && + test_cmp shared_std_blobs shared_pw_blobs +' + test_done diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh index d9be7b9cd21ee0..e247b08dfdf19b 100755 --- a/t/t6601-path-walk.sh +++ b/t/t6601-path-walk.sh @@ -553,4 +553,124 @@ test_expect_success 'all, blob:limit=3 filter' ' test_cmp_sorted expect out ' +test_expect_success 'setup sparse filter blob' ' + # Cone-mode patterns: include root, exclude all dirs, include left/ + cat >patterns <<-\EOF && + /* + !/*/ + /left/ + EOF + sparse_oid=$(git hash-object -w -t blob patterns) +' + +test_expect_success 'all, sparse:oid filter' ' + test-tool path-walk --filter=sparse:oid=$sparse_oid -- --all >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse topic) + 0:commit::$(git rev-parse base) + 0:commit::$(git rev-parse base~1) + 0:commit::$(git rev-parse base~2) + 1:tag:/tags:$(git rev-parse refs/tags/first) + 1:tag:/tags:$(git rev-parse refs/tags/second.1) + 1:tag:/tags:$(git rev-parse refs/tags/second.2) + 1:tag:/tags:$(git rev-parse refs/tags/third) + 1:tag:/tags:$(git rev-parse refs/tags/fourth) + 1:tag:/tags:$(git rev-parse refs/tags/tree-tag) + 1:tag:/tags:$(git rev-parse refs/tags/blob-tag) + 2:blob:/tagged-blobs:$(git rev-parse refs/tags/blob-tag^{}) + 2:blob:/tagged-blobs:$(git rev-parse refs/tags/blob-tag2^{}) + 3:tree::$(git rev-parse topic^{tree}) + 3:tree::$(git rev-parse base^{tree}) + 3:tree::$(git rev-parse base~1^{tree}) + 3:tree::$(git rev-parse base~2^{tree}) + 3:tree::$(git rev-parse refs/tags/tree-tag^{}) + 3:tree::$(git rev-parse refs/tags/tree-tag2^{}) + 4:blob:a:$(git rev-parse base~2:a) + 5:blob:file2:$(git rev-parse refs/tags/tree-tag2^{}:file2) + 6:tree:left/:$(git rev-parse base:left) + 6:tree:left/:$(git rev-parse base~2:left) + 7:blob:left/b:$(git rev-parse base~2:left/b) + 7:blob:left/b:$(git rev-parse base:left/b) + blobs:6 + commits:4 + tags:7 + trees:8 + EOF + + test_cmp_sorted expect out +' + +test_expect_success 'topic only, sparse:oid filter' ' + test-tool path-walk --filter=sparse:oid=$sparse_oid -- topic >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse topic) + 0:commit::$(git rev-parse base~1) + 0:commit::$(git rev-parse base~2) + 1:tree::$(git rev-parse topic^{tree}) + 1:tree::$(git rev-parse base~1^{tree}) + 1:tree::$(git rev-parse base~2^{tree}) + 2:blob:a:$(git rev-parse base~2:a) + 3:tree:left/:$(git rev-parse base~2:left) + 4:blob:left/b:$(git rev-parse base~2:left/b) + blobs:2 + commits:3 + tags:0 + trees:4 + EOF + + test_cmp_sorted expect out +' + +# Demonstrate the SEEN flag ordering issue: when the same tree/blob OID +# appears at two sibling paths where one is in-cone and the other is +# out-of-cone, the path-walk may mark the object SEEN via the out-of-cone +# path (which sorts first alphabetically) and then skip it when encountered +# at the in-cone path. + +test_expect_success 'setup shared tree OID across cone boundary' ' + git checkout --orphan shared-tree && + git rm -rf . && + mkdir aaa zzz && + echo "shared content" >aaa/file && + echo "shared content" >zzz/file && + echo "root file" >rootfile && + git add aaa zzz rootfile && + git commit -m "aaa and zzz have same tree OID" && + + # Verify they really share a tree OID + aaa_tree=$(git rev-parse HEAD:aaa) && + zzz_tree=$(git rev-parse HEAD:zzz) && + test "$aaa_tree" = "$zzz_tree" && + + # Cone pattern: include root + zzz/ (not aaa/) + cat >shared-patterns <<-\EOF && + /* + !/*/ + /zzz/ + EOF + shared_sparse_oid=$(git hash-object -w -t blob shared-patterns) +' + +test_expect_success 'sparse:oid with shared tree OID across cone boundary' ' + test-tool path-walk \ + --filter=sparse:oid=$shared_sparse_oid \ + -- shared-tree >out && + + cat >expect <<-EOF && + 0:commit::$(git rev-parse shared-tree) + 1:tree::$(git rev-parse shared-tree^{tree}) + 2:blob:rootfile:$(git rev-parse shared-tree:rootfile) + 3:tree:zzz/:$(git rev-parse shared-tree:zzz) + 4:blob:zzz/file:$(git rev-parse shared-tree:zzz/file) + blobs:2 + commits:1 + tags:0 + trees:2 + EOF + + test_cmp_sorted expect out +' + test_done