From dc660106ea8511e6adc44d2b70e9a4ae8b18090e Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Tue, 27 Jan 2026 15:56:44 +0200 Subject: [PATCH 1/6] index-pack, unpack-objects: use size_t for object size When unpacking objects from a packfile, the object size is decoded from a variable-length encoding. On platforms where unsigned long is 32-bit (such as Windows, even in 64-bit builds), the shift operation overflows when decoding sizes larger than 4GB. The result is a truncated size value, causing the unpacked object to be corrupted or rejected. Fix this by changing the size variable to size_t, which is 64-bit on 64-bit platforms, and ensuring the shift arithmetic occurs in 64-bit space. This was originally authored by LordKiRon , who preferred not to reveal their real name and therefore agreed that I take over authorship. Signed-off-by: Johannes Schindelin --- builtin/index-pack.c | 9 +++++---- builtin/unpack-objects.c | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index ca7784dc2c4969..cc660582e97a4d 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -37,7 +37,7 @@ static const char index_pack_usage[] = struct object_entry { struct pack_idx_entry idx; - unsigned long size; + size_t size; unsigned char hdr_size; signed char type; signed char real_type; @@ -469,7 +469,7 @@ static int is_delta_type(enum object_type type) return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA); } -static void *unpack_entry_data(off_t offset, unsigned long size, +static void *unpack_entry_data(off_t offset, size_t size, enum object_type type, struct object_id *oid) { static char fixed_buf[8192]; @@ -524,7 +524,8 @@ static void *unpack_raw_entry(struct object_entry *obj, struct object_id *oid) { unsigned char *p; - unsigned long size, c; + size_t size; + unsigned long c; off_t base_offset; unsigned shift; void *data; @@ -542,7 +543,7 @@ static void *unpack_raw_entry(struct object_entry *obj, p = 
fill(1); c = *p; use(1); - size += (c & 0x7f) << shift; + size += ((size_t)c & 0x7f) << shift; shift += 7; } obj->size = size; diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index e01cf6e360f6d1..59a36c2481a457 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -533,7 +533,8 @@ static void unpack_one(unsigned nr) { unsigned shift; unsigned char *pack; - unsigned long size, c; + size_t size; + unsigned long c; enum object_type type; obj_list[nr].offset = consumed_bytes; @@ -548,7 +549,7 @@ static void unpack_one(unsigned nr) pack = fill(1); c = *pack; use(1); - size += (c & 0x7f) << shift; + size += ((size_t)c & 0x7f) << shift; shift += 7; } From 92f4327b1fe09126dd6421b071a9071ce5530371 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 29 Jan 2026 14:07:48 +0100 Subject: [PATCH 2/6] git-zlib: handle data streams larger than 4GB On Windows, zlib's `uLong` type is 32-bit even on 64-bit systems. When processing data streams larger than 4GB, the `total_in` and `total_out` fields in zlib's `z_stream` structure wrap around, which caused the sanity checks in `zlib_post_call()` to trigger `BUG()` assertions. The git_zstream wrapper now tracks its own 64-bit totals rather than copying them from zlib. The sanity checks compare only the low bits, using `maximum_unsigned_value_of_type(uLong)` to mask appropriately for the platform's `uLong` size. This is based on work by LordKiRon in git-for-windows#6076. 
Signed-off-by: Johannes Schindelin --- git-zlib.c | 25 +++++++++++++++++-------- git-zlib.h | 4 ++-- object-file.c | 2 +- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/git-zlib.c b/git-zlib.c index df9604910e3fdf..b91cb323aee916 100644 --- a/git-zlib.c +++ b/git-zlib.c @@ -30,6 +30,9 @@ static const char *zerr_to_string(int status) */ /* #define ZLIB_BUF_MAX ((uInt)-1) */ #define ZLIB_BUF_MAX ((uInt) 1024 * 1024 * 1024) /* 1GB */ + +/* uLong is 32-bit on Windows, even on 64-bit systems */ +#define ULONG_MAX_VALUE maximum_unsigned_value_of_type(uLong) static inline uInt zlib_buf_cap(unsigned long len) { return (ZLIB_BUF_MAX < len) ? ZLIB_BUF_MAX : len; @@ -39,31 +42,37 @@ static void zlib_pre_call(git_zstream *s) { s->z.next_in = s->next_in; s->z.next_out = s->next_out; - s->z.total_in = s->total_in; - s->z.total_out = s->total_out; + s->z.total_in = (uLong)(s->total_in & ULONG_MAX_VALUE); + s->z.total_out = (uLong)(s->total_out & ULONG_MAX_VALUE); s->z.avail_in = zlib_buf_cap(s->avail_in); s->z.avail_out = zlib_buf_cap(s->avail_out); } static void zlib_post_call(git_zstream *s, int status) { - unsigned long bytes_consumed; - unsigned long bytes_produced; + size_t bytes_consumed; + size_t bytes_produced; bytes_consumed = s->z.next_in - s->next_in; bytes_produced = s->z.next_out - s->next_out; - if (s->z.total_out != s->total_out + bytes_produced) + /* + * zlib's total_out/total_in are uLong which may wrap for >4GB. + * We track our own totals and verify only the low bits match. + */ + if ((s->z.total_out & ULONG_MAX_VALUE) != + ((s->total_out + bytes_produced) & ULONG_MAX_VALUE)) BUG("total_out mismatch"); /* * zlib does not update total_in when it returns Z_NEED_DICT, * causing a mismatch here. Skip the sanity check in that case. 
*/ if (status != Z_NEED_DICT && - s->z.total_in != s->total_in + bytes_consumed) + (s->z.total_in & ULONG_MAX_VALUE) != + ((s->total_in + bytes_consumed) & ULONG_MAX_VALUE)) BUG("total_in mismatch"); - s->total_out = s->z.total_out; - s->total_in = s->z.total_in; + s->total_out += bytes_produced; + s->total_in += bytes_consumed; /* zlib-ng marks `next_in` as `const`, so we have to cast it away. */ s->next_in = (unsigned char *) s->z.next_in; s->next_out = s->z.next_out; diff --git a/git-zlib.h b/git-zlib.h index 0e66fefa8c9f05..44380e8ad38305 100644 --- a/git-zlib.h +++ b/git-zlib.h @@ -7,8 +7,8 @@ typedef struct git_zstream { struct z_stream_s z; unsigned long avail_in; unsigned long avail_out; - unsigned long total_in; - unsigned long total_out; + size_t total_in; + size_t total_out; unsigned char *next_in; unsigned char *next_out; } git_zstream; diff --git a/object-file.c b/object-file.c index 2acc9522df2daa..086b2b65ffe65e 100644 --- a/object-file.c +++ b/object-file.c @@ -1118,7 +1118,7 @@ int odb_source_loose_write_stream(struct odb_source *source, } while (ret == Z_OK || ret == Z_BUF_ERROR); if (stream.total_in != len + hdrlen) - die(_("write stream object %ld != %"PRIuMAX), stream.total_in, + die(_("write stream object %"PRIuMAX" != %"PRIuMAX), (uintmax_t)stream.total_in, (uintmax_t)len + hdrlen); /* From 3a539061c5f62c65d46bd0eb774bb1b1239463ff Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Wed, 28 Jan 2026 01:01:23 +0200 Subject: [PATCH 3/6] odb, packfile: use size_t for streaming object sizes The odb_read_stream structure uses unsigned long for the size field, which is 32-bit on Windows even in 64-bit builds. When streaming objects larger than 4GB, the size would be truncated to zero or an incorrect value, resulting in empty files being written to disk. Change the size field in odb_read_stream to size_t and introduce unpack_object_header_sz() to return sizes via size_t pointer. 
Since object_info.sizep remains unsigned long for API compatibility, use temporary variables where the types differ, with comments noting the truncation limitation for code paths that still use unsigned long. This was originally authored by LordKiRon , who preferred not to reveal their real name and therefore agreed that I take over authorship. Signed-off-by: Johannes Schindelin --- builtin/pack-objects.c | 23 ++++++++++++++++------- object-file.c | 10 +++++++++- odb/streaming.c | 13 ++++++++++++- odb/streaming.h | 2 +- oss-fuzz/fuzz-pack-headers.c | 2 +- pack-bitmap.c | 2 +- pack-check.c | 6 ++++-- packfile.c | 24 +++++++++++++++--------- packfile.h | 4 ++-- 9 files changed, 61 insertions(+), 25 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index dd2480a73d2edf..aa4b1cb9b8a6c1 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -629,14 +629,21 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, struct packed_git *p = IN_PACK(entry); struct pack_window *w_curs = NULL; uint32_t pos; - off_t offset; + off_t offset, cur; enum object_type type = oe_type(entry); + enum object_type in_pack_type; off_t datalen; unsigned char header[MAX_PACK_OBJECT_HEADER], dheader[MAX_PACK_OBJECT_HEADER]; unsigned hdrlen; const unsigned hashsz = the_hash_algo->rawsz; - unsigned long entry_size = SIZE(entry); + size_t entry_size; + + cur = entry->in_pack_offset; + in_pack_type = unpack_object_header(p, &w_curs, &cur, &entry_size); + if (in_pack_type < 0) + die(_("write_reuse_object: unable to parse object header of %s"), + oid_to_hex(&entry->idx.oid)); if (DELTA(entry)) type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? 
@@ -1087,7 +1094,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile, { off_t offset, next, cur; enum object_type type; - unsigned long size; + size_t size; offset = pack_pos_to_offset(reuse_packfile, pos); next = pack_pos_to_offset(reuse_packfile, pos + 1); @@ -2243,7 +2250,7 @@ static void check_object(struct object_entry *entry, uint32_t object_index) off_t ofs; unsigned char *buf, c; enum object_type type; - unsigned long in_pack_size; + size_t in_pack_size; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); @@ -2734,16 +2741,18 @@ unsigned long oe_get_size_slow(struct packing_data *pack, struct pack_window *w_curs; unsigned char *buf; enum object_type type; - unsigned long used, avail, size; + unsigned long used, avail; + size_t size; if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) { + unsigned long sz; packing_data_lock(&to_pack); if (odb_read_object_info(the_repository->objects, - &e->idx.oid, &size) < 0) + &e->idx.oid, &sz) < 0) die(_("unable to get size of %s"), oid_to_hex(&e->idx.oid)); packing_data_unlock(&to_pack); - return size; + return sz; } p = oe_in_pack(pack, e); diff --git a/object-file.c b/object-file.c index 086b2b65ffe65e..0be2981c7a1f43 100644 --- a/object-file.c +++ b/object-file.c @@ -2326,6 +2326,7 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out, struct object_info oi = OBJECT_INFO_INIT; struct odb_loose_read_stream *st; unsigned long mapsize; + unsigned long size_ul; void *mapped; mapped = odb_source_loose_map_object(source, oid, &mapsize); @@ -2349,11 +2350,18 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out, goto error; } - oi.sizep = &st->base.size; + /* + * object_info.sizep is unsigned long* (32-bit on Windows), but + * st->base.size is size_t (64-bit). Use temporary variable. + * Note: loose objects >4GB would still truncate here, but such + * large loose objects are uncommon (they'd normally be packed). 
+ */ + oi.sizep = &size_ul; oi.typep = &st->base.type; if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0) goto error; + st->base.size = size_ul; st->mapped = mapped; st->mapsize = mapsize; diff --git a/odb/streaming.c b/odb/streaming.c index 5927a12954ba59..af2adf5ce786d6 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -157,15 +157,26 @@ static int open_istream_incore(struct odb_read_stream **out, .base.read = read_istream_incore, }; struct odb_incore_read_stream *st; + unsigned long size_ul; int ret; oi.typep = &stream.base.type; - oi.sizep = &stream.base.size; + /* + * object_info.sizep is unsigned long* (32-bit on Windows), but + * stream.base.size is size_t (64-bit). We use a temporary variable + * because the types are incompatible. Note: this path still truncates + * for >4GB objects, but large objects should use pack streaming + * (packfile_store_read_object_stream) which handles size_t properly. + * This incore fallback is only used for small objects or when pack + * streaming is unavailable. 
+ */ + oi.sizep = &size_ul; oi.contentp = (void **)&stream.buf; ret = odb_read_object_info_extended(odb, oid, &oi, OBJECT_INFO_DIE_IF_CORRUPT); if (ret) return ret; + stream.base.size = size_ul; CALLOC_ARRAY(st, 1); *st = stream; diff --git a/odb/streaming.h b/odb/streaming.h index c7861f7e13c606..517e2ea2d3f5c3 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -21,7 +21,7 @@ struct odb_read_stream { odb_read_stream_close_fn close; odb_read_stream_read_fn read; enum object_type type; - unsigned long size; /* inflated size of full object */ + size_t size; /* inflated size of full object */ }; /* diff --git a/oss-fuzz/fuzz-pack-headers.c b/oss-fuzz/fuzz-pack-headers.c index 150c0f5fa2d7ec..ef61ab577c5098 100644 --- a/oss-fuzz/fuzz-pack-headers.c +++ b/oss-fuzz/fuzz-pack-headers.c @@ -6,7 +6,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size); int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { enum object_type type; - unsigned long len; + size_t len; unpack_object_header_buffer((const unsigned char *)data, (unsigned long)size, &type, &len); diff --git a/pack-bitmap.c b/pack-bitmap.c index f6ec18d83afe21..f9af8a96bdf4ee 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -2270,7 +2270,7 @@ static int try_partial_reuse(struct bitmap_index *bitmap_git, { off_t delta_obj_offset; enum object_type type; - unsigned long size; + size_t size; if (pack_pos >= pack->p->num_objects) return -1; /* not actually in the pack */ diff --git a/pack-check.c b/pack-check.c index 79992bb509f473..2792f34d2595bf 100644 --- a/pack-check.c +++ b/pack-check.c @@ -110,7 +110,7 @@ static int verify_packfile(struct repository *r, void *data; struct object_id oid; enum object_type type; - unsigned long size; + size_t size; off_t curpos; int data_valid; @@ -143,7 +143,9 @@ static int verify_packfile(struct repository *r, data = NULL; data_valid = 0; } else { - data = unpack_entry(r, p, entries[i].offset, &type, &size); + unsigned long sz; + data = unpack_entry(r, p, 
entries[i].offset, &type, &sz); + size = sz; data_valid = 1; } diff --git a/packfile.c b/packfile.c index b012d648adaf2e..fdae91dd110682 100644 --- a/packfile.c +++ b/packfile.c @@ -1133,7 +1133,7 @@ int packfile_store_count_objects(struct packfile_store *store, } unsigned long unpack_object_header_buffer(const unsigned char *buf, - unsigned long len, enum object_type *type, unsigned long *sizep) + unsigned long len, enum object_type *type, size_t *sizep) { unsigned shift; size_t size, c; @@ -1144,7 +1144,11 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, size = c & 15; shift = 4; while (c & 0x80) { - if (len <= used || (bitsizeof(long) - 7) < shift) { + /* + * Each continuation byte adds 7 bits. Ensure shift won't + * overflow size_t (use size_t not long for 64-bit on Windows). + */ + if (len <= used || (bitsizeof(size_t) - 7) < shift) { error("bad object header"); size = used = 0; break; @@ -1153,7 +1157,7 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, size = st_add(size, st_left_shift(c & 0x7f, shift)); shift += 7; } - *sizep = cast_size_t_to_ulong(size); + *sizep = size; return used; } @@ -1215,7 +1219,7 @@ unsigned long get_size_from_delta(struct packed_git *p, int unpack_object_header(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, - unsigned long *sizep) + size_t *sizep) { unsigned char *base; unsigned long left; @@ -1367,7 +1371,7 @@ static enum object_type packed_to_object_type(struct repository *r, while (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) { off_t base_offset; - unsigned long size; + size_t size; /* Push the object we're going to leave behind */ if (poi_stack_nr >= poi_stack_alloc && poi_stack == small_poi_stack) { poi_stack_alloc = alloc_nr(poi_stack_nr); @@ -1586,7 +1590,7 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off uint32_t *maybe_index_pos, struct object_info *oi) { struct pack_window *w_curs = NULL; - unsigned long size; + size_t 
size; off_t curpos = obj_offset; enum object_type type = OBJ_NONE; uint32_t pack_pos; @@ -1778,7 +1782,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset, struct pack_window *w_curs = NULL; off_t curpos = obj_offset; void *data = NULL; - unsigned long size; + size_t size; enum object_type type; struct unpack_entry_stack_ent small_delta_stack[UNPACK_ENTRY_STACK_PREALLOC]; struct unpack_entry_stack_ent *delta_stack = small_delta_stack; @@ -1943,8 +1947,10 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset, (uintmax_t)curpos, p->pack_name); data = NULL; } else { + unsigned long sz; data = patch_delta(base, base_size, delta_data, - delta_size, &size); + delta_size, &sz); + size = sz; /* * We could not apply the delta; warn the user, but @@ -2929,7 +2935,7 @@ int packfile_read_object_stream(struct odb_read_stream **out, struct odb_packed_read_stream *stream; struct pack_window *window = NULL; enum object_type in_pack_type; - unsigned long size; + size_t size; in_pack_type = unpack_object_header(pack, &window, &offset, &size); unuse_pack(&window); diff --git a/packfile.h b/packfile.h index 9b647da7dda7c1..49d6bdecf6ea18 100644 --- a/packfile.h +++ b/packfile.h @@ -456,9 +456,9 @@ off_t find_pack_entry_one(const struct object_id *oid, struct packed_git *); int is_pack_valid(struct packed_git *); void *unpack_entry(struct repository *r, struct packed_git *, off_t, enum object_type *, unsigned long *); -unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep); +unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, size_t *sizep); unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t); -int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *); +int unpack_object_header(struct packed_git *, struct pack_window **, 
off_t *, size_t *); off_t get_delta_base(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, enum object_type type, off_t delta_obj_offset); From 3274cba862ae42a6813710410274a692ec0f5d29 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 29 Jan 2026 10:18:01 +0200 Subject: [PATCH 4/6] delta, packfile: use size_t for delta header sizes The delta header decoding functions return unsigned long, which truncates on Windows for objects larger than 4GB. Introduce size_t variants get_delta_hdr_size_sz() and get_size_from_delta_sz() that preserve the full 64-bit size, and use them in packed_object_info() where the size is needed for streaming decisions. This was originally authored by LordKiRon , who preferred not to reveal their real name and therefore agreed that I take over authorship. Signed-off-by: Johannes Schindelin --- delta.h | 14 ++++++++++++-- packfile.c | 33 ++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/delta.h b/delta.h index 8a56ec07992c75..fad68cfc45f6f4 100644 --- a/delta.h +++ b/delta.h @@ -86,8 +86,11 @@ void *patch_delta(const void *src_buf, unsigned long src_size, * This must be called twice on the delta data buffer, first to get the * expected source buffer size, and again to get the target buffer size. */ -static inline unsigned long get_delta_hdr_size(const unsigned char **datap, - const unsigned char *top) +/* + * Size_t variant that doesn't truncate - use for >4GB objects on Windows. 
+ */ +static inline size_t get_delta_hdr_size_sz(const unsigned char **datap, + const unsigned char *top) { const unsigned char *data = *datap; size_t cmd, size = 0; @@ -98,6 +101,13 @@ static inline unsigned long get_delta_hdr_size(const unsigned char **datap, i += 7; } while (cmd & 0x80 && data < top); *datap = data; + return size; +} + +static inline unsigned long get_delta_hdr_size(const unsigned char **datap, + const unsigned char *top) +{ + size_t size = get_delta_hdr_size_sz(datap, top); return cast_size_t_to_ulong(size); } diff --git a/packfile.c b/packfile.c index fdae91dd110682..4208f53046b630 100644 --- a/packfile.c +++ b/packfile.c @@ -1161,9 +1161,12 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, return used; } -unsigned long get_size_from_delta(struct packed_git *p, - struct pack_window **w_curs, - off_t curpos) +/* + * Size_t variant for >4GB delta results on Windows. + */ +static size_t get_size_from_delta_sz(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) { const unsigned char *data; unsigned char delta_head[20], *in; @@ -1210,10 +1213,18 @@ unsigned long get_size_from_delta(struct packed_git *p, data = delta_head; /* ignore base size */ - get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head)); /* Read the result size */ - return get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + return get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head)); +} + +unsigned long get_size_from_delta(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) +{ + size_t size = get_size_from_delta_sz(p, w_curs, curpos); + return cast_size_t_to_ulong(size); } int unpack_object_header(struct packed_git *p, @@ -1618,14 +1629,18 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off ret = -1; goto out; } - *oi->sizep = get_size_from_delta(p, &w_curs, tmp_pos); - if (*oi->sizep == 0) { + /* + * Use size_t variant 
to avoid die() on >4GB deltas. + * oi->sizep is unsigned long, so truncation may occur, + * but streaming code uses its own size_t tracking. + */ + size = get_size_from_delta_sz(p, &w_curs, tmp_pos); + if (size == 0) { ret = -1; goto out; } - } else { - *oi->sizep = size; } + *oi->sizep = (unsigned long)size; } if (oi->disk_sizep || (oi->mtimep && p->is_cruft)) { From afa74a3a2b9caf9989055a9311309f590729d6c1 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 6 Jul 2023 13:37:46 +0200 Subject: [PATCH 5/6] test-tool: add a helper to synthesize large packfiles To test Git's behavior with very large pack files, we need a way to generate such files quickly. A naive approach using only readily-available Git commands would take over 10 hours for a 4GB pack file, which is prohibitive. Side-stepping Git's machinery and actual zlib compression by writing uncompressed content with the appropriate zlib header makes things much faster. The fastest method using this approach generates many small, unreachable blob objects and takes about 1.5 minutes for 4GB. However, this cannot be used because we need to test git clone, which requires a reachable commit history. Generating many reachable commits with small, uncompressed blobs takes about 4 minutes for 4GB. But this approach 1) does not reproduce the issues we want to fix (which require individual objects larger than 4GB) and 2) is comparatively slow because of the many SHA-1 calculations. The approach taken here generates a single large blob (filled with NUL bytes), along with the trees and commits needed to make it reachable. This takes about 2.5 minutes for 4.5GB, which is the fastest option that produces a valid, clonable repository with an object large enough to trigger the bugs we want to test. 
Signed-off-by: Johannes Schindelin --- Makefile | 1 + compat/zlib-compat.h | 2 + t/helper/meson.build | 1 + t/helper/test-synthesize.c | 250 +++++++++++++++++++++++++++++++++++++ t/helper/test-tool.c | 1 + t/helper/test-tool.h | 1 + 6 files changed, 256 insertions(+) create mode 100644 t/helper/test-synthesize.c diff --git a/Makefile b/Makefile index cedc234173e377..85405cb5b8d148 100644 --- a/Makefile +++ b/Makefile @@ -872,6 +872,7 @@ TEST_BUILTINS_OBJS += test-submodule-config.o TEST_BUILTINS_OBJS += test-submodule-nested-repo-config.o TEST_BUILTINS_OBJS += test-submodule.o TEST_BUILTINS_OBJS += test-subprocess.o +TEST_BUILTINS_OBJS += test-synthesize.o TEST_BUILTINS_OBJS += test-trace2.o TEST_BUILTINS_OBJS += test-truncate.o TEST_BUILTINS_OBJS += test-userdiff.o diff --git a/compat/zlib-compat.h b/compat/zlib-compat.h index ac0827662298af..5078c5ef6ce0e8 100644 --- a/compat/zlib-compat.h +++ b/compat/zlib-compat.h @@ -7,6 +7,8 @@ # define z_stream_s zng_stream_s # define gz_header_s zng_gz_header_s +# define adler32(adler, buf, len) zng_adler32(adler, buf, len) + # define crc32(crc, buf, len) zng_crc32(crc, buf, len) # define inflate(strm, bits) zng_inflate(strm, bits) diff --git a/t/helper/meson.build b/t/helper/meson.build index 675e64c0101b61..3235f10ab8aae1 100644 --- a/t/helper/meson.build +++ b/t/helper/meson.build @@ -69,6 +69,7 @@ test_tool_sources = [ 'test-submodule-nested-repo-config.c', 'test-submodule.c', 'test-subprocess.c', + 'test-synthesize.c', 'test-tool.c', 'test-trace2.c', 'test-truncate.c', diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c new file mode 100644 index 00000000000000..3ce7078078efff --- /dev/null +++ b/t/helper/test-synthesize.c @@ -0,0 +1,250 @@ +#define USE_THE_REPOSITORY_VARIABLE + +#include "test-tool.h" +#include "git-compat-util.h" +#include "git-zlib.h" +#include "hash.h" +#include "hex.h" +#include "object-file.h" +#include "object.h" +#include "pack.h" +#include "parse-options.h" +#include "parse.h" 
+#include "repository.h" +#include "setup.h" +#include "strbuf.h" +#include "write-or-die.h" + +#define BLOCK_SIZE 0xffff +static const unsigned char zeros[BLOCK_SIZE]; + +/* + * Write data as an uncompressed zlib stream. + * For data larger than 64KB, writes multiple uncompressed blocks. + * If data is NULL, writes zeros. + * Updates the pack checksum context. + */ +static void write_uncompressed_zlib(FILE *f, struct git_hash_ctx *pack_ctx, + const void *data, size_t len, + const struct git_hash_algo *algo) +{ + unsigned char zlib_header[2] = { 0x78, 0x01 }; /* CMF, FLG */ + unsigned char block_header[5]; + const unsigned char *p = data; + size_t remaining = len; + uint32_t adler = 1L; /* adler32 initial value */ + unsigned char adler_buf[4]; + + /* Write zlib header */ + fwrite_or_die(f, zlib_header, sizeof(zlib_header)); + algo->update_fn(pack_ctx, zlib_header, 2); + + /* Write uncompressed blocks (max 64KB each) */ + do { + size_t block_len = remaining > BLOCK_SIZE ? BLOCK_SIZE : remaining; + int is_final = (block_len == remaining); + const unsigned char *block_data = data ? p : zeros; + + block_header[0] = is_final ? 0x01 : 0x00; + block_header[1] = block_len & 0xff; + block_header[2] = (block_len >> 8) & 0xff; + block_header[3] = block_header[1] ^ 0xff; + block_header[4] = block_header[2] ^ 0xff; + + fwrite_or_die(f, block_header, sizeof(block_header)); + algo->update_fn(pack_ctx, block_header, 5); + + if (block_len) { + fwrite_or_die(f, block_data, block_len); + algo->update_fn(pack_ctx, block_data, block_len); + adler = adler32(adler, block_data, block_len); + } + + if (data) + p += block_len; + remaining -= block_len; + } while (remaining > 0); + + /* Write adler32 checksum */ + put_be32(adler_buf, adler); + fwrite_or_die(f, adler_buf, sizeof(adler_buf)); + algo->update_fn(pack_ctx, adler_buf, 4); +} + +/* + * Write an uncompressed object to the pack file. + * If `data == NULL`, it is treated like a buffer to NUL bytes. 
+ * Updates the pack checksum context. + */ +static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx, + enum object_type type, + const void *data, size_t len, + struct object_id *oid, + const struct git_hash_algo *algo) +{ + unsigned char pack_header[MAX_PACK_OBJECT_HEADER]; + char object_header[32]; + int pack_header_len, object_header_len; + struct git_hash_ctx ctx; + + /* Write pack object header */ + pack_header_len = encode_in_pack_object_header(pack_header, + sizeof(pack_header), + type, len); + fwrite_or_die(f, pack_header, pack_header_len); + algo->update_fn(pack_ctx, pack_header, pack_header_len); + + /* Write the data as uncompressed zlib */ + write_uncompressed_zlib(f, pack_ctx, data, len, algo); + + algo->init_fn(&ctx); + object_header_len = format_object_header(object_header, + sizeof(object_header), + type, len); + algo->update_fn(&ctx, object_header, object_header_len); + if (data) + algo->update_fn(&ctx, data, len); + else { + for (size_t i = len / BLOCK_SIZE; i; i--) + algo->update_fn(&ctx, zeros, BLOCK_SIZE); + algo->update_fn(&ctx, zeros, len % BLOCK_SIZE); + } + algo->final_oid_fn(oid, &ctx); +} + +/* + * Generate a pack file with a single large (>4GB) reachable object. + * + * Creates: + * 1. A large blob (all NUL bytes) + * 2. A tree containing that blob as "file" + * 3. A commit using that tree + * 4. The empty tree + * 5. A child commit using the empty tree + * + * This is useful for testing that Git can handle objects larger than 4GB. 
+ */ +static int generate_pack_with_large_object(const char *path, size_t blob_size, + const struct git_hash_algo *algo) +{ + FILE *f = xfopen(path, "wb"); + struct git_hash_ctx pack_ctx; + unsigned char pack_hash[GIT_MAX_RAWSZ]; + struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid; + struct strbuf buf = STRBUF_INIT; + const uint32_t object_count = 5; + struct pack_header pack_header = { + .hdr_signature = htonl(PACK_SIGNATURE), + .hdr_version = htonl(PACK_VERSION), + .hdr_entries = htonl(object_count), + }; + + algo->init_fn(&pack_ctx); + + /* Write pack header */ + fwrite_or_die(f, &pack_header, sizeof(pack_header)); + algo->update_fn(&pack_ctx, &pack_header, sizeof(pack_header)); + + /* 1. Write the large blob */ + write_pack_object(f, &pack_ctx, OBJ_BLOB, NULL, blob_size, &blob_oid, algo); + + /* 2. Write tree containing the blob as "file" */ + strbuf_addf(&buf, "100644 file%c", '\0'); + strbuf_add(&buf, blob_oid.hash, algo->rawsz); + write_pack_object(f, &pack_ctx, OBJ_TREE, buf.buf, buf.len, &tree_oid, algo); + + /* 3. Write commit using that tree */ + strbuf_reset(&buf); + strbuf_addf(&buf, + "tree %s\n" + "author A U Thor 1234567890 +0000\n" + "committer C O Mitter 1234567890 +0000\n" + "\n" + "Large blob commit\n", + oid_to_hex(&tree_oid)); + write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &commit_oid, algo); + + /* 4. Write the empty tree */ + write_pack_object(f, &pack_ctx, OBJ_TREE, "", 0, &empty_tree_oid, algo); + + /* 5. 
Write final commit using empty tree, with previous commit as parent */ + strbuf_reset(&buf); + strbuf_addf(&buf, + "tree %s\n" + "parent %s\n" + "author A U Thor 1234567890 +0000\n" + "committer C O Mitter 1234567890 +0000\n" + "\n" + "Empty tree commit\n", + oid_to_hex(&empty_tree_oid), + oid_to_hex(&commit_oid)); + write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &final_commit_oid, algo); + + /* Write pack trailer (checksum) */ + algo->final_fn(pack_hash, &pack_ctx); + fwrite_or_die(f, pack_hash, algo->rawsz); + if (fclose(f)) + die_errno(_("could not close '%s'"), path); + + strbuf_release(&buf); + + /* Print the final commit OID so caller can set up refs */ + printf("%s\n", oid_to_hex(&final_commit_oid)); + + return 0; +} + +static int cmd__synthesize__pack(int argc, const char **argv, + const char *prefix UNUSED, + struct repository *repo) +{ + int non_git; + int reachable_large = 0; + const struct git_hash_algo *algo; + size_t blob_size; + uintmax_t blob_size_u; + const char *path; + const char * const usage[] = { + "test-tool synthesize pack " + "--reachable-large ", + NULL + }; + struct option options[] = { + OPT_BOOL(0, "reachable-large", &reachable_large, + N_("write a pack with a single reachable large blob")), + OPT_END() + }; + + setup_git_directory_gently(&non_git); + repo = the_repository; + algo = repo->hash_algo; + + argc = parse_options(argc, argv, NULL, options, usage, + PARSE_OPT_KEEP_ARGV0); + if (argc != 3 || !reachable_large) + usage_with_options(usage, options); + + if (!git_parse_unsigned(argv[1], &blob_size_u, + maximum_unsigned_value_of_type(size_t))) + die(_("'%s' is not a valid blob size"), argv[1]); + blob_size = blob_size_u; + path = argv[2]; + + return !!generate_pack_with_large_object(path, blob_size, algo); +} + +int cmd__synthesize(int argc, const char **argv) +{ + const char *prefix = NULL; + char const * const synthesize_usage[] = { + "test-tool synthesize pack ", + NULL, + }; + parse_opt_subcommand_fn *fn = NULL; + 
struct option options[] = { + OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack), + OPT_END() + }; + argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0); + return !!fn(argc, argv, prefix, NULL); +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index a7abc618b3887e..b71a22b43bbc9e 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -82,6 +82,7 @@ static struct test_cmd cmds[] = { { "submodule-config", cmd__submodule_config }, { "submodule-nested-repo-config", cmd__submodule_nested_repo_config }, { "subprocess", cmd__subprocess }, + { "synthesize", cmd__synthesize }, { "trace2", cmd__trace2 }, { "truncate", cmd__truncate }, { "userdiff", cmd__userdiff }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index 7f150fa1eb9ad2..f2885b33d58aa8 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -75,6 +75,7 @@ int cmd__submodule(int argc, const char **argv); int cmd__submodule_config(int argc, const char **argv); int cmd__submodule_nested_repo_config(int argc, const char **argv); int cmd__subprocess(int argc, const char **argv); +int cmd__synthesize(int argc, const char **argv); int cmd__trace2(int argc, const char **argv); int cmd__truncate(int argc, const char **argv); int cmd__userdiff(int argc, const char **argv); From a3019888d8465e0f77926a91a20db170fef6989d Mon Sep 17 00:00:00 2001 From: Johannes Schindelin <johannes.schindelin@gmx.de> Date: Thu, 29 Jan 2026 14:07:14 +0100 Subject: [PATCH 6/6] t5608: add regression test for >4GB object clone The shift overflow bug in index-pack and unpack-objects caused incorrect object size calculation when the encoded size required more than 32 bits of shift. This would result in corrupted or failed unpacking of objects larger than 4GB. Add a test that creates a pack file containing a 4GB+ blob using the new 'test-tool synthesize pack --reachable-large' command, then clones the repository to verify the fix works correctly. 
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> --- t/t5608-clone-2gb.sh | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/t/t5608-clone-2gb.sh b/t/t5608-clone-2gb.sh index 87a8cd9f98381a..af93302ddec1cf 100755 --- a/t/t5608-clone-2gb.sh +++ b/t/t5608-clone-2gb.sh @@ -49,4 +49,41 @@ test_expect_success 'clone - with worktree, file:// protocol' ' ' +test_expect_success SIZE_T_IS_64BIT 'set up repo with >4GB object' ' + large_blob_size=$((4*1024*1024*1024+1)) && + git init --bare 4gb-repo && + head_oid=$(test-tool synthesize pack \ + --reachable-large "$large_blob_size" \ + 4gb-repo/objects/pack/test.pack) && + git -C 4gb-repo index-pack objects/pack/test.pack && + git -C 4gb-repo update-ref refs/heads/main $head_oid && + git -C 4gb-repo symbolic-ref HEAD refs/heads/main +' + +test_expect_success SIZE_T_IS_64BIT 'clone >4GB object via unpack-objects' ' + # The synthesized pack has five objects, so a large unpack limit keeps + # fetch-pack on the unpack-objects path. + git -c fetch.unpackLimit=100 clone --bare \ + "file://$(pwd)/4gb-repo" 4gb-clone-unpack && + + # Verify the large blob survived the clone by comparing its OID + # between source and clone. We cannot use "cat-file -s" because + # object_info.sizep is still unsigned long, which truncates >4GB + # sizes on Windows. OID equality proves content integrity since + # the clone already verified checksums via index-pack/unpack-objects. + source_blob=$(git -C 4gb-repo rev-parse main^:file) && + clone_blob=$(git -C 4gb-clone-unpack rev-parse main^:file) && + test "$source_blob" = "$clone_blob" +' + +test_expect_success SIZE_T_IS_64BIT 'clone with >4GB object via index-pack' ' + # Force fetch-pack to hand the pack to index-pack instead. 
+ git -c fetch.unpackLimit=1 clone --bare \ + "file://$(pwd)/4gb-repo" 4gb-clone-index && + + source_blob=$(git -C 4gb-repo rev-parse main^:file) && + clone_blob=$(git -C 4gb-clone-index rev-parse main^:file) && + test "$source_blob" = "$clone_blob" +' + test_done