diff --git a/Makefile b/Makefile index cedc234173e377..85405cb5b8d148 100644 --- a/Makefile +++ b/Makefile @@ -872,6 +872,7 @@ TEST_BUILTINS_OBJS += test-submodule-config.o TEST_BUILTINS_OBJS += test-submodule-nested-repo-config.o TEST_BUILTINS_OBJS += test-submodule.o TEST_BUILTINS_OBJS += test-subprocess.o +TEST_BUILTINS_OBJS += test-synthesize.o TEST_BUILTINS_OBJS += test-trace2.o TEST_BUILTINS_OBJS += test-truncate.o TEST_BUILTINS_OBJS += test-userdiff.o diff --git a/builtin/index-pack.c b/builtin/index-pack.c index ca7784dc2c4969..cc660582e97a4d 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -37,7 +37,7 @@ static const char index_pack_usage[] = struct object_entry { struct pack_idx_entry idx; - unsigned long size; + size_t size; unsigned char hdr_size; signed char type; signed char real_type; @@ -469,7 +469,7 @@ static int is_delta_type(enum object_type type) return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA); } -static void *unpack_entry_data(off_t offset, unsigned long size, +static void *unpack_entry_data(off_t offset, size_t size, enum object_type type, struct object_id *oid) { static char fixed_buf[8192]; @@ -524,7 +524,8 @@ static void *unpack_raw_entry(struct object_entry *obj, struct object_id *oid) { unsigned char *p; - unsigned long size, c; + size_t size; + unsigned long c; off_t base_offset; unsigned shift; void *data; @@ -542,7 +543,7 @@ static void *unpack_raw_entry(struct object_entry *obj, p = fill(1); c = *p; use(1); - size += (c & 0x7f) << shift; + size += ((size_t)c & 0x7f) << shift; shift += 7; } obj->size = size; diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index dd2480a73d2edf..aa4b1cb9b8a6c1 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -629,14 +629,21 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, struct packed_git *p = IN_PACK(entry); struct pack_window *w_curs = NULL; uint32_t pos; - off_t offset; + off_t offset, cur; enum object_type type = oe_type(entry); + enum object_type in_pack_type; off_t datalen; unsigned char header[MAX_PACK_OBJECT_HEADER], dheader[MAX_PACK_OBJECT_HEADER]; unsigned hdrlen; const unsigned hashsz = the_hash_algo->rawsz; - unsigned long entry_size = SIZE(entry); + size_t entry_size; + + cur = entry->in_pack_offset; + in_pack_type = unpack_object_header(p, &w_curs, &cur, &entry_size); + if (in_pack_type < 0) + die(_("write_reuse_object: unable to parse object header of %s"), + oid_to_hex(&entry->idx.oid)); if (DELTA(entry)) type = (allow_ofs_delta && DELTA(entry)->idx.offset) ? @@ -1087,7 +1094,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile, { off_t offset, next, cur; enum object_type type; - unsigned long size; + size_t size; offset = pack_pos_to_offset(reuse_packfile, pos); next = pack_pos_to_offset(reuse_packfile, pos + 1); @@ -2243,7 +2250,7 @@ static void check_object(struct object_entry *entry, uint32_t object_index) off_t ofs; unsigned char *buf, c; enum object_type type; - unsigned long in_pack_size; + size_t in_pack_size; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); @@ -2734,16 +2741,18 @@ unsigned long oe_get_size_slow(struct packing_data *pack, struct pack_window *w_curs; unsigned char *buf; enum object_type type; - unsigned long used, avail, size; + unsigned long used, avail; + size_t size; if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) { + unsigned long sz; packing_data_lock(&to_pack); if (odb_read_object_info(the_repository->objects, - &e->idx.oid, &size) < 0) + &e->idx.oid, &sz) < 0) die(_("unable to get size of %s"), oid_to_hex(&e->idx.oid)); packing_data_unlock(&to_pack); - return size; + return sz; } p = oe_in_pack(pack, e); diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index e01cf6e360f6d1..59a36c2481a457 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -533,7 +533,8 @@ static void unpack_one(unsigned nr) { unsigned shift; unsigned char *pack; - unsigned long size, c; + size_t size; + unsigned long c; enum object_type type; obj_list[nr].offset = consumed_bytes; @@ -548,7 +549,7 @@ static void unpack_one(unsigned nr) pack = fill(1); c = *pack; use(1); - size += (c & 0x7f) << shift; + size += ((size_t)c & 0x7f) << shift; shift += 7; } diff --git a/compat/zlib-compat.h b/compat/zlib-compat.h index ac0827662298af..5078c5ef6ce0e8 100644 --- a/compat/zlib-compat.h +++ b/compat/zlib-compat.h @@ -7,6 +7,8 @@ # define z_stream_s zng_stream_s # define gz_header_s zng_gz_header_s +# define adler32(adler, buf, len) zng_adler32(adler, buf, len) + # define crc32(crc, buf, len) zng_crc32(crc, buf, len) # define inflate(strm, bits) zng_inflate(strm, bits) diff --git a/delta.h b/delta.h index 8a56ec07992c75..fad68cfc45f6f4 100644 --- a/delta.h +++ b/delta.h @@ -86,8 +86,11 @@ void *patch_delta(const void *src_buf, unsigned long src_size, * This must be called twice on the delta data buffer, first to get the * expected source buffer size, and again to get the target buffer size. */ -static inline unsigned long get_delta_hdr_size(const unsigned char **datap, - const unsigned char *top) +/* + * Size_t variant that doesn't truncate - use for >4GB objects on Windows. + */ +static inline size_t get_delta_hdr_size_sz(const unsigned char **datap, + const unsigned char *top) { const unsigned char *data = *datap; size_t cmd, size = 0; @@ -98,6 +101,13 @@ static inline unsigned long get_delta_hdr_size(const unsigned char **datap, i += 7; } while (cmd & 0x80 && data < top); *datap = data; + return size; +} + +static inline unsigned long get_delta_hdr_size(const unsigned char **datap, + const unsigned char *top) +{ + size_t size = get_delta_hdr_size_sz(datap, top); return cast_size_t_to_ulong(size); } diff --git a/git-zlib.c b/git-zlib.c index df9604910e3fdf..b91cb323aee916 100644 --- a/git-zlib.c +++ b/git-zlib.c @@ -30,6 +30,9 @@ static const char *zerr_to_string(int status) */ /* #define ZLIB_BUF_MAX ((uInt)-1) */ #define ZLIB_BUF_MAX ((uInt) 1024 * 1024 * 1024) /* 1GB */ + +/* uLong is 32-bit on Windows, even on 64-bit systems */ +#define ULONG_MAX_VALUE maximum_unsigned_value_of_type(uLong) static inline uInt zlib_buf_cap(unsigned long len) { return (ZLIB_BUF_MAX < len) ? ZLIB_BUF_MAX : len; @@ -39,31 +42,37 @@ static void zlib_pre_call(git_zstream *s) { s->z.next_in = s->next_in; s->z.next_out = s->next_out; - s->z.total_in = s->total_in; - s->z.total_out = s->total_out; + s->z.total_in = (uLong)(s->total_in & ULONG_MAX_VALUE); + s->z.total_out = (uLong)(s->total_out & ULONG_MAX_VALUE); s->z.avail_in = zlib_buf_cap(s->avail_in); s->z.avail_out = zlib_buf_cap(s->avail_out); } static void zlib_post_call(git_zstream *s, int status) { - unsigned long bytes_consumed; - unsigned long bytes_produced; + size_t bytes_consumed; + size_t bytes_produced; bytes_consumed = s->z.next_in - s->next_in; bytes_produced = s->z.next_out - s->next_out; - if (s->z.total_out != s->total_out + bytes_produced) + /* + * zlib's total_out/total_in are uLong which may wrap for >4GB. + * We track our own totals and verify only the low bits match. + */ + if ((s->z.total_out & ULONG_MAX_VALUE) != + ((s->total_out + bytes_produced) & ULONG_MAX_VALUE)) BUG("total_out mismatch"); /* * zlib does not update total_in when it returns Z_NEED_DICT, * causing a mismatch here. Skip the sanity check in that case. */ if (status != Z_NEED_DICT && - s->z.total_in != s->total_in + bytes_consumed) + (s->z.total_in & ULONG_MAX_VALUE) != + ((s->total_in + bytes_consumed) & ULONG_MAX_VALUE)) BUG("total_in mismatch"); - s->total_out = s->z.total_out; - s->total_in = s->z.total_in; + s->total_out += bytes_produced; + s->total_in += bytes_consumed; /* zlib-ng marks `next_in` as `const`, so we have to cast it away. */ s->next_in = (unsigned char *) s->z.next_in; s->next_out = s->z.next_out; diff --git a/git-zlib.h b/git-zlib.h index 0e66fefa8c9f05..44380e8ad38305 100644 --- a/git-zlib.h +++ b/git-zlib.h @@ -7,8 +7,8 @@ typedef struct git_zstream { struct z_stream_s z; unsigned long avail_in; unsigned long avail_out; - unsigned long total_in; - unsigned long total_out; + size_t total_in; + size_t total_out; unsigned char *next_in; unsigned char *next_out; } git_zstream; diff --git a/object-file.c b/object-file.c index 2acc9522df2daa..0be2981c7a1f43 100644 --- a/object-file.c +++ b/object-file.c @@ -1118,7 +1118,7 @@ int odb_source_loose_write_stream(struct odb_source *source, } while (ret == Z_OK || ret == Z_BUF_ERROR); if (stream.total_in != len + hdrlen) - die(_("write stream object %ld != %"PRIuMAX), stream.total_in, + die(_("write stream object %"PRIuMAX" != %"PRIuMAX), (uintmax_t)stream.total_in, (uintmax_t)len + hdrlen); /* @@ -2326,6 +2326,7 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out, struct object_info oi = OBJECT_INFO_INIT; struct odb_loose_read_stream *st; unsigned long mapsize; + unsigned long size_ul; void *mapped; mapped = odb_source_loose_map_object(source, oid, &mapsize); @@ -2349,11 +2350,18 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out, goto error; } - oi.sizep = &st->base.size; + /* + * object_info.sizep is unsigned long* (32-bit on Windows), but + * st->base.size is size_t (64-bit). Use temporary variable. + * Note: loose objects >4GB would still truncate here, but such + * large loose objects are uncommon (they'd normally be packed). + */ + oi.sizep = &size_ul; oi.typep = &st->base.type; if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0) goto error; + st->base.size = size_ul; st->mapped = mapped; st->mapsize = mapsize; diff --git a/odb/streaming.c b/odb/streaming.c index 5927a12954ba59..af2adf5ce786d6 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -157,15 +157,26 @@ static int open_istream_incore(struct odb_read_stream **out, .base.read = read_istream_incore, }; struct odb_incore_read_stream *st; + unsigned long size_ul; int ret; oi.typep = &stream.base.type; - oi.sizep = &stream.base.size; + /* + * object_info.sizep is unsigned long* (32-bit on Windows), but + * stream.base.size is size_t (64-bit). We use a temporary variable + * because the types are incompatible. Note: this path still truncates + * for >4GB objects, but large objects should use pack streaming + * (packfile_store_read_object_stream) which handles size_t properly. + * This incore fallback is only used for small objects or when pack + * streaming is unavailable. + */ + oi.sizep = &size_ul; oi.contentp = (void **)&stream.buf; ret = odb_read_object_info_extended(odb, oid, &oi, OBJECT_INFO_DIE_IF_CORRUPT); if (ret) return ret; + stream.base.size = size_ul; CALLOC_ARRAY(st, 1); *st = stream; diff --git a/odb/streaming.h b/odb/streaming.h index c7861f7e13c606..517e2ea2d3f5c3 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -21,7 +21,7 @@ struct odb_read_stream { odb_read_stream_close_fn close; odb_read_stream_read_fn read; enum object_type type; - unsigned long size; /* inflated size of full object */ + size_t size; /* inflated size of full object */ }; /* diff --git a/oss-fuzz/fuzz-pack-headers.c b/oss-fuzz/fuzz-pack-headers.c index 150c0f5fa2d7ec..ef61ab577c5098 100644 --- a/oss-fuzz/fuzz-pack-headers.c +++ b/oss-fuzz/fuzz-pack-headers.c @@ -6,7 +6,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size); int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { enum object_type type; - unsigned long len; + size_t len; unpack_object_header_buffer((const unsigned char *)data, (unsigned long)size, &type, &len); diff --git a/pack-bitmap.c b/pack-bitmap.c index f6ec18d83afe21..f9af8a96bdf4ee 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -2270,7 +2270,7 @@ static int try_partial_reuse(struct bitmap_index *bitmap_git, { off_t delta_obj_offset; enum object_type type; - unsigned long size; + size_t size; if (pack_pos >= pack->p->num_objects) return -1; /* not actually in the pack */ diff --git a/pack-check.c b/pack-check.c index 79992bb509f473..2792f34d2595bf 100644 --- a/pack-check.c +++ b/pack-check.c @@ -110,7 +110,7 @@ static int verify_packfile(struct repository *r, void *data; struct object_id oid; enum object_type type; - unsigned long size; + size_t size; off_t curpos; int data_valid; @@ -143,7 +143,9 @@ static int verify_packfile(struct repository *r, data = NULL; data_valid = 0; } else { - data = unpack_entry(r, p, entries[i].offset, &type, &size); + unsigned long sz; + data = unpack_entry(r, p, entries[i].offset, &type, &sz); + size = sz; data_valid = 1; } diff --git a/packfile.c b/packfile.c index b012d648adaf2e..4208f53046b630 100644 --- a/packfile.c +++ b/packfile.c @@ -1133,7 +1133,7 @@ int packfile_store_count_objects(struct packfile_store *store, } unsigned long unpack_object_header_buffer(const unsigned char *buf, - unsigned long len, enum object_type *type, unsigned long *sizep) + unsigned long len, enum object_type *type, size_t *sizep) { unsigned shift; size_t size, c; @@ -1144,7 +1144,11 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, size = c & 15; shift = 4; while (c & 0x80) { - if (len <= used || (bitsizeof(long) - 7) < shift) { + /* + * Each continuation byte adds 7 bits. Ensure shift won't + * overflow size_t (use size_t not long for 64-bit on Windows). + */ + if (len <= used || (bitsizeof(size_t) - 7) < shift) { error("bad object header"); size = used = 0; break; @@ -1153,13 +1157,16 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf, size = st_add(size, st_left_shift(c & 0x7f, shift)); shift += 7; } - *sizep = cast_size_t_to_ulong(size); + *sizep = size; return used; } -unsigned long get_size_from_delta(struct packed_git *p, - struct pack_window **w_curs, - off_t curpos) +/* + * Size_t variant for >4GB delta results on Windows. + */ +static size_t get_size_from_delta_sz(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) { const unsigned char *data; unsigned char delta_head[20], *in; @@ -1206,16 +1213,24 @@ unsigned long get_size_from_delta(struct packed_git *p, data = delta_head; /* ignore base size */ - get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head)); /* Read the result size */ - return get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + return get_delta_hdr_size_sz(&data, delta_head+sizeof(delta_head)); +} + +unsigned long get_size_from_delta(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) +{ + size_t size = get_size_from_delta_sz(p, w_curs, curpos); + return cast_size_t_to_ulong(size); } int unpack_object_header(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, - unsigned long *sizep) + size_t *sizep) { unsigned char *base; unsigned long left; @@ -1367,7 +1382,7 @@ static enum object_type packed_to_object_type(struct repository *r, while (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) { off_t base_offset; - unsigned long size; + size_t size; /* Push the object we're going to leave behind */ if (poi_stack_nr >= poi_stack_alloc && poi_stack == small_poi_stack) { poi_stack_alloc = alloc_nr(poi_stack_nr); @@ -1586,7 +1601,7 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off uint32_t *maybe_index_pos, struct object_info *oi) { struct pack_window *w_curs = NULL; - unsigned long size; + size_t size; off_t curpos = obj_offset; enum object_type type = OBJ_NONE; uint32_t pack_pos; @@ -1614,14 +1629,18 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off ret = -1; goto out; } - *oi->sizep = get_size_from_delta(p, &w_curs, tmp_pos); - if (*oi->sizep == 0) { + /* + * Use size_t variant to avoid die() on >4GB deltas. + * oi->sizep is unsigned long, so truncation may occur, + * but streaming code uses its own size_t tracking. + */ + size = get_size_from_delta_sz(p, &w_curs, tmp_pos); + if (size == 0) { ret = -1; goto out; } - } else { - *oi->sizep = size; } + *oi->sizep = (unsigned long)size; } if (oi->disk_sizep || (oi->mtimep && p->is_cruft)) { @@ -1778,7 +1797,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset, struct pack_window *w_curs = NULL; off_t curpos = obj_offset; void *data = NULL; - unsigned long size; + size_t size; enum object_type type; struct unpack_entry_stack_ent small_delta_stack[UNPACK_ENTRY_STACK_PREALLOC]; struct unpack_entry_stack_ent *delta_stack = small_delta_stack; @@ -1943,8 +1962,10 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset, (uintmax_t)curpos, p->pack_name); data = NULL; } else { + unsigned long sz; data = patch_delta(base, base_size, delta_data, - delta_size, &size); + delta_size, &sz); + size = sz; /* * We could not apply the delta; warn the user, but @@ -2929,7 +2950,7 @@ int packfile_read_object_stream(struct odb_read_stream **out, struct odb_packed_read_stream *stream; struct pack_window *window = NULL; enum object_type in_pack_type; - unsigned long size; + size_t size; in_pack_type = unpack_object_header(pack, &window, &offset, &size); unuse_pack(&window); diff --git a/packfile.h b/packfile.h index 9b647da7dda7c1..49d6bdecf6ea18 100644 --- a/packfile.h +++ b/packfile.h @@ -456,9 +456,9 @@ off_t find_pack_entry_one(const struct object_id *oid, struct packed_git *); int is_pack_valid(struct packed_git *); void *unpack_entry(struct repository *r, struct packed_git *, off_t, enum object_type *, unsigned long *); -unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep); +unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, size_t *sizep); unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t); -int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *); +int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, size_t *); off_t get_delta_base(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, enum object_type type, off_t delta_obj_offset); diff --git a/t/helper/meson.build b/t/helper/meson.build index 675e64c0101b61..3235f10ab8aae1 100644 --- a/t/helper/meson.build +++ b/t/helper/meson.build @@ -69,6 +69,7 @@ test_tool_sources = [ 'test-submodule-nested-repo-config.c', 'test-submodule.c', 'test-subprocess.c', + 'test-synthesize.c', 'test-tool.c', 'test-trace2.c', 'test-truncate.c', diff --git a/t/helper/test-synthesize.c b/t/helper/test-synthesize.c new file mode 100644 index 00000000000000..3ce7078078efff --- /dev/null +++ b/t/helper/test-synthesize.c @@ -0,0 +1,250 @@ +#define USE_THE_REPOSITORY_VARIABLE + +#include "test-tool.h" +#include "git-compat-util.h" +#include "git-zlib.h" +#include "hash.h" +#include "hex.h" +#include "object-file.h" +#include "object.h" +#include "pack.h" +#include "parse-options.h" +#include "parse.h" +#include "repository.h" +#include "setup.h" +#include "strbuf.h" +#include "write-or-die.h" + +#define BLOCK_SIZE 0xffff +static const unsigned char zeros[BLOCK_SIZE]; + +/* + * Write data as an uncompressed zlib stream. + * For data larger than 64KB, writes multiple uncompressed blocks. + * If data is NULL, writes zeros. + * Updates the pack checksum context. + */ +static void write_uncompressed_zlib(FILE *f, struct git_hash_ctx *pack_ctx, + const void *data, size_t len, + const struct git_hash_algo *algo) +{ + unsigned char zlib_header[2] = { 0x78, 0x01 }; /* CMF, FLG */ + unsigned char block_header[5]; + const unsigned char *p = data; + size_t remaining = len; + uint32_t adler = 1L; /* adler32 initial value */ + unsigned char adler_buf[4]; + + /* Write zlib header */ + fwrite_or_die(f, zlib_header, sizeof(zlib_header)); + algo->update_fn(pack_ctx, zlib_header, 2); + + /* Write uncompressed blocks (max 64KB each) */ + do { + size_t block_len = remaining > BLOCK_SIZE ? BLOCK_SIZE : remaining; + int is_final = (block_len == remaining); + const unsigned char *block_data = data ? p : zeros; + + block_header[0] = is_final ? 0x01 : 0x00; + block_header[1] = block_len & 0xff; + block_header[2] = (block_len >> 8) & 0xff; + block_header[3] = block_header[1] ^ 0xff; + block_header[4] = block_header[2] ^ 0xff; + + fwrite_or_die(f, block_header, sizeof(block_header)); + algo->update_fn(pack_ctx, block_header, 5); + + if (block_len) { + fwrite_or_die(f, block_data, block_len); + algo->update_fn(pack_ctx, block_data, block_len); + adler = adler32(adler, block_data, block_len); + } + + if (data) + p += block_len; + remaining -= block_len; + } while (remaining > 0); + + /* Write adler32 checksum */ + put_be32(adler_buf, adler); + fwrite_or_die(f, adler_buf, sizeof(adler_buf)); + algo->update_fn(pack_ctx, adler_buf, 4); +} + +/* + * Write an uncompressed object to the pack file. + * If `data == NULL`, it is treated like a buffer to NUL bytes. + * Updates the pack checksum context. + */ +static void write_pack_object(FILE *f, struct git_hash_ctx *pack_ctx, + enum object_type type, + const void *data, size_t len, + struct object_id *oid, + const struct git_hash_algo *algo) +{ + unsigned char pack_header[MAX_PACK_OBJECT_HEADER]; + char object_header[32]; + int pack_header_len, object_header_len; + struct git_hash_ctx ctx; + + /* Write pack object header */ + pack_header_len = encode_in_pack_object_header(pack_header, + sizeof(pack_header), + type, len); + fwrite_or_die(f, pack_header, pack_header_len); + algo->update_fn(pack_ctx, pack_header, pack_header_len); + + /* Write the data as uncompressed zlib */ + write_uncompressed_zlib(f, pack_ctx, data, len, algo); + + algo->init_fn(&ctx); + object_header_len = format_object_header(object_header, + sizeof(object_header), + type, len); + algo->update_fn(&ctx, object_header, object_header_len); + if (data) + algo->update_fn(&ctx, data, len); + else { + for (size_t i = len / BLOCK_SIZE; i; i--) + algo->update_fn(&ctx, zeros, BLOCK_SIZE); + algo->update_fn(&ctx, zeros, len % BLOCK_SIZE); + } + algo->final_oid_fn(oid, &ctx); +} + +/* + * Generate a pack file with a single large (>4GB) reachable object. + * + * Creates: + * 1. A large blob (all NUL bytes) + * 2. A tree containing that blob as "file" + * 3. A commit using that tree + * 4. The empty tree + * 5. A child commit using the empty tree + * + * This is useful for testing that Git can handle objects larger than 4GB. + */ +static int generate_pack_with_large_object(const char *path, size_t blob_size, + const struct git_hash_algo *algo) +{ + FILE *f = xfopen(path, "wb"); + struct git_hash_ctx pack_ctx; + unsigned char pack_hash[GIT_MAX_RAWSZ]; + struct object_id blob_oid, tree_oid, commit_oid, empty_tree_oid, final_commit_oid; + struct strbuf buf = STRBUF_INIT; + const uint32_t object_count = 5; + struct pack_header pack_header = { + .hdr_signature = htonl(PACK_SIGNATURE), + .hdr_version = htonl(PACK_VERSION), + .hdr_entries = htonl(object_count), + }; + + algo->init_fn(&pack_ctx); + + /* Write pack header */ + fwrite_or_die(f, &pack_header, sizeof(pack_header)); + algo->update_fn(&pack_ctx, &pack_header, sizeof(pack_header)); + + /* 1. Write the large blob */ + write_pack_object(f, &pack_ctx, OBJ_BLOB, NULL, blob_size, &blob_oid, algo); + + /* 2. Write tree containing the blob as "file" */ + strbuf_addf(&buf, "100644 file%c", '\0'); + strbuf_add(&buf, blob_oid.hash, algo->rawsz); + write_pack_object(f, &pack_ctx, OBJ_TREE, buf.buf, buf.len, &tree_oid, algo); + + /* 3. Write commit using that tree */ + strbuf_reset(&buf); + strbuf_addf(&buf, + "tree %s\n" + "author A U Thor 1234567890 +0000\n" + "committer C O Mitter 1234567890 +0000\n" + "\n" + "Large blob commit\n", + oid_to_hex(&tree_oid)); + write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &commit_oid, algo); + + /* 4. Write the empty tree */ + write_pack_object(f, &pack_ctx, OBJ_TREE, "", 0, &empty_tree_oid, algo); + + /* 5. Write final commit using empty tree, with previous commit as parent */ + strbuf_reset(&buf); + strbuf_addf(&buf, + "tree %s\n" + "parent %s\n" + "author A U Thor 1234567890 +0000\n" + "committer C O Mitter 1234567890 +0000\n" + "\n" + "Empty tree commit\n", + oid_to_hex(&empty_tree_oid), + oid_to_hex(&commit_oid)); + write_pack_object(f, &pack_ctx, OBJ_COMMIT, buf.buf, buf.len, &final_commit_oid, algo); + + /* Write pack trailer (checksum) */ + algo->final_fn(pack_hash, &pack_ctx); + fwrite_or_die(f, pack_hash, algo->rawsz); + if (fclose(f)) + die_errno(_("could not close '%s'"), path); + + strbuf_release(&buf); + + /* Print the final commit OID so caller can set up refs */ + printf("%s\n", oid_to_hex(&final_commit_oid)); + + return 0; +} + +static int cmd__synthesize__pack(int argc, const char **argv, + const char *prefix UNUSED, + struct repository *repo) +{ + int non_git; + int reachable_large = 0; + const struct git_hash_algo *algo; + size_t blob_size; + uintmax_t blob_size_u; + const char *path; + const char * const usage[] = { + "test-tool synthesize pack " + "--reachable-large ", + NULL + }; + struct option options[] = { + OPT_BOOL(0, "reachable-large", &reachable_large, + N_("write a pack with a single reachable large blob")), + OPT_END() + }; + + setup_git_directory_gently(&non_git); + repo = the_repository; + algo = repo->hash_algo; + + argc = parse_options(argc, argv, NULL, options, usage, + PARSE_OPT_KEEP_ARGV0); + if (argc != 3 || !reachable_large) + usage_with_options(usage, options); + + if (!git_parse_unsigned(argv[1], &blob_size_u, + maximum_unsigned_value_of_type(size_t))) + die(_("'%s' is not a valid blob size"), argv[1]); + blob_size = blob_size_u; + path = argv[2]; + + return !!generate_pack_with_large_object(path, blob_size, algo); +} + +int cmd__synthesize(int argc, const char **argv) +{ + const char *prefix = NULL; + char const * const synthesize_usage[] = { + "test-tool synthesize pack ", + NULL, + }; + parse_opt_subcommand_fn *fn = NULL; + struct option options[] = { + OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack), + OPT_END() + }; + argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0); + return !!fn(argc, argv, prefix, NULL); +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index a7abc618b3887e..b71a22b43bbc9e 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -82,6 +82,7 @@ static struct test_cmd cmds[] = { { "submodule-config", cmd__submodule_config }, { "submodule-nested-repo-config", cmd__submodule_nested_repo_config }, { "subprocess", cmd__subprocess }, + { "synthesize", cmd__synthesize }, { "trace2", cmd__trace2 }, { "truncate", cmd__truncate }, { "userdiff", cmd__userdiff }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index 7f150fa1eb9ad2..f2885b33d58aa8 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -75,6 +75,7 @@ int cmd__submodule(int argc, const char **argv); int cmd__submodule_config(int argc, const char **argv); int cmd__submodule_nested_repo_config(int argc, const char **argv); int cmd__subprocess(int argc, const char **argv); +int cmd__synthesize(int argc, const char **argv); int cmd__trace2(int argc, const char **argv); int cmd__truncate(int argc, const char **argv); int cmd__userdiff(int argc, const char **argv); diff --git a/t/t5608-clone-2gb.sh b/t/t5608-clone-2gb.sh index 87a8cd9f98381a..af93302ddec1cf 100755 --- a/t/t5608-clone-2gb.sh +++ b/t/t5608-clone-2gb.sh @@ -49,4 +49,41 @@ test_expect_success 'clone - with worktree, file:// protocol' ' ' +test_expect_success SIZE_T_IS_64BIT 'set up repo with >4GB object' ' + large_blob_size=$((4*1024*1024*1024+1)) && + git init --bare 4gb-repo && + head_oid=$(test-tool synthesize pack \ + --reachable-large "$large_blob_size" \ + 4gb-repo/objects/pack/test.pack) && + git -C 4gb-repo index-pack objects/pack/test.pack && + git -C 4gb-repo update-ref refs/heads/main $head_oid && + git -C 4gb-repo symbolic-ref HEAD refs/heads/main +' + +test_expect_success SIZE_T_IS_64BIT 'clone >4GB object via unpack-objects' ' + # The synthesized pack has five objects, so a large unpack limit keeps + # fetch-pack on the unpack-objects path. + git -c fetch.unpackLimit=100 clone --bare \ + "file://$(pwd)/4gb-repo" 4gb-clone-unpack && + + # Verify the large blob survived the clone by comparing its OID + # between source and clone. We cannot use "cat-file -s" because + # object_info.sizep is still unsigned long, which truncates >4GB + # sizes on Windows. OID equality proves content integrity since + # the clone already verified checksums via index-pack/unpack-objects. + source_blob=$(git -C 4gb-repo rev-parse main^:file) && + clone_blob=$(git -C 4gb-clone-unpack rev-parse main^:file) && + test "$source_blob" = "$clone_blob" +' + +test_expect_success SIZE_T_IS_64BIT 'clone with >4GB object via index-pack' ' + # Force fetch-pack to hand the pack to index-pack instead. + git -c fetch.unpackLimit=1 clone --bare \ + "file://$(pwd)/4gb-repo" 4gb-clone-index && + + source_blob=$(git -C 4gb-repo rev-parse main^:file) && + clone_blob=$(git -C 4gb-clone-index rev-parse main^:file) && + test "$source_blob" = "$clone_blob" +' + test_done