diff options
author | Junio C Hamano <gitster@pobox.com> | 2016-10-10 14:03:46 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2016-10-10 14:03:47 -0700 |
commit | e6e24c94df9df6d39f2316113c14fe07d2ab03d7 (patch) | |
tree | 1f883be2ac3f3a84ce5af3cd1e9e32c45648e21e | |
parent | b8688adb12d086b161aa7c369126bdd56843a01b (diff) | |
parent | c9af708b1a5a92cbde2a0c9a75b13530f792ac84 (diff) | |
download | git-e6e24c94df9df6d39f2316113c14fe07d2ab03d7.tar.gz |
Merge branch 'jk/pack-objects-optim-mru'
"git pack-objects" in a repository with many packfiles used to
spend a lot of time looking for/at objects in them; the accesses to
the packfiles are now optimized by checking the most-recently-used
packfile first.
* jk/pack-objects-optim-mru:
pack-objects: use mru list when iterating over packs
pack-objects: break delta cycles before delta-search phase
sha1_file: make packed_object_info public
provide an initializer for "struct object_info"
-rw-r--r-- | builtin/cat-file.c | 5 | ||||
-rw-r--r-- | builtin/pack-objects.c | 92 | ||||
-rw-r--r-- | cache.h | 8 | ||||
-rw-r--r-- | pack-objects.h | 9 | ||||
-rw-r--r-- | sha1_file.c | 10 | ||||
-rw-r--r-- | streaming.c | 2 | ||||
-rwxr-xr-x | t/t5314-pack-cycle-detection.sh | 113 |
7 files changed, 227 insertions, 12 deletions
diff --git a/builtin/cat-file.c b/builtin/cat-file.c index cca97a86c0..30383e9eb4 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -53,7 +53,7 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name, char *buf; unsigned long size; struct object_context obj_context; - struct object_info oi = {NULL}; + struct object_info oi = OBJECT_INFO_INIT; struct strbuf sb = STRBUF_INIT; unsigned flags = LOOKUP_REPLACE_OBJECT; const char *path = force_path; @@ -449,8 +449,7 @@ static int batch_objects(struct batch_options *opt) data.split_on_whitespace = 1; if (opt->all_objects) { - struct object_info empty; - memset(&empty, 0, sizeof(empty)); + struct object_info empty = OBJECT_INFO_INIT; if (!memcmp(&data.info, &empty, sizeof(empty))) data.skip_object_info = 1; } diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 8aeba6a6e1..1e7c2a98a5 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -23,6 +23,7 @@ #include "reachable.h" #include "sha1-array.h" #include "argv-array.h" +#include "mru.h" static const char *pack_usage[] = { N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"), @@ -994,7 +995,7 @@ static int want_object_in_pack(const unsigned char *sha1, struct packed_git **found_pack, off_t *found_offset) { - struct packed_git *p; + struct mru_entry *entry; int want; if (!exclude && local && has_loose_object_nonlocal(sha1)) @@ -1011,7 +1012,8 @@ static int want_object_in_pack(const unsigned char *sha1, return want; } - for (p = packed_git; p; p = p->next) { + for (entry = packed_git_mru->head; entry; entry = entry->next) { + struct packed_git *p = entry->item; off_t offset; if (p == *found_pack) @@ -1027,6 +1029,8 @@ static int want_object_in_pack(const unsigned char *sha1, *found_pack = p; } want = want_found_object(exclude, p); + if (!exclude && want > 0) + mru_mark(packed_git_mru, entry); if (want != -1) return want; } @@ -1527,6 +1531,83 @@ static int pack_offset_sort(const void *_a, const void *_b) (a->in_pack_offset > b->in_pack_offset); } +/* + * Drop an on-disk delta we were planning to reuse. Naively, this would + * just involve blanking out the "delta" field, but we have to deal + * with some extra book-keeping: + * + * 1. Removing ourselves from the delta_sibling linked list. + * + * 2. Updating our size/type to the non-delta representation. These were + * either not recorded initially (size) or overwritten with the delta type + * (type) when check_object() decided to reuse the delta. + */ +static void drop_reused_delta(struct object_entry *entry) +{ + struct object_entry **p = &entry->delta->delta_child; + struct object_info oi = OBJECT_INFO_INIT; + + while (*p) { + if (*p == entry) + *p = (*p)->delta_sibling; + else + p = &(*p)->delta_sibling; + } + entry->delta = NULL; + + oi.sizep = &entry->size; + oi.typep = &entry->type; + if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) { + /* + * We failed to get the info from this pack for some reason; + * fall back to sha1_object_info, which may find another copy. + * And if that fails, the error will be recorded in entry->type + * and dealt with in prepare_pack(). + */ + entry->type = sha1_object_info(entry->idx.sha1, &entry->size); + } +} + +/* + * Follow the chain of deltas from this entry onward, throwing away any links + * that cause us to hit a cycle (as determined by the DFS state flags in + * the entries). + */ +static void break_delta_chains(struct object_entry *entry) +{ + /* If it's not a delta, it can't be part of a cycle. */ + if (!entry->delta) { + entry->dfs_state = DFS_DONE; + return; + } + + switch (entry->dfs_state) { + case DFS_NONE: + /* + * This is the first time we've seen the object. We mark it as + * part of the active potential cycle and recurse. + */ + entry->dfs_state = DFS_ACTIVE; + break_delta_chains(entry->delta); + entry->dfs_state = DFS_DONE; + break; + + case DFS_DONE: + /* object already examined, and not part of a cycle */ + break; + + case DFS_ACTIVE: + /* + * We found a cycle that needs broken. It would be correct to + * break any link in the chain, but it's convenient to + * break this one. + */ + drop_reused_delta(entry); + entry->dfs_state = DFS_DONE; + break; + } +} + static void get_object_details(void) { uint32_t i; @@ -1544,6 +1625,13 @@ static void get_object_details(void) entry->no_try_delta = 1; } + /* + * This must happen in a second pass, since we rely on the delta + * information for the whole list being completed. + */ + for (i = 0; i < to_pack.nr_objects; i++) + break_delta_chains(&to_pack.objects[i]); + free(sorted_by_offset); } @@ -1602,7 +1602,15 @@ struct object_info { } packed; } u; }; + +/* + * Initializer for a "struct object_info" that wants no items. You may + * also memset() the memory to all-zeroes. + */ +#define OBJECT_INFO_INIT {NULL} + extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags); +extern int packed_object_info(struct packed_git *pack, off_t offset, struct object_info *); /* Dumb servers support */ extern int update_server_info(int); diff --git a/pack-objects.h b/pack-objects.h index d1b98b30ff..cc9b9a9b90 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -27,6 +27,15 @@ struct object_entry { unsigned no_try_delta:1; unsigned tagged:1; /* near the very tip of refs */ unsigned filled:1; /* assigned write-order */ + + /* + * State flags for depth-first search used for analyzing delta cycles. + */ + enum { + DFS_NONE = 0, + DFS_ACTIVE, + DFS_DONE + } dfs_state; }; struct packing_data { diff --git a/sha1_file.c b/sha1_file.c index 94daf31ec6..309e87d987 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1826,11 +1826,9 @@ static int parse_sha1_header_extended(const char *hdr, struct object_info *oi, int parse_sha1_header(const char *hdr, unsigned long *sizep) { - struct object_info oi; + struct object_info oi = OBJECT_INFO_INIT; oi.sizep = sizep; - oi.typename = NULL; - oi.typep = NULL; return parse_sha1_header_extended(hdr, &oi, LOOKUP_REPLACE_OBJECT); } @@ -2068,8 +2066,8 @@ unwind: goto out; } -static int packed_object_info(struct packed_git *p, off_t obj_offset, - struct object_info *oi) +int packed_object_info(struct packed_git *p, off_t obj_offset, + struct object_info *oi) { struct pack_window *w_curs = NULL; unsigned long size; @@ -2840,7 +2838,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi, int sha1_object_info(const unsigned char *sha1, unsigned long *sizep) { enum object_type type; - struct object_info oi = {NULL}; + struct object_info oi = OBJECT_INFO_INIT; oi.typep = &type; oi.sizep = sizep; diff --git a/streaming.c b/streaming.c index 3f017a1c05..9afa66b8be 100644 --- a/streaming.c +++ b/streaming.c @@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1, struct stream_filter *filter) { struct git_istream *st; - struct object_info oi = {NULL}; + struct object_info oi = OBJECT_INFO_INIT; const unsigned char *real = lookup_replace_object(sha1); enum input_source src = istream_source(real, type, &oi); diff --git a/t/t5314-pack-cycle-detection.sh b/t/t5314-pack-cycle-detection.sh new file mode 100755 index 0000000000..f7dbdfb412 --- /dev/null +++ b/t/t5314-pack-cycle-detection.sh @@ -0,0 +1,113 @@ +#!/bin/sh + +test_description='test handling of inter-pack delta cycles during repack + +The goal here is to create a situation where we have two blobs, A and B, with A +as a delta against B in one pack, and vice versa in the other. Then if we can +persuade a full repack to find A from one pack and B from the other, that will +give us a cycle when we attempt to reuse those deltas. + +The trick is in the "persuade" step, as it depends on the internals of how +pack-objects picks which pack to reuse the deltas from. But we can assume +that it does so in one of two general strategies: + + 1. Using a static ordering of packs. In this case, no inter-pack cycles can + happen. Any objects with a delta relationship must be present in the same + pack (i.e., no "--thin" packs on disk), so we will find all related objects + from that pack. So assuming there are no cycles within a single pack (and + we avoid generating them via pack-objects or importing them via + index-pack), then our result will have no cycles. + + So this case should pass the tests no matter how we arrange things. + + 2. Picking the next pack to examine based on locality (i.e., where we found + something else recently). + + In this case, we want to make sure that we find the delta versions of A and + B and not their base versions. We can do this by putting two blobs in each + pack. The first is a "dummy" blob that can only be found in the pack in + question. And then the second is the actual delta we want to find. + + The two blobs must be present in the same tree, not present in other trees, + and the dummy pathname must sort before the delta path. + +The setup below focuses on case 2. We have two commits HEAD and HEAD^, each +which has two files: "dummy" and "file". Then we can make two packs which +contain: + + [pack one] + HEAD:dummy + HEAD:file (as delta against HEAD^:file) + HEAD^:file (as base) + + [pack two] + HEAD^:dummy + HEAD^:file (as delta against HEAD:file) + HEAD:file (as base) + +Then no matter which order we start looking at the packs in, we know that we +will always find a delta for "file", because its lookup will always come +immediately after the lookup for "dummy". +' +. ./test-lib.sh + + + +# Create a pack containing the the tree $1 and blob $1:file, with +# the latter stored as a delta against $2:file. +# +# We convince pack-objects to make the delta in the direction of our choosing +# by marking $2 as a preferred-base edge. That results in $1:file as a thin +# delta, and index-pack completes it by adding $2:file as a base. +# +# Note that the two variants of "file" must be similar enough to convince git +# to create the delta. +make_pack () { + { + printf '%s\n' "-$(git rev-parse $2)" + printf '%s dummy\n' "$(git rev-parse $1:dummy)" + printf '%s file\n' "$(git rev-parse $1:file)" + } | + git pack-objects --stdout | + git index-pack --stdin --fix-thin +} + +test_expect_success 'setup' ' + test-genrandom base 4096 >base && + for i in one two + do + # we want shared content here to encourage deltas... + cp base file && + echo $i >>file && + + # ...whereas dummy should be short, because we do not want + # deltas that would create duplicates when we --fix-thin + echo $i >dummy && + + git add file dummy && + test_tick && + git commit -m $i || + return 1 + done && + + make_pack HEAD^ HEAD && + make_pack HEAD HEAD^ +' + +test_expect_success 'repack' ' + # We first want to check that we do not have any internal errors, + # and also that we do not hit the last-ditch cycle-breaking code + # in write_object(), which will issue a warning to stderr. + >expect && + git repack -ad 2>stderr && + test_cmp expect stderr && + + # And then double-check that the resulting pack is usable (i.e., + # we did not fail to notice any cycles). We know we are accessing + # the objects via the new pack here, because "repack -d" will have + # removed the others. + git cat-file blob HEAD:file >/dev/null && + git cat-file blob HEAD^:file >/dev/null +' + +test_done |