From 4cf2143e02993440d6661ddccbd896bee7756c02 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Thu, 11 Aug 2016 05:26:36 -0400
Subject: pack-objects: break delta cycles before delta-search phase

We do not allow cycles in the delta graph of a pack (i.e., A
is a delta of B which is a delta of A) for the obvious
reason that you cannot actually access any of the objects in
such a case.

There's a last-ditch attempt to notice cycles during the
write phase, during which we issue a warning to the user and
write one of the objects out in full. However, this is
"last-ditch" for two reasons:

  1. By this time, it's too late to find another delta for
     the object, so the resulting pack is larger than it
     otherwise could be.

  2. The warning is there because this is something that
     _shouldn't_ ever happen. If it does, then either:

       a. a pack we are reusing deltas from had its own
          cycle

       b. we are reusing deltas from multiple packs, and
          we found a cycle among them (i.e., A is a delta of
          B in one pack, but B is a delta of A in another,
          and we choose to use both deltas).

       c. there is a bug in the delta-search code

     So this code serves as a final check that none of these
     things has happened, warns the user, and prevents us
     from writing a bogus pack.

Right now, (2b) should never happen because of the static
ordering of packs in want_object_in_pack(). If two objects
have a delta relationship, then they must be in the same
pack, and therefore we will find them from that same pack.

However, a future patch would like to change that static
ordering, which will make (2b) a common occurrence. In
preparation, we should be able to handle those kinds of
cycles better. This patch does by introducing a
cycle-breaking step during the get_object_details() phase,
when we are deciding which deltas can be reused. That gives
us the chance to feed the objects into the delta search as
if the cycle did not exist.

We'll leave the detection and warning in the write_object()
phase in place, as it still serves as a check for case (2c).

This does mean we will stop warning for (2a). That case is
caused by bogus input packs, and we ideally would warn the
user about it.  However, since those cycles show up after
picking reusable deltas, they look the same as (2b) to us;
our new code will break the cycles early and the last-ditch
check will never see them.

We could do analysis on any cycles that we find to
distinguish the two cases (i.e., it is a bogus pack if and
only if every delta in the cycle is in the same pack), but
we don't need to. If there is a cycle inside a pack, we'll
run into problems not only reusing the delta, but accessing
the object data at all. So when we try to dig up the actual
size of the object, we'll hit that same cycle and kick in
our usual complain-and-try-another-source code.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/pack-objects.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

(limited to 'builtin/pack-objects.c')

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index c4c2a3c79d..ed77355c6e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1494,6 +1494,83 @@ static int pack_offset_sort(const void *_a, const void *_b)
 			(a->in_pack_offset > b->in_pack_offset);
 }
 
+/*
+ * Drop an on-disk delta we were planning to reuse. Naively, this would
+ * just involve blanking out the "delta" field, but we have to deal
+ * with some extra book-keeping:
+ *
+ *   1. Removing ourselves from the delta_sibling linked list.
+ *
+ *   2. Updating our size/type to the non-delta representation. These were
+ *      either not recorded initially (size) or overwritten with the delta type
+ *      (type) when check_object() decided to reuse the delta.
+ */
+static void drop_reused_delta(struct object_entry *entry)
+{
+	struct object_entry **p = &entry->delta->delta_child;
+	struct object_info oi = OBJECT_INFO_INIT;
+
+	while (*p) {
+		if (*p == entry)
+			*p = (*p)->delta_sibling;
+		else
+			p = &(*p)->delta_sibling;
+	}
+	entry->delta = NULL;
+
+	oi.sizep = &entry->size;
+	oi.typep = &entry->type;
+	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+		/*
+		 * We failed to get the info from this pack for some reason;
+		 * fall back to sha1_object_info, which may find another copy.
+		 * And if that fails, the error will be recorded in entry->type
+		 * and dealt with in prepare_pack().
+		 */
+		entry->type = sha1_object_info(entry->idx.sha1, &entry->size);
+	}
+}
+
+/*
+ * Follow the chain of deltas from this entry onward, throwing away any links
+ * that cause us to hit a cycle (as determined by the DFS state flags in
+ * the entries).
+ */
+static void break_delta_chains(struct object_entry *entry)
+{
+	/* If it's not a delta, it can't be part of a cycle. */
+	if (!entry->delta) {
+		entry->dfs_state = DFS_DONE;
+		return;
+	}
+
+	switch (entry->dfs_state) {
+	case DFS_NONE:
+		/*
+		 * This is the first time we've seen the object. We mark it as
+		 * part of the active potential cycle and recurse.
+		 */
+		entry->dfs_state = DFS_ACTIVE;
+		break_delta_chains(entry->delta);
+		entry->dfs_state = DFS_DONE;
+		break;
+
+	case DFS_DONE:
+		/* object already examined, and not part of a cycle */
+		break;
+
+	case DFS_ACTIVE:
+		/*
+		 * We found a cycle that needs broken. It would be correct to
+		 * break any link in the chain, but it's convenient to
+		 * break this one.
+		 */
+		drop_reused_delta(entry);
+		entry->dfs_state = DFS_DONE;
+		break;
+	}
+}
+
 static void get_object_details(void)
 {
 	uint32_t i;
@@ -1511,6 +1588,13 @@ static void get_object_details(void)
 			entry->no_try_delta = 1;
 	}
 
+	/*
+	 * This must happen in a second pass, since we rely on the delta
+	 * information for the whole list being completed.
+	 */
+	for (i = 0; i < to_pack.nr_objects; i++)
+		break_delta_chains(&to_pack.objects[i]);
+
 	free(sorted_by_offset);
 }
 
-- 
cgit v1.2.1


From c9af708b1a5a92cbde2a0c9a75b13530f792ac84 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Thu, 11 Aug 2016 05:26:57 -0400
Subject: pack-objects: use mru list when iterating over packs

In the original implementation of want_object_in_pack(), we
always looked for the object in every pack, so the order did
not matter for performance.

As of the last few patches, however, we can now often break
out of the loop early after finding the first instance, and
avoid looking in the other packs at all. In this case, pack
order can make a big difference, because we'd like to find
the objects by looking at as few packs as possible.

This patch switches us to the same packed_git_mru list that
is now used by normal object lookups.

Here are timings for p5303 on linux.git:

Test                      HEAD^                HEAD
------------------------------------------------------------------------
5303.3: rev-list (1)      31.31(31.07+0.23)    31.28(31.00+0.27) -0.1%
5303.4: repack (1)        40.35(38.84+2.60)    40.53(39.31+2.32) +0.4%
5303.6: rev-list (50)     31.37(31.15+0.21)    31.41(31.16+0.24) +0.1%
5303.7: repack (50)       58.25(68.54+2.03)    47.28(57.66+1.89) -18.8%
5303.9: rev-list (1000)   31.91(31.57+0.33)    31.93(31.64+0.28) +0.1%
5303.10: repack (1000)    304.80(376.00+3.92)  87.21(159.54+2.84) -71.4%

The rev-list numbers are unchanged, which makes sense (they
are not exercising this code at all). The 50- and 1000-pack
repack cases show considerable improvement.

The single-pack repack case doesn't, of course; there's
nothing to improve. In fact, it gives us a baseline for how
fast we could possibly go. You can see that though rev-list
can approach the single-pack case even with 1000 packs,
repack doesn't. The reason is simple: the loop we are
optimizing is only part of what the repack is doing. After
the "counting" phase, we do delta compression, which is much
more expensive when there are multiple packs, because we
have fewer deltas we can reuse (you can also see that these
numbers come from a multicore machine; the CPU times are
much higher than the wall-clock times due to the delta
phase).

So the good news is that in cases with many packs, we used
to be dominated by the "counting" phase, and now we are
dominated by the delta compression (which is faster, and
which we have already parallelized).

Here are similar numbers for git.git:

Test                      HEAD^               HEAD
---------------------------------------------------------------------
5303.3: rev-list (1)      1.55(1.51+0.02)     1.54(1.53+0.00) -0.6%
5303.4: repack (1)        1.82(1.80+0.08)     1.82(1.78+0.09) +0.0%
5303.6: rev-list (50)     1.58(1.57+0.00)     1.58(1.56+0.01) +0.0%
5303.7: repack (50)       2.50(3.12+0.07)     2.31(2.95+0.06) -7.6%
5303.9: rev-list (1000)   2.22(2.20+0.02)     2.23(2.19+0.03) +0.5%
5303.10: repack (1000)    10.47(16.78+0.22)   7.50(13.76+0.22) -28.4%

Not as impressive in terms of percentage, but still
measurable wins.  If you look at the wall-clock time
improvements in the 1000-pack case, you can see that linux
improved by roughly 10x as many seconds as git. That's
because it has roughly 10x as many objects, and we'd expect
this improvement to scale linearly with the number of
objects (since the number of packs is kept constant). It's
just that the "counting" phase is a smaller percentage of
the total time spent for a git.git repack, and hence the
percentage win is smaller.

The implementation itself is a straightforward use of the
MRU code. We only bother marking a pack as used when we know
that we are able to break early out of the loop, for two
reasons:

  1. If we can't break out early, it does no good; we have
     to visit each pack anyway, so we might as well avoid
     even the minor overhead of managing the cache order.

  2. The mru_mark() function reorders the list, which would
     screw up our traversal. So it is only safe to mark when
     we are about to break out of the loop. We could record
     the found pack and mark it after the loop finishes, of
     course, but that's more complicated and it doesn't buy
     us anything due to (1).

Note that this reordering does have a potential impact on
the final pack, as we store only a single "found" pack for
each object, even if it is present in multiple packs. In
principle, any copy is acceptable, as they all refer to the
same content. But in practice, they may differ in whether
they are stored as deltas, against which base, etc. This may
have an impact on delta reuse, and even the delta search
(since we skip pairs that were already in the same pack).

It's not clear whether this change of order would hurt or
even help average cases, though. The most likely reason to
have duplicate objects is from the completion of thin packs
(e.g., you have some objects in a "base" pack, then receive
several pushes; the packs you receive may be thin on the
wire, with deltas that refer to bases outside the pack, but
we complete them with duplicate base objects when indexing
them).

In such a case the current code would always find the thin
duplicates (because we currently walk the packs in reverse
chronological order). Whereas with this patch, some of those
duplicates would be found in the base pack instead.

In my tests repacking a real-world case of linux.git with
3600 thin-pack pushes (on top of a large "base" pack), the
resulting pack was about 0.04% larger with this patch. On
the other hand, because we were more likely to hit the base
pack, there were more opportunities for delta reuse, and we
had 50,000 fewer objects to examine in the delta search.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/pack-objects.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'builtin/pack-objects.c')

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index ed77355c6e..977d25f495 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -23,6 +23,7 @@
 #include "reachable.h"
 #include "sha1-array.h"
 #include "argv-array.h"
+#include "mru.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -957,7 +958,7 @@ static int want_object_in_pack(const unsigned char *sha1,
 			       struct packed_git **found_pack,
 			       off_t *found_offset)
 {
-	struct packed_git *p;
+	struct mru_entry *entry;
 
 	if (!exclude && local && has_loose_object_nonlocal(sha1))
 		return 0;
@@ -965,7 +966,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 	*found_pack = NULL;
 	*found_offset = 0;
 
-	for (p = packed_git; p; p = p->next) {
+	for (entry = packed_git_mru->head; entry; entry = entry->next) {
+		struct packed_git *p = entry->item;
 		off_t offset = find_pack_entry_one(sha1, p);
 		if (offset) {
 			if (!*found_pack) {
@@ -992,8 +994,10 @@ static int want_object_in_pack(const unsigned char *sha1,
 			 * out of the loop to return 1 now.
 			 */
 			if (!ignore_packed_keep &&
-			    (!local || !have_non_local_packs))
+			    (!local || !have_non_local_packs)) {
+				mru_mark(packed_git_mru, entry);
 				break;
+			}
 
 			if (local && !p->pack_local)
 				return 0;
-- 
cgit v1.2.1