From cee1e7af1d979c26bb27a73f84cf53a8c860abdb Mon Sep 17 00:00:00 2001 From: Michael Tesch Date: Wed, 12 Apr 2017 14:38:30 -0400 Subject: merge: perform exact rename detection in linear time The current exact rename detection has order n^2 complexity. We can do better by using a map to first aggregate deletes and using that to match deletes to adds. This results in a substantial performance improvement for merges with a large quantity of adds and deletes. --- src/merge.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 152 insertions(+), 31 deletions(-) diff --git a/src/merge.c b/src/merge.c index 6e00b5adb..5d69d9b15 100644 --- a/src/merge.c +++ b/src/merge.c @@ -32,6 +32,8 @@ #include "commit.h" #include "oidarray.h" #include "merge_driver.h" +#include "oidmap.h" +#include "array.h" #include "git2/types.h" #include "git2/repository.h" @@ -1005,27 +1007,6 @@ struct merge_diff_similarity { size_t other_idx; }; -static int index_entry_similarity_exact( - git_repository *repo, - git_index_entry *a, - size_t a_idx, - git_index_entry *b, - size_t b_idx, - void **cache, - const git_merge_options *opts) -{ - GIT_UNUSED(repo); - GIT_UNUSED(a_idx); - GIT_UNUSED(b_idx); - GIT_UNUSED(cache); - GIT_UNUSED(opts); - - if (git_oid__cmp(&a->id, &b->id) == 0) - return 100; - - return 0; -} - static int index_entry_similarity_calc( void **out, git_repository *repo, @@ -1102,12 +1083,154 @@ static int index_entry_similarity_inexact( return score; } -static int merge_diff_mark_similarity( +/* Tracks deletes by oid for merge_diff_mark_similarity_exact(). This is a +* non-shrinking queue where next_pos is the next position to dequeue. +*/ +typedef struct { + git_array_t(size_t) arr; + size_t next_pos; + size_t first_entry; +} deletes_by_oid_queue; + +static void deletes_by_oid_free(git_oidmap *map) { + deletes_by_oid_queue *queue; + + if (!map) + return; + + git_oidmap_foreach_value(map, queue, { + git_array_clear(queue->arr); + }); + git_oidmap_free(map); +} + +static int deletes_by_oid_enqueue(git_oidmap *map, git_pool* pool, const git_oid *id, size_t idx) { + khint_t pos; + deletes_by_oid_queue *queue; + size_t *array_entry; + int error; + + pos = git_oidmap_lookup_index(map, id); + if (!git_oidmap_valid_index(map, pos)) { + queue = git_pool_malloc(pool, sizeof(deletes_by_oid_queue)); + GITERR_CHECK_ALLOC(queue); + + git_array_init(queue->arr); + queue->next_pos = 0; + queue->first_entry = idx; + + git_oidmap_insert(map, id, queue, &error); + if (error < 0) + return -1; + } else { + queue = git_oidmap_value_at(map, pos); + array_entry = git_array_alloc(queue->arr); + GITERR_CHECK_ALLOC(array_entry); + *array_entry = idx; + } + + return 0; +} + +static int deletes_by_oid_dequeue(size_t *idx, git_oidmap *map, const git_oid *id) { + khint_t pos; + deletes_by_oid_queue *queue; + size_t *array_entry; + + pos = git_oidmap_lookup_index(map, id); + + if (!git_oidmap_valid_index(map, pos)) + return GIT_ENOTFOUND; + + queue = git_oidmap_value_at(map, pos); + + if (queue->next_pos == 0) { + *idx = queue->first_entry; + } else { + array_entry = git_array_get(queue->arr, queue->next_pos - 1); + if (array_entry == NULL) + return GIT_ENOTFOUND; + + *idx = *array_entry; + } + + queue->next_pos++; + return 0; +} + +static int merge_diff_mark_similarity_exact( + git_merge_diff_list *diff_list, + struct merge_diff_similarity *similarity_ours, + struct merge_diff_similarity *similarity_theirs) +{ + size_t i, j; + git_merge_diff *conflict_src, *conflict_tgt; + git_oidmap *ours_deletes_by_oid, *theirs_deletes_by_oid; + int error = 0; + + if (!(ours_deletes_by_oid = git_oidmap_alloc()) || + !(theirs_deletes_by_oid = git_oidmap_alloc())) { + error = -1; + goto done; + } + + /* Build a map of object ids to conflicts */ + git_vector_foreach(&diff_list->conflicts, i, conflict_src) { + /* Items can be the source of a rename iff they have an item in the + * ancestor slot and lack an item in the ours or theirs slot. */ + if (!GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_src->ancestor_entry)) + continue; + + if (!GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_src->our_entry)) { + error = deletes_by_oid_enqueue(ours_deletes_by_oid, &diff_list->pool, &conflict_src->ancestor_entry.id, i); + if (error < 0) + goto done; + } + + if (!GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_src->their_entry)) { + error = deletes_by_oid_enqueue(theirs_deletes_by_oid, &diff_list->pool, &conflict_src->ancestor_entry.id, i); + if (error < 0) + goto done; + } + } + + git_vector_foreach(&diff_list->conflicts, j, conflict_tgt) { + if (GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_tgt->ancestor_entry)) + continue; + + if (GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_tgt->our_entry)) { + if (deletes_by_oid_dequeue(&i, ours_deletes_by_oid, &conflict_tgt->our_entry.id) == 0) { + similarity_ours[i].similarity = 100; + similarity_ours[i].other_idx = j; + + similarity_ours[j].similarity = 100; + similarity_ours[j].other_idx = i; + } + } + + if (GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_tgt->their_entry)) { + if (deletes_by_oid_dequeue(&i, theirs_deletes_by_oid, &conflict_tgt->their_entry.id) == 0) { + similarity_theirs[i].similarity = 100; + similarity_theirs[i].other_idx = j; + + similarity_theirs[j].similarity = 100; + similarity_theirs[j].other_idx = i; + } + } + } + +done: + deletes_by_oid_free(ours_deletes_by_oid); + deletes_by_oid_free(theirs_deletes_by_oid); + + return error; +} + +static int merge_diff_mark_similarity_inexact( git_repository *repo, git_merge_diff_list *diff_list, struct merge_diff_similarity *similarity_ours, struct merge_diff_similarity *similarity_theirs, - int (*similarity_fn)(git_repository *, git_index_entry *, size_t, git_index_entry *, size_t, void **, const git_merge_options *), void **cache, const git_merge_options *opts) { @@ -1132,7 +1255,7 @@ static int merge_diff_mark_similarity( if (GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_tgt->our_entry) && !GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_src->our_entry)) { - similarity = similarity_fn(repo, &conflict_src->ancestor_entry, i, &conflict_tgt->our_entry, our_idx, cache, opts); + similarity = index_entry_similarity_inexact(repo, &conflict_src->ancestor_entry, i, &conflict_tgt->our_entry, our_idx, cache, opts); if (similarity == GIT_EBUFS) continue; @@ -1158,7 +1281,7 @@ static int merge_diff_mark_similarity( if (GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_tgt->their_entry) && !GIT_MERGE_INDEX_ENTRY_EXISTS(conflict_src->their_entry)) { - similarity = similarity_fn(repo, &conflict_src->ancestor_entry, i, &conflict_tgt->their_entry, their_idx, cache, opts); + similarity = index_entry_similarity_inexact(repo, &conflict_src->ancestor_entry, i, &conflict_tgt->their_entry, their_idx, cache, opts); if (similarity > similarity_theirs[i].similarity && similarity > similarity_theirs[j].similarity) { @@ -1396,11 +1519,10 @@ int git_merge_diff_list__find_renames( /* Calculate similarity between items that were deleted from the ancestor * and added in the other branch. */ - if ((error = merge_diff_mark_similarity(repo, diff_list, similarity_ours, - similarity_theirs, index_entry_similarity_exact, NULL, opts)) < 0) + if ((error = merge_diff_mark_similarity_exact(diff_list, similarity_ours, similarity_theirs)) < 0) goto done; - if (diff_list->conflicts.length <= opts->target_limit) { + if (opts->rename_threshold < 100 && diff_list->conflicts.length <= opts->target_limit) { GITERR_CHECK_ALLOC_MULTIPLY(&cache_size, diff_list->conflicts.length, 3); cache = git__calloc(cache_size, sizeof(void *)); GITERR_CHECK_ALLOC(cache); @@ -1410,9 +1532,8 @@ int git_merge_diff_list__find_renames( if (src_count > opts->target_limit || tgt_count > opts->target_limit) { /* TODO: report! */ } else { - if ((error = merge_diff_mark_similarity( - repo, diff_list, similarity_ours, similarity_theirs, - index_entry_similarity_inexact, cache, opts)) < 0) + if ((error = merge_diff_mark_similarity_inexact( + repo, diff_list, similarity_ours, similarity_theirs, cache, opts)) < 0) goto done; } } -- cgit v1.2.1