diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2007-10-25 11:23:26 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2007-10-26 23:18:06 -0700 |
commit | 9027f53cb5051bf83a0254e7f8aeb5d1a206de0b (patch) | |
tree | 3ee3d48a47f89d7c41c5a9e4430f0db6507f4d9e /diffcore-rename.c | |
parent | 644797119d7a3b7a043a51a9cccd8758f8451f91 (diff) | |
download | git-9027f53cb5051bf83a0254e7f8aeb5d1a206de0b.tar.gz |
Do linear-time/space rename logic for exact renames
This implements a smarter rename detector for exact renames, which
rather than doing a pairwise comparison (time O(m*n)) will just hash the
files into a hash-table (size O(n+m)), and only do pairwise comparisons
to renames that have the same hash (time O(n+m) except for unrealistic
hash collissions, which we just cull aggressively).
Admittedly the exact rename case is not nearly as interesting as the
generic case, but it's an important case none-the-less. A similar general
approach should work for the generic case too, but even then you do need
to handle the exact renames/copies separately (to avoid the inevitable
added cost factor that comes from the _size_ of the file), so this is
worth doing.
In the expectation that we will indeed do the same hashing trick for the
general rename case, this code uses a generic hash-table implementation
that can be used for other things too. In fact, we might be able to
consolidate some of our existing hash tables with the new generic code
in hash.[ch].
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'diffcore-rename.c')
-rw-r--r-- | diffcore-rename.c | 211 |
1 files changed, 148 insertions, 63 deletions
diff --git a/diffcore-rename.c b/diffcore-rename.c index edb2424d13..e7e370b2cc 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -4,6 +4,7 @@ #include "cache.h" #include "diff.h" #include "diffcore.h" +#include "hash.h" /* Table of rename/copy destinations */ @@ -93,29 +94,6 @@ static struct diff_rename_src *register_rename_src(struct diff_filespec *one, return &(rename_src[first]); } -static int is_exact_match(struct diff_filespec *src, - struct diff_filespec *dst, - int contents_too) -{ - if (src->sha1_valid && dst->sha1_valid && - !hashcmp(src->sha1, dst->sha1)) - return 1; - if (!contents_too) - return 0; - if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1)) - return 0; - if (src->size != dst->size) - return 0; - if (src->sha1_valid && dst->sha1_valid) - return !hashcmp(src->sha1, dst->sha1); - if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0)) - return 0; - if (src->size == dst->size && - !memcmp(src->data, dst->data, src->size)) - return 1; - return 0; -} - static int basename_same(struct diff_filespec *src, struct diff_filespec *dst) { int src_len = strlen(src->path), dst_len = strlen(dst->path); @@ -242,56 +220,163 @@ static int score_compare(const void *a_, const void *b_) return b->score - a->score; } +struct file_similarity { + int src_dst, index; + struct diff_filespec *filespec; + struct file_similarity *next; +}; + +static int find_identical_files(struct file_similarity *src, + struct file_similarity *dst) +{ + int renames = 0; + + /* + * Walk over all the destinations ... + */ + do { + struct diff_filespec *one = dst->filespec; + struct file_similarity *p, *best; + int i = 100; + + /* + * .. to find the best source match + */ + best = NULL; + for (p = src; p; p = p->next) { + struct diff_filespec *two = p->filespec; + + /* False hash collission? */ + if (hashcmp(one->sha1, two->sha1)) + continue; + /* Non-regular files? If so, the modes must match! */ + if (!S_ISREG(one->mode) || !S_ISREG(two->mode)) { + if (one->mode != two->mode) + continue; + } + best = p; + if (basename_same(one, two)) + break; + + /* Too many identical alternatives? Pick one */ + if (!--i) + break; + } + if (best) { + record_rename_pair(dst->index, best->index, MAX_SCORE); + renames++; + } + } while ((dst = dst->next) != NULL); + return renames; +} + +/* + * Note: the rest of the rename logic depends on this + * phase also populating all the filespecs for any + * entry that isn't matched up with an exact rename. + */ +static void free_similarity_list(struct file_similarity *p) +{ + while (p) { + struct file_similarity *entry = p; + p = p->next; + + /* Stupid special case, see note above! */ + diff_populate_filespec(entry->filespec, 0); + free(entry); + } +} + +static int find_same_files(void *ptr) +{ + int ret; + struct file_similarity *p = ptr; + struct file_similarity *src = NULL, *dst = NULL; + + /* Split the hash list up into sources and destinations */ + do { + struct file_similarity *entry = p; + p = p->next; + if (entry->src_dst < 0) { + entry->next = src; + src = entry; + } else { + entry->next = dst; + dst = entry; + } + } while (p); + + /* + * If we have both sources *and* destinations, see if + * we can match them up + */ + ret = (src && dst) ? find_identical_files(src, dst) : 0; + + /* Free the hashes and return the number of renames found */ + free_similarity_list(src); + free_similarity_list(dst); + return ret; +} + +static unsigned int hash_filespec(struct diff_filespec *filespec) +{ + unsigned int hash; + if (!filespec->sha1_valid) { + if (diff_populate_filespec(filespec, 0)) + return 0; + hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1); + } + memcpy(&hash, filespec->sha1, sizeof(hash)); + return hash; +} + +static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec) +{ + void **pos; + unsigned int hash; + struct file_similarity *entry = xmalloc(sizeof(*entry)); + + entry->src_dst = src_dst; + entry->index = index; + entry->filespec = filespec; + entry->next = NULL; + + hash = hash_filespec(filespec); + pos = insert_hash(hash, entry, table); + + /* We already had an entry there? */ + if (pos) { + entry->next = *pos; + *pos = entry; + } +} + /* * Find exact renames first. * * The first round matches up the up-to-date entries, * and then during the second round we try to match * cache-dirty entries as well. - * - * Note: the rest of the rename logic depends on this - * phase also populating all the filespecs for any - * entry that isn't matched up with an exact rename, - * see "is_exact_match()". */ static int find_exact_renames(void) { - int rename_count = 0; - int contents_too; - - for (contents_too = 0; contents_too < 2; contents_too++) { - int i; - - for (i = 0; i < rename_dst_nr; i++) { - struct diff_filespec *two = rename_dst[i].two; - int j; - - if (rename_dst[i].pair) - continue; /* dealt with an earlier round */ - for (j = 0; j < rename_src_nr; j++) { - int k; - struct diff_filespec *one = rename_src[j].one; - if (!is_exact_match(one, two, contents_too)) - continue; + int i; + struct hash_table file_table; - /* see if there is a basename match, too */ - for (k = j; k < rename_src_nr; k++) { - one = rename_src[k].one; - if (basename_same(one, two) && - is_exact_match(one, two, - contents_too)) { - j = k; - break; - } - } - - record_rename_pair(i, j, (int)MAX_SCORE); - rename_count++; - break; /* we are done with this entry */ - } - } - } - return rename_count; + init_hash(&file_table); + for (i = 0; i < rename_src_nr; i++) + insert_file_table(&file_table, -1, i, rename_src[i].one); + + for (i = 0; i < rename_dst_nr; i++) + insert_file_table(&file_table, 1, i, rename_dst[i].two); + + /* Find the renames */ + i = for_each_hash(&file_table, find_same_files); + + /* .. and free the hash data structure */ + free_hash(&file_table); + + return i; } void diffcore_rename(struct diff_options *options) |