diff options
author | Junio C Hamano <gitster@pobox.com> | 2014-02-27 14:01:09 -0800 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2014-02-27 14:01:09 -0800 |
commit | d637d1b9a8fb765a8542e69bd2e04b3e229f663b (patch) | |
tree | eea008a78eacbc6afbfd793377a70a9642624221 | |
parent | 810273bc33b1f50191f90deef74277ee84174efd (diff) | |
parent | 7b359ea6b3333a87fd3fa8b84913f2b75ed244ad (diff) | |
download | git-d637d1b9a8fb765a8542e69bd2e04b3e229f663b.tar.gz |
Merge branch 'kb/fast-hashmap'
Improvements to our hash table to get it to meet the needs of the
msysgit fscache project, with some nice performance improvements.
* kb/fast-hashmap:
name-hash: retire unused index_name_exists()
hashmap.h: use 'unsigned int' for hash-codes everywhere
test-hashmap.c: drop unnecessary #includes
.gitignore: test-hashmap is a generated file
read-cache.c: fix memory leaks caused by removed cache entries
builtin/update-index.c: cleanup update_one
fix 'git update-index --verbose --again' output
remove old hash.[ch] implementation
name-hash.c: remove cache entries instead of marking them CE_UNHASHED
name-hash.c: use new hash map implementation for cache entries
name-hash.c: remove unreferenced directory entries
name-hash.c: use new hash map implementation for directories
diffcore-rename.c: use new hash map implementation
diffcore-rename.c: simplify finding exact renames
diffcore-rename.c: move code around to prepare for the next patch
buitin/describe.c: use new hash map implementation
add a hashtable implementation that supports O(1) removal
submodule: don't access the .gitmodules cache entry after removing it
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Documentation/technical/api-hash.txt | 52 | ||||
-rw-r--r-- | Documentation/technical/api-hashmap.txt | 235 | ||||
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | builtin/describe.c | 53 | ||||
-rw-r--r-- | builtin/rm.c | 2 | ||||
-rw-r--r-- | builtin/update-index.c | 39 | ||||
-rw-r--r-- | cache.h | 20 | ||||
-rw-r--r-- | diffcore-rename.c | 185 | ||||
-rw-r--r-- | hash.c | 110 | ||||
-rw-r--r-- | hash.h | 50 | ||||
-rw-r--r-- | hashmap.c | 228 | ||||
-rw-r--r-- | hashmap.h | 71 | ||||
-rw-r--r-- | name-hash.c | 163 | ||||
-rw-r--r-- | read-cache.c | 10 | ||||
-rw-r--r-- | resolve-undo.c | 7 | ||||
-rw-r--r-- | submodule.c | 25 | ||||
-rwxr-xr-x | t/t0011-hashmap.sh | 240 | ||||
-rw-r--r-- | test-hashmap.c | 255 | ||||
-rw-r--r-- | unpack-trees.c | 3 |
20 files changed, 1216 insertions, 538 deletions
diff --git a/.gitignore b/.gitignore index b5f9defed3..dc600f9b36 100644 --- a/.gitignore +++ b/.gitignore @@ -182,6 +182,7 @@ /test-dump-cache-tree /test-scrap-cache-tree /test-genrandom +/test-hashmap /test-index-version /test-line-buffer /test-match-trees diff --git a/Documentation/technical/api-hash.txt b/Documentation/technical/api-hash.txt deleted file mode 100644 index e5061e0677..0000000000 --- a/Documentation/technical/api-hash.txt +++ /dev/null @@ -1,52 +0,0 @@ -hash API -======== - -The hash API is a collection of simple hash table functions. Users are expected -to implement their own hashing. - -Data Structures ---------------- - -`struct hash_table`:: - - The hash table structure. The `array` member points to the hash table - entries. The `size` member counts the total number of valid and invalid - entries in the table. The `nr` member keeps track of the number of - valid entries. - -`struct hash_table_entry`:: - - An opaque structure representing an entry in the hash table. The `hash` - member is the entry's hash key and the `ptr` member is the entry's - value. - -Functions ---------- - -`init_hash`:: - - Initialize the hash table. - -`free_hash`:: - - Release memory associated with the hash table. - -`insert_hash`:: - - Insert a pointer into the hash table. If an entry with that hash - already exists, a pointer to the existing entry's value is returned. - Otherwise NULL is returned. This allows callers to implement - chaining, etc. - -`lookup_hash`:: - - Lookup an entry in the hash table. If an entry with that hash exists - the entry's value is returned. Otherwise NULL is returned. - -`for_each_hash`:: - - Call a function for each entry in the hash table. The function is - expected to take the entry's value as its only argument and return an - int. If the function returns a negative int the loop is aborted - immediately. Otherwise, the return value is accumulated and the sum - returned upon completion of the loop. diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt new file mode 100644 index 0000000000..42ca2347ed --- /dev/null +++ b/Documentation/technical/api-hashmap.txt @@ -0,0 +1,235 @@ +hashmap API +=========== + +The hashmap API is a generic implementation of hash-based key-value mappings. + +Data Structures +--------------- + +`struct hashmap`:: + + The hash table structure. ++ +The `size` member keeps track of the total number of entries. The `cmpfn` +member is a function used to compare two entries for equality. The `table` and +`tablesize` members store the hash table and its size, respectively. + +`struct hashmap_entry`:: + + An opaque structure representing an entry in the hash table, which must + be used as first member of user data structures. Ideally it should be + followed by an int-sized member to prevent unused memory on 64-bit + systems due to alignment. ++ +The `hash` member is the entry's hash code and the `next` member points to the +next entry in case of collisions (i.e. if multiple entries map to the same +bucket). + +`struct hashmap_iter`:: + + An iterator structure, to be used with hashmap_iter_* functions. + +Types +----- + +`int (*hashmap_cmp_fn)(const void *entry, const void *entry_or_key, const void *keydata)`:: + + User-supplied function to test two hashmap entries for equality. Shall + return 0 if the entries are equal. ++ +This function is always called with non-NULL `entry` / `entry_or_key` +parameters that have the same hash code. When looking up an entry, the `key` +and `keydata` parameters to hashmap_get and hashmap_remove are always passed +as second and third argument, respectively. Otherwise, `keydata` is NULL. + +Functions +--------- + +`unsigned int strhash(const char *buf)`:: +`unsigned int strihash(const char *buf)`:: +`unsigned int memhash(const void *buf, size_t len)`:: +`unsigned int memihash(const void *buf, size_t len)`:: + + Ready-to-use hash functions for strings, using the FNV-1 algorithm (see + http://www.isthe.com/chongo/tech/comp/fnv). ++ +`strhash` and `strihash` take 0-terminated strings, while `memhash` and +`memihash` operate on arbitrary-length memory. ++ +`strihash` and `memihash` are case insensitive versions. + +`void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`:: + + Initializes a hashmap structure. ++ +`map` is the hashmap to initialize. ++ +The `equals_function` can be specified to compare two entries for equality. +If NULL, entries are considered equal if their hash codes are equal. ++ +If the total number of entries is known in advance, the `initial_size` +parameter may be used to preallocate a sufficiently large table and thus +prevent expensive resizing. If 0, the table is dynamically resized. + +`void hashmap_free(struct hashmap *map, int free_entries)`:: + + Frees a hashmap structure and allocated memory. ++ +`map` is the hashmap to free. ++ +If `free_entries` is true, each hashmap_entry in the map is freed as well +(using stdlib's free()). + +`void hashmap_entry_init(void *entry, unsigned int hash)`:: + + Initializes a hashmap_entry structure. ++ +`entry` points to the entry to initialize. ++ +`hash` is the hash code of the entry. + +`void *hashmap_get(const struct hashmap *map, const void *key, const void *keydata)`:: + + Returns the hashmap entry for the specified key, or NULL if not found. ++ +`map` is the hashmap structure. ++ +`key` is a hashmap_entry structure (or user data structure that starts with +hashmap_entry) that has at least been initialized with the proper hash code +(via `hashmap_entry_init`). ++ +If an entry with matching hash code is found, `key` and `keydata` are passed +to `hashmap_cmp_fn` to decide whether the entry matches the key. + +`void *hashmap_get_next(const struct hashmap *map, const void *entry)`:: + + Returns the next equal hashmap entry, or NULL if not found. This can be + used to iterate over duplicate entries (see `hashmap_add`). ++ +`map` is the hashmap structure. ++ +`entry` is the hashmap_entry to start the search from, obtained via a previous +call to `hashmap_get` or `hashmap_get_next`. + +`void hashmap_add(struct hashmap *map, void *entry)`:: + + Adds a hashmap entry. This allows to add duplicate entries (i.e. + separate values with the same key according to hashmap_cmp_fn). ++ +`map` is the hashmap structure. ++ +`entry` is the entry to add. + +`void *hashmap_put(struct hashmap *map, void *entry)`:: + + Adds or replaces a hashmap entry. If the hashmap contains duplicate + entries equal to the specified entry, only one of them will be replaced. ++ +`map` is the hashmap structure. ++ +`entry` is the entry to add or replace. ++ +Returns the replaced entry, or NULL if not found (i.e. the entry was added). + +`void *hashmap_remove(struct hashmap *map, const void *key, const void *keydata)`:: + + Removes a hashmap entry matching the specified key. If the hashmap + contains duplicate entries equal to the specified key, only one of + them will be removed. ++ +`map` is the hashmap structure. ++ +`key` is a hashmap_entry structure (or user data structure that starts with +hashmap_entry) that has at least been initialized with the proper hash code +(via `hashmap_entry_init`). ++ +If an entry with matching hash code is found, `key` and `keydata` are +passed to `hashmap_cmp_fn` to decide whether the entry matches the key. ++ +Returns the removed entry, or NULL if not found. + +`void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter)`:: +`void *hashmap_iter_next(struct hashmap_iter *iter)`:: +`void *hashmap_iter_first(struct hashmap *map, struct hashmap_iter *iter)`:: + + Used to iterate over all entries of a hashmap. ++ +`hashmap_iter_init` initializes a `hashmap_iter` structure. ++ +`hashmap_iter_next` returns the next hashmap_entry, or NULL if there are no +more entries. ++ +`hashmap_iter_first` is a combination of both (i.e. initializes the iterator +and returns the first entry, if any). + +Usage example +------------- + +Here's a simple usage example that maps long keys to double values. +[source,c] +------------ +struct hashmap map; + +struct long2double { + struct hashmap_entry ent; /* must be the first member! */ + long key; + double value; +}; + +static int long2double_cmp(const struct long2double *e1, const struct long2double *e2, const void *unused) +{ + return !(e1->key == e2->key); +} + +void long2double_init(void) +{ + hashmap_init(&map, (hashmap_cmp_fn) long2double_cmp, 0); +} + +void long2double_free(void) +{ + hashmap_free(&map, 1); +} + +static struct long2double *find_entry(long key) +{ + struct long2double k; + hashmap_entry_init(&k, memhash(&key, sizeof(long))); + k.key = key; + return hashmap_get(&map, &k, NULL); +} + +double get_value(long key) +{ + struct long2double *e = find_entry(key); + return e ? e->value : 0; +} + +void set_value(long key, double value) +{ + struct long2double *e = find_entry(key); + if (!e) { + e = malloc(sizeof(struct long2double)); + hashmap_entry_init(e, memhash(&key, sizeof(long))); + e->key = key; + hashmap_add(&map, e); + } + e->value = value; +} +------------ + +Using variable-sized keys +------------------------- + +The `hashmap_entry_get` and `hashmap_entry_remove` functions expect an ordinary +`hashmap_entry` structure as key to find the correct entry. If the key data is +variable-sized (e.g. a FLEX_ARRAY string) or quite large, it is undesirable +to create a full-fledged entry structure on the heap and copy all the key data +into the structure. + +In this case, the `keydata` parameter can be used to pass +variable-sized key data directly to the comparison function, and the `key` +parameter can be a stripped-down, fixed size entry structure allocated on the +stack. + +See test-hashmap.c for an example using arbitrary-length strings as keys. @@ -555,6 +555,7 @@ TEST_PROGRAMS_NEED_X += test-date TEST_PROGRAMS_NEED_X += test-delta TEST_PROGRAMS_NEED_X += test-dump-cache-tree TEST_PROGRAMS_NEED_X += test-genrandom +TEST_PROGRAMS_NEED_X += test-hashmap TEST_PROGRAMS_NEED_X += test-index-version TEST_PROGRAMS_NEED_X += test-line-buffer TEST_PROGRAMS_NEED_X += test-match-trees @@ -671,7 +672,7 @@ LIB_H += git-compat-util.h LIB_H += gpg-interface.h LIB_H += graph.h LIB_H += grep.h -LIB_H += hash.h +LIB_H += hashmap.h LIB_H += help.h LIB_H += http.h LIB_H += kwset.h @@ -802,7 +803,7 @@ LIB_OBJS += gettext.o LIB_OBJS += gpg-interface.o LIB_OBJS += graph.o LIB_OBJS += grep.o -LIB_OBJS += hash.o +LIB_OBJS += hashmap.o LIB_OBJS += help.o LIB_OBJS += hex.o LIB_OBJS += ident.o diff --git a/builtin/describe.c b/builtin/describe.c index 7db43dae1b..dadd999c41 100644 --- a/builtin/describe.c +++ b/builtin/describe.c @@ -6,7 +6,7 @@ #include "exec_cmd.h" #include "parse-options.h" #include "diff.h" -#include "hash.h" +#include "hashmap.h" #include "argv-array.h" #define SEEN (1u << 0) @@ -25,7 +25,7 @@ static int longformat; static int first_parent; static int abbrev = -1; /* unspecified */ static int max_candidates = 10; -static struct hash_table names; +static struct hashmap names; static int have_util; static const char *pattern; static int always; @@ -37,7 +37,7 @@ static const char *diff_index_args[] = { }; struct commit_name { - struct commit_name *next; + struct hashmap_entry entry; unsigned char peeled[20]; struct tag *tag; unsigned prio:2; /* annotated tag = 2, tag = 1, head = 0 */ @@ -50,6 +50,12 @@ static const char *prio_names[] = { "head", "lightweight", "annotated", }; +static int commit_name_cmp(const struct commit_name *cn1, + const struct commit_name *cn2, const void *peeled) +{ + return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled); +} + static inline unsigned int hash_sha1(const unsigned char *sha1) { unsigned int hash; @@ -59,21 +65,9 @@ static inline unsigned int hash_sha1(const unsigned char *sha1) static inline struct commit_name *find_commit_name(const unsigned char *peeled) { - struct commit_name *n = lookup_hash(hash_sha1(peeled), &names); - while (n && !!hashcmp(peeled, n->peeled)) - n = n->next; - return n; -} - -static int set_util(void *chain, void *data) -{ - struct commit_name *n; - for (n = chain; n; n = n->next) { - struct commit *c = lookup_commit_reference_gently(n->peeled, 1); - if (c) - c->util = n; - } - return 0; + struct commit_name key; + hashmap_entry_init(&key, hash_sha1(peeled)); + return hashmap_get(&names, &key, peeled); } static int replace_name(struct commit_name *e, @@ -118,16 +112,10 @@ static void add_to_known_names(const char *path, struct tag *tag = NULL; if (replace_name(e, prio, sha1, &tag)) { if (!e) { - void **pos; e = xmalloc(sizeof(struct commit_name)); hashcpy(e->peeled, peeled); - pos = insert_hash(hash_sha1(peeled), e, &names); - if (pos) { - e->next = *pos; - *pos = e; - } else { - e->next = NULL; - } + hashmap_entry_init(e, hash_sha1(peeled)); + hashmap_add(&names, e); e->path = NULL; } e->tag = tag; @@ -292,7 +280,14 @@ static void describe(const char *arg, int last_one) fprintf(stderr, _("searching to describe %s\n"), arg); if (!have_util) { - for_each_hash(&names, set_util, NULL); + struct hashmap_iter iter; + struct commit *c; + struct commit_name *n = hashmap_iter_first(&names, &iter); + for (; n; n = hashmap_iter_next(&iter)) { + c = lookup_commit_reference_gently(n->peeled, 1); + if (c) + c->util = n; + } have_util = 1; } @@ -463,9 +458,9 @@ int cmd_describe(int argc, const char **argv, const char *prefix) return cmd_name_rev(args.argc, args.argv, prefix); } - init_hash(&names); + hashmap_init(&names, (hashmap_cmp_fn) commit_name_cmp, 0); for_each_rawref(get_name, NULL); - if (!names.nr && !always) + if (!names.size && !always) die(_("No names found, cannot describe anything.")); if (argc == 0) { diff --git a/builtin/rm.c b/builtin/rm.c index 3a0e0eaab7..171f37c1cc 100644 --- a/builtin/rm.c +++ b/builtin/rm.c @@ -311,7 +311,7 @@ int cmd_rm(int argc, const char **argv, const char *prefix) if (!match_pathspec_depth(&pathspec, ce->name, ce_namelen(ce), 0, seen)) continue; ALLOC_GROW(list.entry, list.nr + 1, list.alloc); - list.entry[list.nr].name = ce->name; + list.entry[list.nr].name = xstrdup(ce->name); list.entry[list.nr].is_submodule = S_ISGITLINK(ce->ce_mode); if (list.entry[list.nr++].is_submodule && !is_staging_gitmodules_ok()) diff --git a/builtin/update-index.c b/builtin/update-index.c index e3a10d706d..00313f373a 100644 --- a/builtin/update-index.c +++ b/builtin/update-index.c @@ -274,36 +274,32 @@ static void chmod_path(int flip, const char *path) die("git update-index: cannot chmod %cx '%s'", flip, path); } -static void update_one(const char *path, const char *prefix, int prefix_length) +static void update_one(const char *path) { - const char *p = prefix_path(prefix, prefix_length, path); - if (!verify_path(p)) { + if (!verify_path(path)) { fprintf(stderr, "Ignoring path %s\n", path); - goto free_return; + return; } if (mark_valid_only) { - if (mark_ce_flags(p, CE_VALID, mark_valid_only == MARK_FLAG)) + if (mark_ce_flags(path, CE_VALID, mark_valid_only == MARK_FLAG)) die("Unable to mark file %s", path); - goto free_return; + return; } if (mark_skip_worktree_only) { - if (mark_ce_flags(p, CE_SKIP_WORKTREE, mark_skip_worktree_only == MARK_FLAG)) + if (mark_ce_flags(path, CE_SKIP_WORKTREE, mark_skip_worktree_only == MARK_FLAG)) die("Unable to mark file %s", path); - goto free_return; + return; } if (force_remove) { - if (remove_file_from_cache(p)) + if (remove_file_from_cache(path)) die("git update-index: unable to remove %s", path); report("remove '%s'", path); - goto free_return; + return; } - if (process_path(p)) + if (process_path(path)) die("Unable to process path %s", path); report("add '%s'", path); - free_return: - if (p < path || p > path + strlen(path)) - free((char *)p); } static void read_index_info(int line_termination) @@ -563,6 +559,7 @@ static int do_reupdate(int ac, const char **av, const struct cache_entry *ce = active_cache[pos]; struct cache_entry *old = NULL; int save_nr; + char *path; if (ce_stage(ce) || !ce_path_match(ce, &pathspec)) continue; @@ -579,7 +576,9 @@ static int do_reupdate(int ac, const char **av, * or worse yet 'allow_replace', active_nr may decrease. */ save_nr = active_nr; - update_one(ce->name + prefix_length, prefix, prefix_length); + path = xstrdup(ce->name); + update_one(path); + free(path); if (save_nr != active_nr) goto redo; } @@ -836,11 +835,10 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) setup_work_tree(); p = prefix_path(prefix, prefix_length, path); - update_one(p, NULL, 0); + update_one(p); if (set_executable_bit) chmod_path(set_executable_bit, p); - if (p < path || p > path + strlen(path)) - free((char *)p); + free((char *)p); ctx.argc--; ctx.argv++; break; @@ -879,11 +877,10 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) strbuf_swap(&buf, &nbuf); } p = prefix_path(prefix, prefix_length, buf.buf); - update_one(p, NULL, 0); + update_one(p); if (set_executable_bit) chmod_path(set_executable_bit, p); - if (p < buf.buf || p > buf.buf + buf.len) - free((char *)p); + free((char *)p); } strbuf_release(&nbuf); strbuf_release(&buf); @@ -3,7 +3,7 @@ #include "git-compat-util.h" #include "strbuf.h" -#include "hash.h" +#include "hashmap.h" #include "advice.h" #include "gettext.h" #include "convert.h" @@ -130,12 +130,12 @@ struct stat_data { }; struct cache_entry { + struct hashmap_entry ent; struct stat_data ce_stat_data; unsigned int ce_mode; unsigned int ce_flags; unsigned int ce_namelen; unsigned char sha1[20]; - struct cache_entry *next; char name[FLEX_ARRAY]; /* more */ }; @@ -159,7 +159,6 @@ struct cache_entry { #define CE_ADDED (1 << 19) #define CE_HASHED (1 << 20) -#define CE_UNHASHED (1 << 21) #define CE_WT_REMOVE (1 << 22) /* remove in work directory */ #define CE_CONFLICTED (1 << 23) @@ -195,17 +194,18 @@ struct pathspec; * Copy the sha1 and stat state of a cache entry from one to * another. But we never change the name, or the hash state! */ -#define CE_STATE_MASK (CE_HASHED | CE_UNHASHED) static inline void copy_cache_entry(struct cache_entry *dst, const struct cache_entry *src) { - unsigned int state = dst->ce_flags & CE_STATE_MASK; + unsigned int state = dst->ce_flags & CE_HASHED; /* Don't copy hash chain and name */ - memcpy(dst, src, offsetof(struct cache_entry, next)); + memcpy(&dst->ce_stat_data, &src->ce_stat_data, + offsetof(struct cache_entry, name) - + offsetof(struct cache_entry, ce_stat_data)); /* Restore the hash state */ - dst->ce_flags = (dst->ce_flags & ~CE_STATE_MASK) | state; + dst->ce_flags = (dst->ce_flags & ~CE_HASHED) | state; } static inline unsigned create_ce_flags(unsigned stage) @@ -277,8 +277,8 @@ struct index_state { struct cache_time timestamp; unsigned name_hash_initialized : 1, initialized : 1; - struct hash_table name_hash; - struct hash_table dir_hash; + struct hashmap name_hash; + struct hashmap dir_hash; }; extern struct index_state the_index; @@ -316,7 +316,6 @@ extern void free_name_hash(struct index_state *istate); #define ce_modified(ce, st, options) ie_modified(&the_index, (ce), (st), (options)) #define cache_dir_exists(name, namelen) index_dir_exists(&the_index, (name), (namelen)) #define cache_file_exists(name, namelen, igncase) index_file_exists(&the_index, (name), (namelen), (igncase)) -#define cache_name_exists(name, namelen, igncase) index_name_exists(&the_index, (name), (namelen), (igncase)) #define cache_name_is_other(name, namelen) index_name_is_other(&the_index, (name), (namelen)) #define resolve_undo_clear() resolve_undo_clear_index(&the_index) #define unmerge_cache_entry_at(at) unmerge_index_entry_at(&the_index, at) @@ -467,7 +466,6 @@ extern int unmerged_index(const struct index_state *); extern int verify_path(const char *path); extern struct cache_entry *index_dir_exists(struct index_state *istate, const char *name, int namelen); extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase); -extern struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int igncase); extern int index_name_pos(const struct index_state *, const char *name, int namelen); #define ADD_CACHE_OK_TO_ADD 1 /* Ok to add */ #define ADD_CACHE_OK_TO_REPLACE 2 /* Ok to replace file/directory */ diff --git a/diffcore-rename.c b/diffcore-rename.c index 6c7a72fbe7..9b4f068eb3 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -4,7 +4,7 @@ #include "cache.h" #include "diff.h" #include "diffcore.h" -#include "hash.h" +#include "hashmap.h" #include "progress.h" /* Table of rename/copy destinations */ @@ -243,137 +243,82 @@ static int score_compare(const void *a_, const void *b_) } struct file_similarity { - int src_dst, index; + struct hashmap_entry entry; + int index; struct diff_filespec *filespec; - struct file_similarity *next; }; -static int find_identical_files(struct file_similarity *src, - struct file_similarity *dst, +static unsigned int hash_filespec(struct diff_filespec *filespec) +{ + unsigned int hash; + if (!filespec->sha1_valid) { + if (diff_populate_filespec(filespec, 0)) + return 0; + hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1); + } + memcpy(&hash, filespec->sha1, sizeof(hash)); + return hash; +} + +static int find_identical_files(struct hashmap *srcs, + int dst_index, struct diff_options *options) { int renames = 0; + struct diff_filespec *target = rename_dst[dst_index].two; + struct file_similarity *p, *best, dst; + int i = 100, best_score = -1; + /* - * Walk over all the destinations ... + * Find the best source match for specified destination. */ - do { - struct diff_filespec *target = dst->filespec; - struct file_similarity *p, *best; - int i = 100, best_score = -1; - - /* - * .. to find the best source match - */ - best = NULL; - for (p = src; p; p = p->next) { - int score; - struct diff_filespec *source = p->filespec; - - /* False hash collision? */ - if (hashcmp(source->sha1, target->sha1)) - continue; - /* Non-regular files? If so, the modes must match! */ - if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) { - if (source->mode != target->mode) - continue; - } - /* Give higher scores to sources that haven't been used already */ - score = !source->rename_used; - if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY) + best = NULL; + hashmap_entry_init(&dst, hash_filespec(target)); + for (p = hashmap_get(srcs, &dst, NULL); p; p = hashmap_get_next(srcs, p)) { + int score; + struct diff_filespec *source = p->filespec; + + /* False hash collision? */ + if (hashcmp(source->sha1, target->sha1)) + continue; + /* Non-regular files? If so, the modes must match! */ + if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) { + if (source->mode != target->mode) continue; - score += basename_same(source, target); - if (score > best_score) { - best = p; - best_score = score; - if (score == 2) - break; - } - - /* Too many identical alternatives? Pick one */ - if (!--i) - break; } - if (best) { - record_rename_pair(dst->index, best->index, MAX_SCORE); - renames++; + /* Give higher scores to sources that haven't been used already */ + score = !source->rename_used; + if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY) + continue; + score += basename_same(source, target); + if (score > best_score) { + best = p; + best_score = score; + if (score == 2) + break; } - } while ((dst = dst->next) != NULL); - return renames; -} -static void free_similarity_list(struct file_similarity *p) -{ - while (p) { - struct file_similarity *entry = p; - p = p->next; - free(entry); + /* Too many identical alternatives? Pick one */ + if (!--i) + break; } -} - -static int find_same_files(void *ptr, void *data) -{ - int ret; - struct file_similarity *p = ptr; - struct file_similarity *src = NULL, *dst = NULL; - struct diff_options *options = data; - - /* Split the hash list up into sources and destinations */ - do { - struct file_similarity *entry = p; - p = p->next; - if (entry->src_dst < 0) { - entry->next = src; - src = entry; - } else { - entry->next = dst; - dst = entry; - } - } while (p); - - /* - * If we have both sources *and* destinations, see if - * we can match them up - */ - ret = (src && dst) ? find_identical_files(src, dst, options) : 0; - - /* Free the hashes and return the number of renames found */ - free_similarity_list(src); - free_similarity_list(dst); - return ret; -} - -static unsigned int hash_filespec(struct diff_filespec *filespec) -{ - unsigned int hash; - if (!filespec->sha1_valid) { - if (diff_populate_filespec(filespec, 0)) - return 0; - hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1); + if (best) { + record_rename_pair(dst_index, best->index, MAX_SCORE); + renames++; } - memcpy(&hash, filespec->sha1, sizeof(hash)); - return hash; + return renames; } -static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec) +static void insert_file_table(struct hashmap *table, int index, struct diff_filespec *filespec) { - void **pos; - unsigned int hash; struct file_similarity *entry = xmalloc(sizeof(*entry)); - entry->src_dst = src_dst; entry->index = index; entry->filespec = filespec; - entry->next = NULL; - - hash = hash_filespec(filespec); - pos = insert_hash(hash, entry, table); - /* We already had an entry there? */ - if (pos) { - entry->next = *pos; - *pos = entry; - } + hashmap_entry_init(entry, hash_filespec(filespec)); + hashmap_add(table, entry); } /* @@ -385,24 +330,22 @@ static void insert_file_table(struct hash_table *table, int src_dst, int index, */ static int find_exact_renames(struct diff_options *options) { - int i; - struct hash_table file_table; + int i, renames = 0; + struct hashmap file_table; - init_hash(&file_table); - preallocate_hash(&file_table, rename_src_nr + rename_dst_nr); + /* Add all sources to the hash table */ + hashmap_init(&file_table, NULL, rename_src_nr); for (i = 0; i < rename_src_nr; i++) - insert_file_table(&file_table, -1, i, rename_src[i].p->one); + insert_file_table(&file_table, i, rename_src[i].p->one); + /* Walk the destinations and find best source match */ for (i = 0; i < rename_dst_nr; i++) - insert_file_table(&file_table, 1, i, rename_dst[i].two); - - /* Find the renames */ - i = for_each_hash(&file_table, find_same_files, options); + renames += find_identical_files(&file_table, i, options); - /* .. and free the hash data structure */ - free_hash(&file_table); + /* Free the hash data structure and entries */ + hashmap_free(&file_table, 1); - return i; + return renames; } #define NUM_CANDIDATE_PER_DST 4 diff --git a/hash.c b/hash.c deleted file mode 100644 index 749ecfe484..0000000000 --- a/hash.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Some generic hashing helpers. - */ -#include "cache.h" -#include "hash.h" - -/* - * Look up a hash entry in the hash table. Return the pointer to - * the existing entry, or the empty slot if none existed. The caller - * can then look at the (*ptr) to see whether it existed or not. - */ -static struct hash_table_entry *lookup_hash_entry(unsigned int hash, const struct hash_table *table) -{ - unsigned int size = table->size, nr = hash % size; - struct hash_table_entry *array = table->array; - - while (array[nr].ptr) { - if (array[nr].hash == hash) - break; - nr++; - if (nr >= size) - nr = 0; - } - return array + nr; -} - - -/* - * Insert a new hash entry pointer into the table. - * - * If that hash entry already existed, return the pointer to - * the existing entry (and the caller can create a list of the - * pointers or do anything else). If it didn't exist, return - * NULL (and the caller knows the pointer has been inserted). - */ -static void **insert_hash_entry(unsigned int hash, void *ptr, struct hash_table *table) -{ - struct hash_table_entry *entry = lookup_hash_entry(hash, table); - - if (!entry->ptr) { - entry->ptr = ptr; - entry->hash = hash; - table->nr++; - return NULL; - } - return &entry->ptr; -} - -static void grow_hash_table(struct hash_table *table) -{ - unsigned int i; - unsigned int old_size = table->size, new_size; - struct hash_table_entry *old_array = table->array, *new_array; - - new_size = alloc_nr(old_size); - new_array = xcalloc(sizeof(struct hash_table_entry), new_size); - table->size = new_size; - table->array = new_array; - table->nr = 0; - for (i = 0; i < old_size; i++) { - unsigned int hash = old_array[i].hash; - void *ptr = old_array[i].ptr; - if (ptr) - insert_hash_entry(hash, ptr, table); - } - free(old_array); -} - -void *lookup_hash(unsigned int hash, const struct hash_table *table) -{ - if (!table->array) - return NULL; - return lookup_hash_entry(hash, table)->ptr; -} - -void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table) -{ - unsigned int nr = table->nr; - if (nr >= table->size/2) - grow_hash_table(table); - return insert_hash_entry(hash, ptr, table); -} - -int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data) -{ - int sum = 0; - unsigned int i; - unsigned int size = table->size; - struct hash_table_entry *array = table->array; - - for (i = 0; i < size; i++) { - void *ptr = array->ptr; - array++; - if (ptr) { - int val = fn(ptr, data); - if (val < 0) - return val; - sum += val; - } - } - return sum; -} - -void free_hash(struct hash_table *table) -{ - free(table->array); - table->array = NULL; - table->size = 0; - table->nr = 0; -} diff --git a/hash.h b/hash.h deleted file mode 100644 index 1d43ac0ba0..0000000000 --- a/hash.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef HASH_H -#define HASH_H - -/* - * These are some simple generic hash table helper functions. - * Not necessarily suitable for all users, but good for things - * where you want to just keep track of a list of things, and - * have a good hash to use on them. - * - * It keeps the hash table at roughly 50-75% free, so the memory - * cost of the hash table itself is roughly - * - * 3 * 2*sizeof(void *) * nr_of_objects - * - * bytes. - * - * FIXME: on 64-bit architectures, we waste memory. It would be - * good to have just 32-bit pointers, requiring a special allocator - * for hashed entries or something. - */ -struct hash_table_entry { - unsigned int hash; - void *ptr; -}; - -struct hash_table { - unsigned int size, nr; - struct hash_table_entry *array; -}; - -extern void *lookup_hash(unsigned int hash, const struct hash_table *table); -extern void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table); -extern int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data); -extern void free_hash(struct hash_table *table); - -static inline void init_hash(struct hash_table *table) -{ - table->size = 0; - table->nr = 0; - table->array = NULL; -} - -static inline void preallocate_hash(struct hash_table *table, unsigned int elts) -{ - assert(table->size == 0 && table->nr == 0 && table->array == NULL); - table->size = elts * 2; - table->array = xcalloc(sizeof(struct hash_table_entry), table->size); -} - -#endif diff --git a/hashmap.c b/hashmap.c new file mode 100644 index 0000000000..d1b8056d8d --- /dev/null +++ b/hashmap.c @@ -0,0 +1,228 @@ +/* + * Generic implementation of hash-based key value mappings. + */ +#include "cache.h" +#include "hashmap.h" + +#define FNV32_BASE ((unsigned int) 0x811c9dc5) +#define FNV32_PRIME ((unsigned int) 0x01000193) + +unsigned int strhash(const char *str) +{ + unsigned int c, hash = FNV32_BASE; + while ((c = (unsigned char) *str++)) + hash = (hash * FNV32_PRIME) ^ c; + return hash; +} + +unsigned int strihash(const char *str) +{ + unsigned int c, hash = FNV32_BASE; + while ((c = (unsigned char) *str++)) { + if (c >= 'a' && c <= 'z') + c -= 'a' - 'A'; + hash = (hash * FNV32_PRIME) ^ c; + } + return hash; +} + +unsigned int memhash(const void *buf, size_t len) +{ + unsigned int hash = FNV32_BASE; + unsigned char *ucbuf = (unsigned char *) buf; + while (len--) { + unsigned int c = *ucbuf++; + hash = (hash * FNV32_PRIME) ^ c; + } + return hash; +} + +unsigned int memihash(const void *buf, size_t len) +{ + unsigned int hash = FNV32_BASE; + unsigned char *ucbuf = (unsigned char *) buf; + while (len--) { + unsigned int c = *ucbuf++; + if (c >= 'a' && c <= 'z') + c -= 'a' - 'A'; + hash = (hash * FNV32_PRIME) ^ c; + } + return hash; +} + +#define HASHMAP_INITIAL_SIZE 64 +/* grow / shrink by 2^2 */ +#define HASHMAP_RESIZE_BITS 2 +/* load factor in percent */ +#define HASHMAP_LOAD_FACTOR 80 + +static void alloc_table(struct hashmap *map, unsigned int size) +{ + map->tablesize = size; + map->table = xcalloc(size, sizeof(struct hashmap_entry *)); + + /* calculate resize thresholds for new size */ + map->grow_at = (unsigned int) ((uint64_t) size * HASHMAP_LOAD_FACTOR / 100); + if (size <= HASHMAP_INITIAL_SIZE) + map->shrink_at = 0; + else + /* + * The shrink-threshold must be slightly smaller than + * (grow-threshold / resize-factor) to prevent erratic resizing, + * thus we divide by (resize-factor + 1). + */ + map->shrink_at = map->grow_at / ((1 << HASHMAP_RESIZE_BITS) + 1); +} + +static inline int entry_equals(const struct hashmap *map, + const struct hashmap_entry *e1, const struct hashmap_entry *e2, + const void *keydata) +{ + return (e1 == e2) || (e1->hash == e2->hash && !map->cmpfn(e1, e2, keydata)); +} + +static inline unsigned int bucket(const struct hashmap *map, + const struct hashmap_entry *key) +{ + return key->hash & (map->tablesize - 1); +} + +static void rehash(struct hashmap *map, unsigned int newsize) +{ + unsigned int i, oldsize = map->tablesize; + struct hashmap_entry **oldtable = map->table; + + alloc_table(map, newsize); + for (i = 0; i < oldsize; i++) { + struct hashmap_entry *e = oldtable[i]; + while (e) { + struct hashmap_entry *next = e->next; + unsigned int b = bucket(map, e); + e->next = map->table[b]; + map->table[b] = e; + e = next; + } + } + free(oldtable); +} + +static inline struct hashmap_entry **find_entry_ptr(const struct hashmap *map, + const struct hashmap_entry *key, const void *keydata) +{ + struct hashmap_entry **e = &map->table[bucket(map, key)]; + while (*e && !entry_equals(map, *e, key, keydata)) + e = &(*e)->next; + return e; +} + +static int always_equal(const void *unused1, const void *unused2, const void *unused3) +{ + return 0; +} + +void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, + size_t initial_size) +{ + unsigned int size = HASHMAP_INITIAL_SIZE; + map->size = 0; + map->cmpfn = equals_function ? equals_function : always_equal; + + /* calculate initial table size and allocate the table */ + initial_size = (unsigned int) ((uint64_t) initial_size * 100 + / HASHMAP_LOAD_FACTOR); + while (initial_size > size) + size <<= HASHMAP_RESIZE_BITS; + alloc_table(map, size); +} + +void hashmap_free(struct hashmap *map, int free_entries) +{ + if (!map || !map->table) + return; + if (free_entries) { + struct hashmap_iter iter; + struct hashmap_entry *e; + hashmap_iter_init(map, &iter); + while ((e = hashmap_iter_next(&iter))) + free(e); + } + free(map->table); + memset(map, 0, sizeof(*map)); +} + +void *hashmap_get(const struct hashmap *map, const void *key, const void *keydata) +{ + return *find_entry_ptr(map, key, keydata); +} + +void *hashmap_get_next(const struct hashmap *map, const void *entry) +{ + struct hashmap_entry *e = ((struct hashmap_entry *) entry)->next; + for (; e; e = e->next) + if (entry_equals(map, entry, e, NULL)) + return e; + return NULL; +} + +void hashmap_add(struct hashmap *map, void *entry) +{ + unsigned int b = bucket(map, entry); + + /* add entry */ + ((struct hashmap_entry *) entry)->next = map->table[b]; + map->table[b] = entry; + + /* fix size and rehash if appropriate */ + map->size++; + if (map->size > map->grow_at) + rehash(map, map->tablesize << HASHMAP_RESIZE_BITS); +} + +void *hashmap_remove(struct hashmap *map, const void *key, const void *keydata) +{ + struct hashmap_entry *old; + struct hashmap_entry **e = find_entry_ptr(map, key, keydata); + if (!*e) + return NULL; + + /* remove existing entry */ + old = *e; + *e = old->next; + old->next = NULL; + + /* fix size and rehash if appropriate */ + map->size--; + if (map->size < map->shrink_at) + rehash(map, map->tablesize >> HASHMAP_RESIZE_BITS); + return old; +} + +void *hashmap_put(struct hashmap *map, void *entry) +{ + struct hashmap_entry *old = hashmap_remove(map, entry, NULL); + hashmap_add(map, entry); + return old; +} + +void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter) +{ + iter->map = map; + iter->tablepos = 0; + iter->next = NULL; +} + +void *hashmap_iter_next(struct hashmap_iter *iter) +{ + struct hashmap_entry *current = iter->next; + for (;;) { + if (current) { + iter->next = current->next; + return current; + } + + if (iter->tablepos >= iter->map->tablesize) + return NULL; + + current = iter->map->table[iter->tablepos++]; + } +} diff --git a/hashmap.h b/hashmap.h new file mode 100644 index 0000000000..a816ad47b1 --- /dev/null +++ b/hashmap.h @@ -0,0 +1,71 @@ +#ifndef HASHMAP_H +#define HASHMAP_H + +/* + * Generic implementation of hash-based key-value mappings. + * See Documentation/technical/api-hashmap.txt. + */ + +/* FNV-1 functions */ + +extern unsigned int strhash(const char *buf); +extern unsigned int strihash(const char *buf); +extern unsigned int memhash(const void *buf, size_t len); +extern unsigned int memihash(const void *buf, size_t len); + +/* data structures */ + +struct hashmap_entry { + struct hashmap_entry *next; + unsigned int hash; +}; + +typedef int (*hashmap_cmp_fn)(const void *entry, const void *entry_or_key, + const void *keydata); + +struct hashmap { + struct hashmap_entry **table; + hashmap_cmp_fn cmpfn; + unsigned int size, tablesize, grow_at, shrink_at; +}; + +struct hashmap_iter { + struct hashmap *map; + struct hashmap_entry *next; + unsigned int tablepos; +}; + +/* hashmap functions */ + +extern void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, + size_t initial_size); +extern void hashmap_free(struct hashmap *map, int free_entries); + +/* hashmap_entry functions */ + +static inline void hashmap_entry_init(void *entry, unsigned int hash) +{ + struct hashmap_entry *e = entry; + e->hash = hash; + e->next = NULL; +} +extern void *hashmap_get(const struct hashmap *map, const void *key, + const void *keydata); +extern void *hashmap_get_next(const struct hashmap *map, const void *entry); +extern void hashmap_add(struct hashmap *map, void *entry); +extern void *hashmap_put(struct hashmap *map, void *entry); +extern void *hashmap_remove(struct hashmap *map, const void *key, + const void *keydata); + +/* hashmap_iter functions */ + +extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter); +extern void *hashmap_iter_next(struct hashmap_iter *iter); +static inline void *hashmap_iter_first(struct hashmap *map, + struct hashmap_iter *iter) +{ + hashmap_iter_init(map, iter); + return hashmap_iter_next(iter); +} + +#endif diff --git a/name-hash.c b/name-hash.c index e5b6e1ad23..97444d0201 100644 --- a/name-hash.c +++ b/name-hash.c @@ -8,49 +8,28 @@ #define NO_THE_INDEX_COMPATIBILITY_MACROS #include "cache.h" -/* - * This removes bit 5 if bit 6 is set. - * - * That will make US-ASCII characters hash to their upper-case - * equivalent. We could easily do this one whole word at a time, - * but that's for future worries. - */ -static inline unsigned char icase_hash(unsigned char c) -{ - return c & ~((c & 0x40) >> 1); -} - -static unsigned int hash_name(const char *name, int namelen) -{ - unsigned int hash = 0x123; - - while (namelen--) { - unsigned char c = *name++; - c = icase_hash(c); - hash = hash*101 + c; - } - return hash; -} - struct dir_entry { - struct dir_entry *next; + struct hashmap_entry ent; struct dir_entry *parent; struct cache_entry *ce; int nr; unsigned int namelen; }; +static int dir_entry_cmp(const struct dir_entry *e1, + const struct dir_entry *e2, const char *name) +{ + return e1->namelen != e2->namelen || strncasecmp(e1->ce->name, + name ? name : e2->ce->name, e1->namelen); +} + static struct dir_entry *find_dir_entry(struct index_state *istate, const char *name, unsigned int namelen) { - unsigned int hash = hash_name(name, namelen); - struct dir_entry *dir; - - for (dir = lookup_hash(hash, &istate->dir_hash); dir; dir = dir->next) - if (dir->namelen == namelen && - !strncasecmp(dir->ce->name, name, namelen)) - return dir; - return NULL; + struct dir_entry key; + hashmap_entry_init(&key, memihash(name, namelen)); + key.namelen = namelen; + return hashmap_get(&istate->dir_hash, &key, name); } static struct dir_entry *hash_dir_entry(struct index_state *istate, @@ -84,18 +63,11 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, dir = find_dir_entry(istate, ce->name, namelen); if (!dir) { /* not found, create it and add to hash table */ - void **pdir; - unsigned int hash = hash_name(ce->name, namelen); - dir = xcalloc(1, sizeof(struct dir_entry)); + hashmap_entry_init(dir, memihash(ce->name, namelen)); dir->namelen = namelen; dir->ce = ce; - - pdir = insert_hash(hash, dir, &istate->dir_hash); - if (pdir) { - dir->next = *pdir; - *pdir = dir; - } + hashmap_add(&istate->dir_hash, dir); /* recursively add missing parent directories */ dir->parent = hash_dir_entry(istate, ce, namelen); @@ -114,45 +86,50 @@ static void add_dir_entry(struct index_state *istate, struct cache_entry *ce) static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) { /* - * Release reference to the directory entry (and parents if 0). - * - * Note: we do not remove / free the entry because there's no - * hash.[ch]::remove_hash and dir->next may point to other entries - * that are still valid, so we must not free the memory. + * Release reference to the directory entry. If 0, remove and continue + * with parent directory. */ struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); - while (dir && dir->nr && !(--dir->nr)) - dir = dir->parent; + while (dir && !(--dir->nr)) { + struct dir_entry *parent = dir->parent; + hashmap_remove(&istate->dir_hash, dir, NULL); + free(dir); + dir = parent; + } } static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) { - void **pos; - unsigned int hash; - if (ce->ce_flags & CE_HASHED) return; ce->ce_flags |= CE_HASHED; - ce->next = NULL; - hash = hash_name(ce->name, ce_namelen(ce)); - pos = insert_hash(hash, ce, &istate->name_hash); - if (pos) { - ce->next = *pos; - *pos = ce; - } + hashmap_entry_init(ce, memihash(ce->name, ce_namelen(ce))); + hashmap_add(&istate->name_hash, ce); - if (ignore_case && !(ce->ce_flags & CE_UNHASHED)) + if (ignore_case) add_dir_entry(istate, ce); } +static int cache_entry_cmp(const struct cache_entry *ce1, + const struct cache_entry *ce2, const void *remove) +{ + /* + * For remove_name_hash, find the exact entry (pointer equality); for + * index_file_exists, find all entries with matching hash code and + * decide whether the entry matches in same_name. + */ + return remove ? !(ce1 == ce2) : 0; +} + static void lazy_init_name_hash(struct index_state *istate) { int nr; if (istate->name_hash_initialized) return; - if (istate->cache_nr) - preallocate_hash(&istate->name_hash, istate->cache_nr); + hashmap_init(&istate->name_hash, (hashmap_cmp_fn) cache_entry_cmp, + istate->cache_nr); + hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, 0); for (nr = 0; nr < istate->cache_nr; nr++) hash_index_entry(istate, istate->cache[nr]); istate->name_hash_initialized = 1; @@ -160,31 +137,19 @@ static void lazy_init_name_hash(struct index_state *istate) void add_name_hash(struct index_state *istate, struct cache_entry *ce) { - /* if already hashed, add reference to directory entries */ - if (ignore_case && (ce->ce_flags & CE_STATE_MASK) == CE_STATE_MASK) - add_dir_entry(istate, ce); - - ce->ce_flags &= ~CE_UNHASHED; if (istate->name_hash_initialized) hash_index_entry(istate, ce); } -/* - * We don't actually *remove* it, we can just mark it invalid so that - * we won't find it in lookups. - * - * Not only would we have to search the lists (simple enough), but - * we'd also have to rehash other hash buckets in case this makes the - * hash bucket empty (common). So it's much better to just mark - * it. - */ void remove_name_hash(struct index_state *istate, struct cache_entry *ce) { - /* if already hashed, release reference to directory entries */ - if (ignore_case && (ce->ce_flags & CE_STATE_MASK) == CE_HASHED) - remove_dir_entry(istate, ce); + if (!istate->name_hash_initialized || !(ce->ce_flags & CE_HASHED)) + return; + ce->ce_flags &= ~CE_HASHED; + hashmap_remove(&istate->name_hash, ce, ce); - ce->ce_flags |= CE_UNHASHED; + if (ignore_case) + remove_dir_entry(istate, ce); } static int slow_same_name(const char *name1, int len1, const char *name2, int len2) @@ -247,49 +212,27 @@ struct cache_entry *index_dir_exists(struct index_state *istate, const char *nam struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase) { - unsigned int hash = hash_name(name, namelen); struct cache_entry *ce; + struct hashmap_entry key; lazy_init_name_hash(istate); - ce = lookup_hash(hash, &istate->name_hash); + hashmap_entry_init(&key, memihash(name, namelen)); + ce = hashmap_get(&istate->name_hash, &key, NULL); while (ce) { - if (!(ce->ce_flags & CE_UNHASHED)) { - if (same_name(ce, name, namelen, icase)) - return ce; - } - ce = ce->next; + if (same_name(ce, name, namelen, icase)) + return ce; + ce = hashmap_get_next(&istate->name_hash, ce); } return NULL; } -struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int icase) -{ - if (namelen > 0 && name[namelen - 1] == '/') - return index_dir_exists(istate, name, namelen - 1); - return index_file_exists(istate, name, namelen, icase); -} - -static int free_dir_entry(void *entry, void *unused) -{ - struct dir_entry *dir = entry; - while (dir) { - struct dir_entry *next = dir->next; - free(dir); - dir = next; - } - return 0; -} - void free_name_hash(struct index_state *istate) { if (!istate->name_hash_initialized) return; istate->name_hash_initialized = 0; - if (ignore_case) - /* free directory entries */ - for_each_hash(&istate->dir_hash, free_dir_entry, NULL); - free_hash(&istate->name_hash); - free_hash(&istate->dir_hash); + hashmap_free(&istate->name_hash, 0); + hashmap_free(&istate->dir_hash, 1); } diff --git a/read-cache.c b/read-cache.c index 33dd676ccb..3f735f3c8e 100644 --- a/read-cache.c +++ b/read-cache.c @@ -47,6 +47,7 @@ static void replace_index_entry(struct index_state *istate, int nr, struct cache struct cache_entry *old = istate->cache[nr]; remove_name_hash(istate, old); + free(old); set_index_entry(istate, nr, ce); istate->cache_changed = 1; } @@ -58,7 +59,7 @@ void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n new = xmalloc(cache_entry_size(namelen)); copy_cache_entry(new, old); - new->ce_flags &= ~CE_STATE_MASK; + new->ce_flags &= ~CE_HASHED; new->ce_namelen = namelen; memcpy(new->name, new_name, namelen + 1); @@ -478,6 +479,7 @@ int remove_index_entry_at(struct index_state *istate, int pos) record_resolve_undo(istate, ce); remove_name_hash(istate, ce); + free(ce); istate->cache_changed = 1; istate->cache_nr--; if (pos >= istate->cache_nr) @@ -499,8 +501,10 @@ void remove_marked_cache_entries(struct index_state *istate) unsigned int i, j; for (i = j = 0; i < istate->cache_nr; i++) { - if (ce_array[i]->ce_flags & CE_REMOVE) + if (ce_array[i]->ce_flags & CE_REMOVE) { remove_name_hash(istate, ce_array[i]); + free(ce_array[i]); + } else ce_array[j++] = ce_array[i]; } @@ -1894,7 +1898,7 @@ int read_index_unmerged(struct index_state *istate) new_ce->ce_mode = ce->ce_mode; if (add_index_entry(istate, new_ce, 0)) return error("%s: cannot drop to stage #0", - ce->name); + new_ce->name); i = index_name_pos(istate, new_ce->name, len); } return unmerged; diff --git a/resolve-undo.c b/resolve-undo.c index c09b00664e..49ebaaf8d8 100644 --- a/resolve-undo.c +++ b/resolve-undo.c @@ -119,6 +119,7 @@ int unmerge_index_entry_at(struct index_state *istate, int pos) struct string_list_item *item; struct resolve_undo_info *ru; int i, err = 0, matched; + char *name; if (!istate->resolve_undo) return pos; @@ -138,20 +139,22 @@ int unmerge_index_entry_at(struct index_state *istate, int pos) if (!ru) return pos; matched = ce->ce_flags & CE_MATCHED; + name = xstrdup(ce->name); remove_index_entry_at(istate, pos); for (i = 0; i < 3; i++) { struct cache_entry *nce; if (!ru->mode[i]) continue; nce = make_cache_entry(ru->mode[i], ru->sha1[i], - ce->name, i + 1, 0); + name, i + 1, 0); if (matched) nce->ce_flags |= CE_MATCHED; if (add_index_entry(istate, nce, ADD_CACHE_OK_TO_ADD)) { err = 1; - error("cannot unmerge '%s'", ce->name); + error("cannot unmerge '%s'", name); } } + free(name); if (err) return pos; free(ru); diff --git a/submodule.c b/submodule.c index 613857e400..b80ecacf60 100644 --- a/submodule.c +++ b/submodule.c @@ -116,30 +116,7 @@ int remove_path_from_gitmodules(const char *path) void stage_updated_gitmodules(void) { - struct strbuf buf = STRBUF_INIT; - struct stat st; - int pos; - struct cache_entry *ce; - int namelen = strlen(".gitmodules"); - - pos = cache_name_pos(".gitmodules", namelen); - if (pos < 0) { - warning(_("could not find .gitmodules in index")); - return; - } - ce = active_cache[pos]; - ce->ce_flags = namelen; - if (strbuf_read_file(&buf, ".gitmodules", 0) < 0) - die(_("reading updated .gitmodules failed")); - if (lstat(".gitmodules", &st) < 0) - die_errno(_("unable to stat updated .gitmodules")); - fill_stat_cache_info(ce, &st); - ce->ce_mode = ce_mode_from_stat(ce, st.st_mode); - if (remove_cache_entry_at(pos) < 0) - die(_("unable to remove .gitmodules from index")); - if (write_sha1_file(buf.buf, buf.len, blob_type, ce->sha1)) - die(_("adding updated .gitmodules failed")); - if (add_cache_entry(ce, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE)) + if (add_file_to_cache(".gitmodules", 0)) die(_("staging updated .gitmodules failed")); } diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh new file mode 100755 index 0000000000..391e2b6492 --- /dev/null +++ b/t/t0011-hashmap.sh @@ -0,0 +1,240 @@ +#!/bin/sh + +test_description='test hashmap and string hash functions' +. ./test-lib.sh + +test_hashmap() { + echo "$1" | test-hashmap $3 > actual && + echo "$2" > expect && + test_cmp expect actual +} + +test_expect_success 'hash functions' ' + +test_hashmap "hash key1" "2215982743 2215982743 116372151 116372151" && +test_hashmap "hash key2" "2215982740 2215982740 116372148 116372148" && +test_hashmap "hash fooBarFrotz" "1383912807 1383912807 3189766727 3189766727" && +test_hashmap "hash foobarfrotz" "2862305959 2862305959 3189766727 3189766727" + +' + +test_expect_success 'put' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +put foobarfrotz value4 +size" "NULL +NULL +NULL +NULL +64 4" + +' + +test_expect_success 'put (case insensitive)' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +size" "NULL +NULL +NULL +64 3" ignorecase + +' + +test_expect_success 'replace' ' + +test_hashmap "put key1 value1 +put key1 value2 +put fooBarFrotz value3 +put fooBarFrotz value4 +size" "NULL +value1 +NULL +value3 +64 2" + +' + +test_expect_success 'replace (case insensitive)' ' + +test_hashmap "put key1 value1 +put Key1 value2 +put fooBarFrotz value3 +put foobarfrotz value4 +size" "NULL +value1 +NULL +value3 +64 2" ignorecase + +' + +test_expect_success 'get' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +put foobarfrotz value4 +get key1 +get key2 +get fooBarFrotz +get notInMap" "NULL +NULL +NULL +NULL +value1 +value2 +value3 +NULL" + +' + +test_expect_success 'get (case insensitive)' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +get Key1 +get keY2 +get foobarfrotz +get notInMap" "NULL +NULL +NULL +value1 +value2 +value3 +NULL" ignorecase + +' + +test_expect_success 'add' ' + +test_hashmap "add key1 value1 +add key1 value2 +add fooBarFrotz value3 +add fooBarFrotz value4 +get key1 +get fooBarFrotz +get notInMap" "value2 +value1 +value4 +value3 +NULL" + +' + +test_expect_success 'add (case insensitive)' ' + +test_hashmap "add key1 value1 +add Key1 value2 +add fooBarFrotz value3 +add foobarfrotz value4 +get key1 +get Foobarfrotz +get notInMap" "value2 +value1 +value4 +value3 +NULL" ignorecase + +' + +test_expect_success 'remove' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +remove key1 +remove key2 +remove notInMap +size" "NULL +NULL +NULL +value1 +value2 +NULL +64 1" + +' + +test_expect_success 'remove (case insensitive)' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +remove Key1 +remove keY2 +remove notInMap +size" "NULL +NULL +NULL +value1 +value2 +NULL +64 1" ignorecase + +' + +test_expect_success 'iterate' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +iterate" "NULL +NULL +NULL +key2 value2 +key1 value1 +fooBarFrotz value3" + +' + +test_expect_success 'iterate (case insensitive)' ' + +test_hashmap "put key1 value1 +put key2 value2 +put fooBarFrotz value3 +iterate" "NULL +NULL +NULL +fooBarFrotz value3 +key2 value2 +key1 value1" ignorecase + +' + +test_expect_success 'grow / shrink' ' + + rm -f in && + rm -f expect && + for n in $(test_seq 51) + do + echo put key$n value$n >> in && + echo NULL >> expect + done && + echo size >> in && + echo 64 51 >> expect && + echo put key52 value52 >> in && + echo NULL >> expect + echo size >> in && + echo 256 52 >> expect && + for n in $(test_seq 12) + do + echo remove key$n >> in && + echo value$n >> expect + done && + echo size >> in && + echo 256 40 >> expect && + echo remove key40 >> in && + echo value40 >> expect && + echo size >> in && + echo 64 39 >> expect && + cat in | test-hashmap > out && + test_cmp expect out + +' + +test_done diff --git a/test-hashmap.c b/test-hashmap.c new file mode 100644 index 0000000000..f5183fb9e8 --- /dev/null +++ b/test-hashmap.c @@ -0,0 +1,255 @@ +#include "git-compat-util.h" +#include "hashmap.h" + +struct test_entry +{ + struct hashmap_entry ent; + /* key and value as two \0-terminated strings */ + char key[FLEX_ARRAY]; +}; + +static const char *get_value(const struct test_entry *e) +{ + return e->key + strlen(e->key) + 1; +} + +static int test_entry_cmp(const struct test_entry *e1, + const struct test_entry *e2, const char* key) +{ + return strcmp(e1->key, key ? key : e2->key); +} + +static int test_entry_cmp_icase(const struct test_entry *e1, + const struct test_entry *e2, const char* key) +{ + return strcasecmp(e1->key, key ? key : e2->key); +} + +static struct test_entry *alloc_test_entry(int hash, char *key, int klen, + char *value, int vlen) +{ + struct test_entry *entry = malloc(sizeof(struct test_entry) + klen + + vlen + 2); + hashmap_entry_init(entry, hash); + memcpy(entry->key, key, klen + 1); + memcpy(entry->key + klen + 1, value, vlen + 1); + return entry; +} + +#define HASH_METHOD_FNV 0 +#define HASH_METHOD_I 1 +#define HASH_METHOD_IDIV10 2 +#define HASH_METHOD_0 3 +#define HASH_METHOD_X2 4 +#define TEST_SPARSE 8 +#define TEST_ADD 16 +#define TEST_SIZE 100000 + +static unsigned int hash(unsigned int method, unsigned int i, const char *key) +{ + unsigned int hash; + switch (method & 3) + { + case HASH_METHOD_FNV: + hash = strhash(key); + break; + case HASH_METHOD_I: + hash = i; + break; + case HASH_METHOD_IDIV10: + hash = i / 10; + break; + case HASH_METHOD_0: + hash = 0; + break; + } + + if (method & HASH_METHOD_X2) + hash = 2 * hash; + return hash; +} + +/* + * Test performance of hashmap.[ch] + * Usage: time echo "perfhashmap method rounds" | test-hashmap + */ +static void perf_hashmap(unsigned int method, unsigned int rounds) +{ + struct hashmap map; + char buf[16]; + struct test_entry **entries; + unsigned int *hashes; + unsigned int i, j; + + entries = malloc(TEST_SIZE * sizeof(struct test_entry *)); + hashes = malloc(TEST_SIZE * sizeof(int)); + for (i = 0; i < TEST_SIZE; i++) { + snprintf(buf, sizeof(buf), "%i", i); + entries[i] = alloc_test_entry(0, buf, strlen(buf), "", 0); + hashes[i] = hash(method, i, entries[i]->key); + } + + if (method & TEST_ADD) { + /* test adding to the map */ + for (j = 0; j < rounds; j++) { + hashmap_init(&map, (hashmap_cmp_fn) test_entry_cmp, 0); + + /* add entries */ + for (i = 0; i < TEST_SIZE; i++) { + hashmap_entry_init(entries[i], hashes[i]); + hashmap_add(&map, entries[i]); + } + + hashmap_free(&map, 0); + } + } else { + /* test map lookups */ + hashmap_init(&map, (hashmap_cmp_fn) test_entry_cmp, 0); + + /* fill the map (sparsely if specified) */ + j = (method & TEST_SPARSE) ? TEST_SIZE / 10 : TEST_SIZE; + for (i = 0; i < j; i++) { + hashmap_entry_init(entries[i], hashes[i]); + hashmap_add(&map, entries[i]); + } + + for (j = 0; j < rounds; j++) { + for (i = 0; i < TEST_SIZE; i++) { + struct hashmap_entry key; + hashmap_entry_init(&key, hashes[i]); + hashmap_get(&map, &key, entries[i]->key); + } + } + + hashmap_free(&map, 0); + } +} + +#define DELIM " \t\r\n" + +/* + * Read stdin line by line and print result of commands to stdout: + * + * hash key -> strhash(key) memhash(key) strihash(key) memihash(key) + * put key value -> NULL / old value + * get key -> NULL / value + * remove key -> NULL / old value + * iterate -> key1 value1\nkey2 value2\n... + * size -> tablesize numentries + * + * perfhashmap method rounds -> test hashmap.[ch] performance + */ +int main(int argc, char *argv[]) +{ + char line[1024]; + struct hashmap map; + int icase; + + /* init hash map */ + icase = argc > 1 && !strcmp("ignorecase", argv[1]); + hashmap_init(&map, (hashmap_cmp_fn) (icase ? test_entry_cmp_icase + : test_entry_cmp), 0); + + /* process commands from stdin */ + while (fgets(line, sizeof(line), stdin)) { + char *cmd, *p1 = NULL, *p2 = NULL; + int l1 = 0, l2 = 0, hash = 0; + struct test_entry *entry; + + /* break line into command and up to two parameters */ + cmd = strtok(line, DELIM); + /* ignore empty lines */ + if (!cmd || *cmd == '#') + continue; + + p1 = strtok(NULL, DELIM); + if (p1) { + l1 = strlen(p1); + hash = icase ? strihash(p1) : strhash(p1); + p2 = strtok(NULL, DELIM); + if (p2) + l2 = strlen(p2); + } + + if (!strcmp("hash", cmd) && l1) { + + /* print results of different hash functions */ + printf("%u %u %u %u\n", strhash(p1), memhash(p1, l1), + strihash(p1), memihash(p1, l1)); + + } else if (!strcmp("add", cmd) && l1 && l2) { + + /* create entry with key = p1, value = p2 */ + entry = alloc_test_entry(hash, p1, l1, p2, l2); + + /* add to hashmap */ + hashmap_add(&map, entry); + + } else if (!strcmp("put", cmd) && l1 && l2) { + + /* create entry with key = p1, value = p2 */ + entry = alloc_test_entry(hash, p1, l1, p2, l2); + + /* add / replace entry */ + entry = hashmap_put(&map, entry); + + /* print and free replaced entry, if any */ + puts(entry ? get_value(entry) : "NULL"); + free(entry); + + } else if (!strcmp("get", cmd) && l1) { + + /* setup static key */ + struct hashmap_entry key; + hashmap_entry_init(&key, hash); + + /* lookup entry in hashmap */ + entry = hashmap_get(&map, &key, p1); + + /* print result */ + if (!entry) + puts("NULL"); + while (entry) { + puts(get_value(entry)); + entry = hashmap_get_next(&map, entry); + } + + } else if (!strcmp("remove", cmd) && l1) { + + /* setup static key */ + struct hashmap_entry key; + hashmap_entry_init(&key, hash); + + /* remove entry from hashmap */ + entry = hashmap_remove(&map, &key, p1); + + /* print result and free entry*/ + puts(entry ? get_value(entry) : "NULL"); + free(entry); + + } else if (!strcmp("iterate", cmd)) { + + struct hashmap_iter iter; + hashmap_iter_init(&map, &iter); + while ((entry = hashmap_iter_next(&iter))) + printf("%s %s\n", entry->key, get_value(entry)); + + } else if (!strcmp("size", cmd)) { + + /* print table sizes */ + printf("%u %u\n", map.tablesize, map.size); + + } else if (!strcmp("perfhashmap", cmd) && l1 && l2) { + + perf_hashmap(atoi(p1), atoi(p2)); + + } else { + + printf("Unknown command %s\n", cmd); + + } + } + + hashmap_free(&map, 1); + return 0; +} diff --git a/unpack-trees.c b/unpack-trees.c index 164354dad7..0692ebe16e 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -105,12 +105,11 @@ void setup_unpack_trees_porcelain(struct unpack_trees_options *opts, static void do_add_entry(struct unpack_trees_options *o, struct cache_entry *ce, unsigned int set, unsigned int clear) { - clear |= CE_HASHED | CE_UNHASHED; + clear |= CE_HASHED; if (set & CE_REMOVE) set |= CE_WT_REMOVE; - ce->next = NULL; ce->ce_flags = (ce->ce_flags & ~clear) | set; add_index_entry(&o->result, ce, ADD_CACHE_OK_TO_ADD | ADD_CACHE_OK_TO_REPLACE); |