diff options
Diffstat (limited to 'src/libgit2/hashsig.c')
-rw-r--r-- | src/libgit2/hashsig.c | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/src/libgit2/hashsig.c b/src/libgit2/hashsig.c new file mode 100644 index 000000000..6b4fb8352 --- /dev/null +++ b/src/libgit2/hashsig.c @@ -0,0 +1,375 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ + +#include "common.h" + +#include "git2/sys/hashsig.h" +#include "futils.h" +#include "util.h" + +typedef uint32_t hashsig_t; +typedef uint64_t hashsig_state; + +#define HASHSIG_SCALE 100 + +#define HASHSIG_MAX_RUN 80 +#define HASHSIG_HASH_START INT64_C(0x012345678ABCDEF0) +#define HASHSIG_HASH_SHIFT 5 + +#define HASHSIG_HASH_MIX(S,CH) \ + (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH) + +#define HASHSIG_HEAP_SIZE ((1 << 7) - 1) +#define HASHSIG_HEAP_MIN_SIZE 4 + +typedef int (*hashsig_cmp)(const void *a, const void *b, void *); + +typedef struct { + int size, asize; + hashsig_cmp cmp; + hashsig_t values[HASHSIG_HEAP_SIZE]; +} hashsig_heap; + +struct git_hashsig { + hashsig_heap mins; + hashsig_heap maxs; + size_t lines; + git_hashsig_option_t opt; +}; + +#define HEAP_LCHILD_OF(I) (((I)<<1)+1) +#define HEAP_RCHILD_OF(I) (((I)<<1)+2) +#define HEAP_PARENT_OF(I) (((I)-1)>>1) + +static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) +{ + h->size = 0; + h->asize = HASHSIG_HEAP_SIZE; + h->cmp = cmp; +} + +static int hashsig_cmp_max(const void *a, const void *b, void *payload) +{ + hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; + GIT_UNUSED(payload); + return (av < bv) ? -1 : (av > bv) ? 1 : 0; +} + +static int hashsig_cmp_min(const void *a, const void *b, void *payload) +{ + hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; + GIT_UNUSED(payload); + return (av > bv) ? -1 : (av < bv) ? 1 : 0; +} + +static void hashsig_heap_up(hashsig_heap *h, int el) +{ + int parent_el = HEAP_PARENT_OF(el); + + while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) { + hashsig_t t = h->values[el]; + h->values[el] = h->values[parent_el]; + h->values[parent_el] = t; + + el = parent_el; + parent_el = HEAP_PARENT_OF(el); + } +} + +static void hashsig_heap_down(hashsig_heap *h, int el) +{ + hashsig_t v, lv, rv; + + /* 'el < h->size / 2' tests if el is bottom row of heap */ + + while (el < h->size / 2) { + int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel; + + v = h->values[el]; + lv = h->values[lel]; + rv = h->values[rel]; + + if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0) + break; + + swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel; + + h->values[el] = h->values[swapel]; + h->values[swapel] = v; + + el = swapel; + } +} + +static void hashsig_heap_sort(hashsig_heap *h) +{ + /* only need to do this at the end for signature comparison */ + git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL); +} + +static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) +{ + /* if heap is not full, insert new element */ + if (h->size < h->asize) { + h->values[h->size++] = val; + hashsig_heap_up(h, h->size - 1); + } + + /* if heap is full, pop top if new element should replace it */ + else if (h->cmp(&val, &h->values[0], NULL) > 0) { + h->size--; + h->values[0] = h->values[h->size]; + hashsig_heap_down(h, 0); + } + +} + +typedef struct { + int use_ignores; + uint8_t ignore_ch[256]; +} hashsig_in_progress; + +static int hashsig_in_progress_init( + hashsig_in_progress *prog, git_hashsig *sig) +{ + int i; + + /* no more than one can be set */ + GIT_ASSERT(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) || + !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE)); + + if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) { + for (i = 0; i < 256; ++i) + prog->ignore_ch[i] = git__isspace_nonlf(i); + prog->use_ignores = 1; + } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) { + for (i = 0; i < 256; ++i) + prog->ignore_ch[i] = git__isspace(i); + prog->use_ignores = 1; + } else { + memset(prog, 0, sizeof(*prog)); + } + + return 0; +} + +static int hashsig_add_hashes( + git_hashsig *sig, + const uint8_t *data, + size_t size, + hashsig_in_progress *prog) +{ + const uint8_t *scan = data, *end = data + size; + hashsig_state state = HASHSIG_HASH_START; + int use_ignores = prog->use_ignores, len; + uint8_t ch; + + while (scan < end) { + state = HASHSIG_HASH_START; + + for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) { + ch = *scan; + + if (use_ignores) + for (; scan < end && git__isspace_nonlf(ch); ch = *scan) + ++scan; + else if (sig->opt & + (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE)) + for (; scan < end && ch == '\r'; ch = *scan) + ++scan; + + /* peek at next character to decide what to do next */ + if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) + use_ignores = (ch == '\n'); + + if (scan >= end) + break; + ++scan; + + /* check run terminator */ + if (ch == '\n' || ch == '\0') { + sig->lines++; + break; + } + + ++len; + HASHSIG_HASH_MIX(state, ch); + } + + if (len > 0) { + hashsig_heap_insert(&sig->mins, (hashsig_t)state); + hashsig_heap_insert(&sig->maxs, (hashsig_t)state); + + while (scan < end && (*scan == '\n' || !*scan)) + ++scan; + } + } + + prog->use_ignores = use_ignores; + + return 0; +} + +static int hashsig_finalize_hashes(git_hashsig *sig) +{ + if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE && + !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) { + git_error_set(GIT_ERROR_INVALID, + "file too small for similarity signature calculation"); + return GIT_EBUFS; + } + + hashsig_heap_sort(&sig->mins); + hashsig_heap_sort(&sig->maxs); + + return 0; +} + +static git_hashsig *hashsig_alloc(git_hashsig_option_t opts) +{ + git_hashsig *sig = git__calloc(1, sizeof(git_hashsig)); + if (!sig) + return NULL; + + hashsig_heap_init(&sig->mins, hashsig_cmp_min); + hashsig_heap_init(&sig->maxs, hashsig_cmp_max); + sig->opt = opts; + + return sig; +} + +int git_hashsig_create( + git_hashsig **out, + const char *buf, + size_t buflen, + git_hashsig_option_t opts) +{ + int error; + hashsig_in_progress prog; + git_hashsig *sig = hashsig_alloc(opts); + GIT_ERROR_CHECK_ALLOC(sig); + + if ((error = hashsig_in_progress_init(&prog, sig)) < 0) + return error; + + error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog); + + if (!error) + error = hashsig_finalize_hashes(sig); + + if (!error) + *out = sig; + else + git_hashsig_free(sig); + + return error; +} + +int git_hashsig_create_fromfile( + git_hashsig **out, + const char *path, + git_hashsig_option_t opts) +{ + uint8_t buf[0x1000]; + ssize_t buflen = 0; + int error = 0, fd; + hashsig_in_progress prog; + git_hashsig *sig = hashsig_alloc(opts); + GIT_ERROR_CHECK_ALLOC(sig); + + if ((fd = git_futils_open_ro(path)) < 0) { + git__free(sig); + return fd; + } + + if ((error = hashsig_in_progress_init(&prog, sig)) < 0) { + p_close(fd); + return error; + } + + while (!error) { + if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { + if ((error = (int)buflen) < 0) + git_error_set(GIT_ERROR_OS, + "read error on '%s' calculating similarity hashes", path); + break; + } + + error = hashsig_add_hashes(sig, buf, buflen, &prog); + } + + p_close(fd); + + if (!error) + error = hashsig_finalize_hashes(sig); + + if (!error) + *out = sig; + else + git_hashsig_free(sig); + + return error; +} + +void git_hashsig_free(git_hashsig *sig) +{ + git__free(sig); +} + +static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) +{ + int matches = 0, i, j, cmp; + + GIT_ASSERT_WITH_RETVAL(a->cmp == b->cmp, 0); + + /* hash heaps are sorted - just look for overlap vs total */ + + for (i = 0, j = 0; i < a->size && j < b->size; ) { + cmp = a->cmp(&a->values[i], &b->values[j], NULL); + + if (cmp < 0) + ++i; + else if (cmp > 0) + ++j; + else { + ++i; ++j; ++matches; + } + } + + return HASHSIG_SCALE * (matches * 2) / (a->size + b->size); +} + +int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) +{ + /* if we have no elements in either file then each file is either + * empty or blank. if we're ignoring whitespace then the files are + * similar, otherwise they're dissimilar. + */ + if (a->mins.size == 0 && b->mins.size == 0) { + if ((!a->lines && !b->lines) || + (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE)) + return HASHSIG_SCALE; + else + return 0; + } + + /* if we have fewer than the maximum number of elements, then just use + * one array since the two arrays will be the same + */ + if (a->mins.size < HASHSIG_HEAP_SIZE) { + return hashsig_heap_compare(&a->mins, &b->mins); + } else { + int mins, maxs; + + if ((mins = hashsig_heap_compare(&a->mins, &b->mins)) < 0) + return mins; + if ((maxs = hashsig_heap_compare(&a->maxs, &b->maxs)) < 0) + return maxs; + + return (mins + maxs) / 2; + } +} |