summaryrefslogtreecommitdiff
path: root/src/libgit2/hashsig.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libgit2/hashsig.c')
-rw-r--r--src/libgit2/hashsig.c375
1 files changed, 375 insertions, 0 deletions
diff --git a/src/libgit2/hashsig.c b/src/libgit2/hashsig.c
new file mode 100644
index 000000000..6b4fb8352
--- /dev/null
+++ b/src/libgit2/hashsig.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+
+#include "common.h"
+
+#include "git2/sys/hashsig.h"
+#include "futils.h"
+#include "util.h"
+
+typedef uint32_t hashsig_t;
+typedef uint64_t hashsig_state;
+
+#define HASHSIG_SCALE 100
+
+#define HASHSIG_MAX_RUN 80
+#define HASHSIG_HASH_START INT64_C(0x012345678ABCDEF0)
+#define HASHSIG_HASH_SHIFT 5
+
+#define HASHSIG_HASH_MIX(S,CH) \
+ (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
+
+#define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
+#define HASHSIG_HEAP_MIN_SIZE 4
+
+typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
+
+typedef struct {
+ int size, asize;
+ hashsig_cmp cmp;
+ hashsig_t values[HASHSIG_HEAP_SIZE];
+} hashsig_heap;
+
+struct git_hashsig {
+ hashsig_heap mins;
+ hashsig_heap maxs;
+ size_t lines;
+ git_hashsig_option_t opt;
+};
+
+#define HEAP_LCHILD_OF(I) (((I)<<1)+1)
+#define HEAP_RCHILD_OF(I) (((I)<<1)+2)
+#define HEAP_PARENT_OF(I) (((I)-1)>>1)
+
+static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
+{
+ h->size = 0;
+ h->asize = HASHSIG_HEAP_SIZE;
+ h->cmp = cmp;
+}
+
+static int hashsig_cmp_max(const void *a, const void *b, void *payload)
+{
+ hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
+ GIT_UNUSED(payload);
+ return (av < bv) ? -1 : (av > bv) ? 1 : 0;
+}
+
+static int hashsig_cmp_min(const void *a, const void *b, void *payload)
+{
+ hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
+ GIT_UNUSED(payload);
+ return (av > bv) ? -1 : (av < bv) ? 1 : 0;
+}
+
+static void hashsig_heap_up(hashsig_heap *h, int el)
+{
+ int parent_el = HEAP_PARENT_OF(el);
+
+ while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
+ hashsig_t t = h->values[el];
+ h->values[el] = h->values[parent_el];
+ h->values[parent_el] = t;
+
+ el = parent_el;
+ parent_el = HEAP_PARENT_OF(el);
+ }
+}
+
+static void hashsig_heap_down(hashsig_heap *h, int el)
+{
+ hashsig_t v, lv, rv;
+
+ /* 'el < h->size / 2' tests if el is bottom row of heap */
+
+ while (el < h->size / 2) {
+ int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
+
+ v = h->values[el];
+ lv = h->values[lel];
+ rv = h->values[rel];
+
+ if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
+ break;
+
+ swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
+
+ h->values[el] = h->values[swapel];
+ h->values[swapel] = v;
+
+ el = swapel;
+ }
+}
+
+static void hashsig_heap_sort(hashsig_heap *h)
+{
+ /* only need to do this at the end for signature comparison */
+ git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
+}
+
+static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
+{
+ /* if heap is not full, insert new element */
+ if (h->size < h->asize) {
+ h->values[h->size++] = val;
+ hashsig_heap_up(h, h->size - 1);
+ }
+
+ /* if heap is full, pop top if new element should replace it */
+ else if (h->cmp(&val, &h->values[0], NULL) > 0) {
+ h->size--;
+ h->values[0] = h->values[h->size];
+ hashsig_heap_down(h, 0);
+ }
+
+}
+
+typedef struct {
+ int use_ignores;
+ uint8_t ignore_ch[256];
+} hashsig_in_progress;
+
+static int hashsig_in_progress_init(
+ hashsig_in_progress *prog, git_hashsig *sig)
+{
+ int i;
+
+ /* no more than one can be set */
+ GIT_ASSERT(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
+ !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
+
+ if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
+ for (i = 0; i < 256; ++i)
+ prog->ignore_ch[i] = git__isspace_nonlf(i);
+ prog->use_ignores = 1;
+ } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
+ for (i = 0; i < 256; ++i)
+ prog->ignore_ch[i] = git__isspace(i);
+ prog->use_ignores = 1;
+ } else {
+ memset(prog, 0, sizeof(*prog));
+ }
+
+ return 0;
+}
+
+static int hashsig_add_hashes(
+ git_hashsig *sig,
+ const uint8_t *data,
+ size_t size,
+ hashsig_in_progress *prog)
+{
+ const uint8_t *scan = data, *end = data + size;
+ hashsig_state state = HASHSIG_HASH_START;
+ int use_ignores = prog->use_ignores, len;
+ uint8_t ch;
+
+ while (scan < end) {
+ state = HASHSIG_HASH_START;
+
+ for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
+ ch = *scan;
+
+ if (use_ignores)
+ for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
+ ++scan;
+ else if (sig->opt &
+ (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
+ for (; scan < end && ch == '\r'; ch = *scan)
+ ++scan;
+
+ /* peek at next character to decide what to do next */
+ if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
+ use_ignores = (ch == '\n');
+
+ if (scan >= end)
+ break;
+ ++scan;
+
+ /* check run terminator */
+ if (ch == '\n' || ch == '\0') {
+ sig->lines++;
+ break;
+ }
+
+ ++len;
+ HASHSIG_HASH_MIX(state, ch);
+ }
+
+ if (len > 0) {
+ hashsig_heap_insert(&sig->mins, (hashsig_t)state);
+ hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
+
+ while (scan < end && (*scan == '\n' || !*scan))
+ ++scan;
+ }
+ }
+
+ prog->use_ignores = use_ignores;
+
+ return 0;
+}
+
+static int hashsig_finalize_hashes(git_hashsig *sig)
+{
+ if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
+ !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
+ git_error_set(GIT_ERROR_INVALID,
+ "file too small for similarity signature calculation");
+ return GIT_EBUFS;
+ }
+
+ hashsig_heap_sort(&sig->mins);
+ hashsig_heap_sort(&sig->maxs);
+
+ return 0;
+}
+
+static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
+{
+ git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
+ if (!sig)
+ return NULL;
+
+ hashsig_heap_init(&sig->mins, hashsig_cmp_min);
+ hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
+ sig->opt = opts;
+
+ return sig;
+}
+
+int git_hashsig_create(
+ git_hashsig **out,
+ const char *buf,
+ size_t buflen,
+ git_hashsig_option_t opts)
+{
+ int error;
+ hashsig_in_progress prog;
+ git_hashsig *sig = hashsig_alloc(opts);
+ GIT_ERROR_CHECK_ALLOC(sig);
+
+ if ((error = hashsig_in_progress_init(&prog, sig)) < 0)
+ return error;
+
+ error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
+
+ if (!error)
+ error = hashsig_finalize_hashes(sig);
+
+ if (!error)
+ *out = sig;
+ else
+ git_hashsig_free(sig);
+
+ return error;
+}
+
+int git_hashsig_create_fromfile(
+ git_hashsig **out,
+ const char *path,
+ git_hashsig_option_t opts)
+{
+ uint8_t buf[0x1000];
+ ssize_t buflen = 0;
+ int error = 0, fd;
+ hashsig_in_progress prog;
+ git_hashsig *sig = hashsig_alloc(opts);
+ GIT_ERROR_CHECK_ALLOC(sig);
+
+ if ((fd = git_futils_open_ro(path)) < 0) {
+ git__free(sig);
+ return fd;
+ }
+
+ if ((error = hashsig_in_progress_init(&prog, sig)) < 0) {
+ p_close(fd);
+ return error;
+ }
+
+ while (!error) {
+ if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
+ if ((error = (int)buflen) < 0)
+ git_error_set(GIT_ERROR_OS,
+ "read error on '%s' calculating similarity hashes", path);
+ break;
+ }
+
+ error = hashsig_add_hashes(sig, buf, buflen, &prog);
+ }
+
+ p_close(fd);
+
+ if (!error)
+ error = hashsig_finalize_hashes(sig);
+
+ if (!error)
+ *out = sig;
+ else
+ git_hashsig_free(sig);
+
+ return error;
+}
+
+void git_hashsig_free(git_hashsig *sig)
+{
+ git__free(sig);
+}
+
+static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
+{
+ int matches = 0, i, j, cmp;
+
+ GIT_ASSERT_WITH_RETVAL(a->cmp == b->cmp, 0);
+
+ /* hash heaps are sorted - just look for overlap vs total */
+
+ for (i = 0, j = 0; i < a->size && j < b->size; ) {
+ cmp = a->cmp(&a->values[i], &b->values[j], NULL);
+
+ if (cmp < 0)
+ ++i;
+ else if (cmp > 0)
+ ++j;
+ else {
+ ++i; ++j; ++matches;
+ }
+ }
+
+ return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
+}
+
+int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
+{
+ /* if we have no elements in either file then each file is either
+ * empty or blank. if we're ignoring whitespace then the files are
+ * similar, otherwise they're dissimilar.
+ */
+ if (a->mins.size == 0 && b->mins.size == 0) {
+ if ((!a->lines && !b->lines) ||
+ (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
+ return HASHSIG_SCALE;
+ else
+ return 0;
+ }
+
+ /* if we have fewer than the maximum number of elements, then just use
+ * one array since the two arrays will be the same
+ */
+ if (a->mins.size < HASHSIG_HEAP_SIZE) {
+ return hashsig_heap_compare(&a->mins, &b->mins);
+ } else {
+ int mins, maxs;
+
+ if ((mins = hashsig_heap_compare(&a->mins, &b->mins)) < 0)
+ return mins;
+ if ((maxs = hashsig_heap_compare(&a->maxs, &b->maxs)) < 0)
+ return maxs;
+
+ return (mins + maxs) / 2;
+ }
+}