diff options
author | Russell Belfer <rb@github.com> | 2013-02-14 17:25:10 -0800 |
---|---|---|
committer | Russell Belfer <rb@github.com> | 2013-02-20 15:09:40 -0800 |
commit | 5e5848eb15cc0dd8476d1c6882a9f770e6556586 (patch) | |
tree | 953fd30d6360b67c2174b6c03fd2984561c84cf6 | |
parent | 99ba8f2322eaa2df51ace9782b8eadc8c5a6e8b8 (diff) | |
download | libgit2-5e5848eb15cc0dd8476d1c6882a9f770e6556586.tar.gz |
Change similarity metric to sampled hashes
This moves the similarity metric code out of buf_text and into a
new file. Also, this implements a different approach to similarity
measurement based on a Rabin-Karp rolling hash where we only keep
the top 100 and bottom 100 hashes. In theory, that should be
sufficient samples to given a fairly accurate measurement while
limiting the amount of data we keep for file signatures no matter
how large the file is.
-rw-r--r-- | src/buf_text.c | 360 | ||||
-rw-r--r-- | src/buf_text.h | 48 | ||||
-rw-r--r-- | src/diff_tform.c | 6 | ||||
-rw-r--r-- | src/hashsig.c | 364 | ||||
-rw-r--r-- | src/hashsig.h | 70 | ||||
-rw-r--r-- | tests-clar/core/buffer.c | 114 |
6 files changed, 498 insertions, 464 deletions
diff --git a/src/buf_text.c b/src/buf_text.c index 49ec16aaf..3a8f442b4 100644 --- a/src/buf_text.c +++ b/src/buf_text.c @@ -5,7 +5,6 @@ * a Linking Exception. For full terms see the included COPYING file. */ #include "buf_text.h" -#include "fileops.h" int git_buf_text_puts_escaped( git_buf *buf, @@ -213,362 +212,3 @@ bool git_buf_text_gather_stats( return (stats->nul > 0 || ((stats->printable >> 7) < stats->nonprintable)); } - -#define SIMILARITY_MAXRUN 256 -#define SIMILARITY_HASH_START 5381 -#define SIMILARITY_HASH_UPDATE(S,N) (((S) << 5) + (S) + (uint32_t)(N)) - -enum { - SIMILARITY_FORMAT_UNKNOWN = 0, - SIMILARITY_FORMAT_TEXT = 1, - SIMILARITY_FORMAT_BINARY = 2 -}; - -struct git_buf_text_hashsig { - uint32_t *hashes; - size_t size; - size_t asize; - unsigned int format : 2; - unsigned int pairs : 1; -}; - -static int similarity_record_hash(git_buf_text_hashsig *sig, uint32_t hash) -{ - if (sig->size >= sig->asize) { - size_t new_asize = sig->asize + 512; - uint32_t *new_hashes = - git__realloc(sig->hashes, new_asize * sizeof(uint32_t)); - GITERR_CHECK_ALLOC(new_hashes); - - sig->hashes = new_hashes; - sig->asize = new_asize; - } - - sig->hashes[sig->size++] = hash; - return 0; -} - -static int similarity_add_hashes_text( - git_buf_text_hashsig *sig, - uint32_t *hash_start, - size_t *hashlen_start, - const char *ptr, - size_t len) -{ - int error; - const char *scan = ptr, *scan_end = ptr + len; - uint32_t hash = *hash_start; - size_t hashlen = *hashlen_start; - - while (scan < scan_end) { - char ch = *scan++; - - if (ch == '\r' || ch == '\n' || hashlen >= SIMILARITY_MAXRUN) { - if ((error = similarity_record_hash(sig, hash)) < 0) - break; - - hash = SIMILARITY_HASH_START; - hashlen = 0; - - /* skip all whitespace immediately after line ending */ - while (scan < scan_end && git__isspace(*scan)) - scan++; - } else { - hash = SIMILARITY_HASH_UPDATE(hash, ch); - hashlen++; - } - } - - *hash_start = hash; - *hashlen_start = hashlen; - - return error; -} - -static int similarity_add_hashes_binary( - git_buf_text_hashsig *sig, - uint32_t *hash_start, - size_t *hashlen_start, - const char *ptr, - size_t len) -{ - int error; - const char *scan = ptr, *scan_end = ptr + len; - uint32_t hash = *hash_start; - size_t hashlen = *hashlen_start; - - while (scan < scan_end) { - char ch = *scan++; - - if (!ch || hashlen >= SIMILARITY_MAXRUN) { - if ((error = similarity_record_hash(sig, hash)) < 0) - break; - - hash = SIMILARITY_HASH_START; - hashlen = 0; - - /* skip run of terminators */ - while (scan < scan_end && !*scan) - scan++; - } else { - hash = SIMILARITY_HASH_UPDATE(hash, ch); - hashlen++; - } - } - - *hash_start = hash; - *hashlen_start = hashlen; - - return error; -} - -static int similarity_add_hashes( - git_buf_text_hashsig *sig, - uint32_t *hash_start, - size_t *hashlen_start, - const char *ptr, - size_t len) -{ - int error = 0; - uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START; - size_t hashlen = hashlen_start ? *hashlen_start : 0; - - if (sig->format == SIMILARITY_FORMAT_TEXT) - error = similarity_add_hashes_text(sig, &hash, &hashlen, ptr, len); - else - error = similarity_add_hashes_binary(sig, &hash, &hashlen, ptr, len); - - if (hash_start) - *hash_start = hash; - if (hashlen_start) - *hashlen_start = hashlen; - - /* if we're not saving intermediate state, add final hash as needed */ - if (!error && !hash_start && hashlen > 0) - error = similarity_record_hash(sig, hash); - - return error; -} - -/* - * Decide if \0 or \n terminated runs are a better choice for hashes - */ -static void similarity_guess_format( - git_buf_text_hashsig *sig, const char *ptr, size_t len) -{ - size_t lines = 0, line_length = 0, max_line_length = 0; - size_t runs = 0, run_length = 0, max_run_length = 0; - - /* don't process more than 4k of data for this */ - if (len > 4096) - len = 4096; - - /* gather some stats */ - while (len--) { - char ch = *ptr++; - - if (ch == '\0') { - runs++; - if (max_run_length < run_length) - max_run_length = run_length; - run_length = 0; - } else if (ch == '\n') { - lines++; - if (max_line_length < line_length) - max_line_length = line_length; - line_length = 0; - } else { - run_length++; - line_length++; - } - } - - /* the following heuristic could probably be improved */ - if (lines > runs) - sig->format = SIMILARITY_FORMAT_TEXT; - else if (runs > 0) - sig->format = SIMILARITY_FORMAT_BINARY; - else - sig->format = SIMILARITY_FORMAT_UNKNOWN; -} - -static int similarity_compare_score(const void *a, const void *b) -{ - uint32_t av = *(uint32_t *)a, bv = *(uint32_t *)b; - return (av < bv) ? -1 : (av > bv) ? 1 : 0; -} - -static int similarity_finalize_hashes( - git_buf_text_hashsig *sig, bool generate_pairs) -{ - if (!sig->size) - return 0; - - /* create pairwise hashes if requested */ - - if (generate_pairs) { - size_t i, needed_size = sig->size * 2 - 1; - - if (needed_size > sig->asize) { - uint32_t *new_hashes = - git__realloc(sig->hashes, needed_size * sizeof(uint32_t)); - GITERR_CHECK_ALLOC(new_hashes); - - sig->hashes = new_hashes; - sig->asize = needed_size; - } - - for (i = 1; i < sig->size; ++i) - sig->hashes[sig->size + i - 1] = - SIMILARITY_HASH_UPDATE(sig->hashes[i - 1], sig->hashes[i]); - - sig->pairs = 1; - } - - /* sort all hashes */ - - qsort(sig->hashes, sig->size, sizeof(uint32_t), similarity_compare_score); - - if (generate_pairs) - qsort(&sig->hashes[sig->size], sig->size - 1, sizeof(uint32_t), - similarity_compare_score); - - return 0; -} - -int git_buf_text_hashsig_create( - git_buf_text_hashsig **out, - const git_buf *buf, - bool generate_pairs) -{ - int error; - git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig)); - GITERR_CHECK_ALLOC(sig); - - similarity_guess_format(sig, buf->ptr, buf->size); - - error = similarity_add_hashes(sig, NULL, NULL, buf->ptr, buf->size); - - if (!error) - error = similarity_finalize_hashes(sig, generate_pairs); - - if (!error) - *out = sig; - else - git_buf_text_hashsig_free(sig); - - return error; -} - -int git_buf_text_hashsig_create_fromfile( - git_buf_text_hashsig **out, - const char *path, - bool generate_pairs) -{ - char buf[4096]; - ssize_t buflen = 0; - uint32_t hash = SIMILARITY_HASH_START; - size_t hashlen = 0; - int error = 0, fd; - git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig)); - GITERR_CHECK_ALLOC(sig); - - if ((fd = git_futils_open_ro(path)) < 0) { - git__free(sig); - return fd; - } - - while (!error && (buflen = p_read(fd, buf, sizeof(buf))) > 0) { - if (sig->format == SIMILARITY_FORMAT_UNKNOWN) - similarity_guess_format(sig, buf, buflen); - - error = similarity_add_hashes(sig, &hash, &hashlen, buf, buflen); - } - - if (buflen < 0) { - giterr_set(GITERR_OS, - "Read error on '%s' while calculating similarity hashes", path); - error = (int)buflen; - } - - p_close(fd); - - if (!error && hashlen > 0) - error = similarity_record_hash(sig, hash); - - if (!error) - error = similarity_finalize_hashes(sig, generate_pairs); - - if (!error) - *out = sig; - else - git_buf_text_hashsig_free(sig); - - return error; -} - -void git_buf_text_hashsig_free(git_buf_text_hashsig *sig) -{ - if (!sig) - return; - - if (sig->hashes) { - git__free(sig->hashes); - sig->hashes = NULL; - } - - git__free(sig); -} - -int git_buf_text_hashsig_compare( - const git_buf_text_hashsig *a, - const git_buf_text_hashsig *b, - int scale) -{ - size_t matches = 0, pairs = 0, total = 0, i, j; - - if (a->format != b->format || !a->size || !b->size) - return 0; - - if (scale <= 0) - scale = 100; - - /* hash lists are sorted - just look for overlap vs total */ - - for (i = 0, j = 0; i < a->size && j < b->size; ) { - uint32_t av = a->hashes[i]; - uint32_t bv = b->hashes[j]; - - if (av < bv) - ++i; - else if (av > bv) - ++j; - else { - ++i; ++j; - ++matches; - } - } - - total = (a->size + b->size); - - if (a->pairs && b->pairs) { - for (i = 0, j = 0; i < a->size - 1 && j < b->size - 1; ) { - uint32_t av = a->hashes[i + a->size]; - uint32_t bv = b->hashes[j + b->size]; - - if (av < bv) - ++i; - else if (av > bv) - ++j; - else { - ++i; ++j; - ++pairs; - } - } - - total += (a->size + b->size - 2); - } - - return (int)(scale * 2 * (matches + pairs) / total); -} - diff --git a/src/buf_text.h b/src/buf_text.h index c48c010d3..458ee33c9 100644 --- a/src/buf_text.h +++ b/src/buf_text.h @@ -105,52 +105,4 @@ extern int git_buf_text_detect_bom( extern bool git_buf_text_gather_stats( git_buf_text_stats *stats, const git_buf *buf, bool skip_bom); -/** - * Similarity signature of line hashes for a buffer - */ -typedef struct git_buf_text_hashsig git_buf_text_hashsig; - -/** - * Build a similarity signature for a buffer - * - * This can either generate a simple array of hashed lines/runs in the - * file, or it can also keep hashes of pairs of runs in sequence. Adding - * the pairwise runs means the final score will be sensitive to line - * ordering changes as well as individual line contents. - * - * @param out The array of hashed runs representing the file content - * @param buf The contents of the file to hash - * @param generate_pairwise_hashes Should pairwise runs be hashed - */ -extern int git_buf_text_hashsig_create( - git_buf_text_hashsig **out, - const git_buf *buf, - bool generate_pairwise_hashes); - -/** - * Build a similarity signature from a file - * - * This walks through the file, only loading a maximum of 4K of file data at - * a time. Otherwise, it acts just like `git_buf_text_hashsig_create`. - */ -extern int git_buf_text_hashsig_create_fromfile( - git_buf_text_hashsig **out, - const char *path, - bool generate_pairwise_hashes); - -/** - * Release memory for a content similarity signature - */ -extern void git_buf_text_hashsig_free(git_buf_text_hashsig *sig); - -/** - * Measure similarity between two files - * - * @return <0 for error, [0 to scale] as similarity score - */ -extern int git_buf_text_hashsig_compare( - const git_buf_text_hashsig *a, - const git_buf_text_hashsig *b, - int scale); - #endif diff --git a/src/diff_tform.c b/src/diff_tform.c index dbcdc277f..3939230b7 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -7,7 +7,7 @@ #include "common.h" #include "diff.h" #include "git2/config.h" -#include "buf_text.h" +#include "hashsig.h" static git_diff_delta *diff_delta__dup( const git_diff_delta *d, git_pool *pool) @@ -300,7 +300,7 @@ on_error: typedef struct { /* array of delta index * 2 + (old_file/new_file) -> file hashes */ - git_buf_text_hashsig *sigs; + git_hashsig *sigs; } diff_similarity_cache; static unsigned int calc_similarity( @@ -308,6 +308,8 @@ static unsigned int calc_similarity( { diff_similarity_cache *cache = ref; + GIT_UNUSED(cache); + if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0) return 100; diff --git a/src/hashsig.c b/src/hashsig.c new file mode 100644 index 000000000..a9cd8fa53 --- /dev/null +++ b/src/hashsig.c @@ -0,0 +1,364 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#include "hashsig.h" +#include "fileops.h" + +typedef uint32_t hashsig_t; +typedef uint64_t hashsig_state; + +#define HASHSIG_SCALE 100 + +#define HASHSIG_HASH_WINDOW 32 +#define HASHSIG_HASH_START 0 +#define HASHSIG_HASH_SHIFT 5 +#define HASHSIG_HASH_MASK 0x7FFFFFFF + +#define HASHSIG_HEAP_SIZE ((1 << 7) - 1) + +typedef int (*hashsig_cmp)(const void *a, const void *b); + +typedef struct { + int size, asize; + hashsig_cmp cmp; + hashsig_t values[HASHSIG_HEAP_SIZE]; +} hashsig_heap; + +typedef struct { + hashsig_state state, shift_n; + char window[HASHSIG_HASH_WINDOW]; + int win_len, win_pos, saw_lf; +} hashsig_in_progress; + +#define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 } + +struct git_hashsig { + hashsig_heap mins; + hashsig_heap maxs; + git_hashsig_option_t opt; + int considered; +}; + +#define HEAP_LCHILD_OF(I) (((I)*2)+1) +#define HEAP_RCHILD_OF(I) (((I)*2)+2) +#define HEAP_PARENT_OF(I) (((I)-1)>>1) + +static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) +{ + h->size = 0; + h->asize = HASHSIG_HEAP_SIZE; + h->cmp = cmp; +} + +static int hashsig_cmp_max(const void *a, const void *b) +{ + hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; + return (av < bv) ? -1 : (av > bv) ? 1 : 0; +} + +static int hashsig_cmp_min(const void *a, const void *b) +{ + hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; + return (av > bv) ? -1 : (av < bv) ? 1 : 0; +} + +static void hashsig_heap_up(hashsig_heap *h, int el) +{ + int parent_el = HEAP_PARENT_OF(el); + + while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el]) > 0) { + hashsig_t t = h->values[el]; + h->values[el] = h->values[parent_el]; + h->values[parent_el] = t; + + el = parent_el; + parent_el = HEAP_PARENT_OF(el); + } +} + +static void hashsig_heap_down(hashsig_heap *h, int el) +{ + hashsig_t v, lv, rv; + + /* 'el < h->size / 2' tests if el is bottom row of heap */ + + while (el < h->size / 2) { + int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel; + + v = h->values[el]; + lv = h->values[lel]; + rv = h->values[rel]; + + if (h->cmp(&v, &lv) < 0 && h->cmp(&v, &rv) < 0) + break; + + swapel = (h->cmp(&lv, &rv) < 0) ? lel : rel; + + h->values[el] = h->values[swapel]; + h->values[swapel] = v; + + el = swapel; + } +} + +static void hashsig_heap_sort(hashsig_heap *h) +{ + /* only need to do this at the end for signature comparison */ + qsort(h->values, h->size, sizeof(hashsig_t), h->cmp); +} + +static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) +{ + /* if heap is full, pop top if new element should replace it */ + if (h->size == h->asize && h->cmp(&val, &h->values[0]) > 0) { + h->size--; + h->values[0] = h->values[h->size]; + hashsig_heap_down(h, 0); + } + + /* if heap is not full, insert new element */ + if (h->size < h->asize) { + h->values[h->size++] = val; + hashsig_heap_up(h, h->size - 1); + } +} + +GIT_INLINE(bool) hashsig_include_char( + char ch, git_hashsig_option_t opt, int *saw_lf) +{ + if ((opt & GIT_HASHSIG_IGNORE_WHITESPACE) && git__isspace(ch)) + return false; + + if (opt & GIT_HASHSIG_SMART_WHITESPACE) { + if (ch == '\r' || (*saw_lf && git__isspace(ch))) + return false; + + *saw_lf = (ch == '\n'); + } + + return true; +} + +static void hashsig_initial_window( + git_hashsig *sig, + const char **data, + size_t size, + hashsig_in_progress *prog) +{ + hashsig_state state, shift_n; + int win_len; + const char *scan, *end; + + /* init until we have processed at least HASHSIG_HASH_WINDOW data */ + + if (prog->win_len >= HASHSIG_HASH_WINDOW) + return; + + state = prog->state; + win_len = prog->win_len; + shift_n = prog->shift_n; + + scan = *data; + end = scan + size; + + while (scan < end && win_len < HASHSIG_HASH_WINDOW) { + char ch = *scan++; + + if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf)) + continue; + + state = (state * HASHSIG_HASH_SHIFT + ch) & HASHSIG_HASH_MASK; + + if (!win_len) + shift_n = 1; + else + shift_n = (shift_n * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK; + + prog->window[win_len++] = ch; + } + + /* insert initial hash if we just finished */ + + if (win_len == HASHSIG_HASH_WINDOW) { + hashsig_heap_insert(&sig->mins, state); + hashsig_heap_insert(&sig->maxs, state); + sig->considered = 1; + } + + prog->state = state; + prog->win_len = win_len; + prog->shift_n = shift_n; + + *data = scan; +} + +static int hashsig_add_hashes( + git_hashsig *sig, + const char *data, + size_t size, + hashsig_in_progress *prog) +{ + const char *scan = data, *end = data + size; + hashsig_state state, shift_n, rmv; + + if (prog->win_len < HASHSIG_HASH_WINDOW) + hashsig_initial_window(sig, &scan, size, prog); + + state = prog->state; + shift_n = prog->shift_n; + + /* advance window, adding new chars and removing old */ + + for (; scan < end; ++scan) { + char ch = *scan; + + if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf)) + continue; + + rmv = shift_n * prog->window[prog->win_pos]; + + state = (state - rmv) & HASHSIG_HASH_MASK; + state = (state * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK; + state = (state + ch) & HASHSIG_HASH_MASK; + + hashsig_heap_insert(&sig->mins, state); + hashsig_heap_insert(&sig->maxs, state); + sig->considered++; + + prog->window[prog->win_pos] = ch; + prog->win_pos = (prog->win_pos + 1) % HASHSIG_HASH_WINDOW; + } + + prog->state = state; + + return 0; +} + +static int hashsig_finalize_hashes(git_hashsig *sig) +{ + if (sig->mins.size < HASHSIG_HEAP_SIZE) { + giterr_set(GITERR_INVALID, + "File too small for similarity signature calculation"); + return GIT_EBUFS; + } + + hashsig_heap_sort(&sig->mins); + hashsig_heap_sort(&sig->maxs); + + return 0; +} + +static git_hashsig *hashsig_alloc(git_hashsig_option_t opts) +{ + git_hashsig *sig = git__calloc(1, sizeof(git_hashsig)); + if (!sig) + return NULL; + + hashsig_heap_init(&sig->mins, hashsig_cmp_min); + hashsig_heap_init(&sig->maxs, hashsig_cmp_max); + sig->opt = opts; + + return sig; +} + +int git_hashsig_create( + git_hashsig **out, + const git_buf *buf, + git_hashsig_option_t opts) +{ + int error; + hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT; + git_hashsig *sig = hashsig_alloc(opts); + GITERR_CHECK_ALLOC(sig); + + error = hashsig_add_hashes(sig, buf->ptr, buf->size, &prog); + + if (!error) + error = hashsig_finalize_hashes(sig); + + if (!error) + *out = sig; + else + git_hashsig_free(sig); + + return error; +} + +int git_hashsig_create_fromfile( + git_hashsig **out, + const char *path, + git_hashsig_option_t opts) +{ + char buf[4096]; + ssize_t buflen = 0; + int error = 0, fd; + hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT; + git_hashsig *sig = hashsig_alloc(opts); + GITERR_CHECK_ALLOC(sig); + + if ((fd = git_futils_open_ro(path)) < 0) { + git__free(sig); + return fd; + } + + while (!error) { + if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { + if ((error = buflen) < 0) + giterr_set(GITERR_OS, + "Read error on '%s' calculating similarity hashes", path); + break; + } + + error = hashsig_add_hashes(sig, buf, buflen, &prog); + } + + p_close(fd); + + if (!error) + error = hashsig_finalize_hashes(sig); + + if (!error) + *out = sig; + else + git_hashsig_free(sig); + + return error; +} + +void git_hashsig_free(git_hashsig *sig) +{ + git__free(sig); +} + +static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) +{ + int matches = 0, i, j, cmp; + + assert(a->cmp == b->cmp); + + /* hash heaps are sorted - just look for overlap vs total */ + + for (i = 0, j = 0; i < a->size && j < b->size; ) { + cmp = a->cmp(&a->values[i], &b->values[j]); + + if (cmp < 0) + ++i; + else if (cmp > 0) + ++j; + else { + ++i; ++j; ++matches; + } + } + + return HASHSIG_SCALE * (matches * 2) / (a->size + b->size); +} + +int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) +{ + return (hashsig_heap_compare(&a->mins, &b->mins) + + hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; +} + diff --git a/src/hashsig.h b/src/hashsig.h new file mode 100644 index 000000000..70b47f5f3 --- /dev/null +++ b/src/hashsig.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#ifndef INCLUDE_hashsig_h__ +#define INCLUDE_hashsig_h__ + +#include "buffer.h" + +/** + * Similarity signature of line hashes for a buffer + */ +typedef struct git_hashsig git_hashsig; + +typedef enum { + GIT_HASHSIG_NORMAL = 0, /* use all data */ + GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */ + GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */ +} git_hashsig_option_t; + +/** + * Build a similarity signature for a buffer + * + * If you have passed a whitespace-ignoring buffer, then the whitespace + * will be removed from the buffer while it is being processed, modifying + * the buffer in place. Sorry about that! + * + * This will return an error if the buffer doesn't contain enough data to + * compute a valid signature. + * + * @param out The array of hashed runs representing the file content + * @param buf The contents of the file to hash + * @param generate_pairwise_hashes Should pairwise runs be hashed + */ +extern int git_hashsig_create( + git_hashsig **out, + const git_buf *buf, + git_hashsig_option_t opts); + +/** + * Build a similarity signature from a file + * + * This walks through the file, only loading a maximum of 4K of file data at + * a time. Otherwise, it acts just like `git_hashsig_create`. + * + * This will return an error if the file doesn't contain enough data to + * compute a valid signature. + */ +extern int git_hashsig_create_fromfile( + git_hashsig **out, + const char *path, + git_hashsig_option_t opts); + +/** + * Release memory for a content similarity signature + */ +extern void git_hashsig_free(git_hashsig *sig); + +/** + * Measure similarity between two files + * + * @return <0 for error, [0 to 100] as similarity score + */ +extern int git_hashsig_compare( + const git_hashsig *a, + const git_hashsig *b); + +#endif diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c index 63753bb67..6cad05c00 100644 --- a/tests-clar/core/buffer.c +++ b/tests-clar/core/buffer.c @@ -1,6 +1,7 @@ #include "clar_libgit2.h" #include "buffer.h" #include "buf_text.h" +#include "hashsig.h" #include "fileops.h" #define TESTSTR "Have you seen that? Have you seeeen that??" @@ -732,89 +733,94 @@ void test_core_buffer__classify_with_utf8(void) cl_assert(git_buf_text_contains_nul(&b)); } +#define SIMILARITY_TEST_DATA_1 \ + "test data\nright here\ninline\ntada\nneeds more data\nlots of data\n" \ + "is this enough?\nthere has to be enough data to fill the hash array!\n" \ + "Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n" \ + "Let's make sure we've got plenty to go with here.\n smile \n" + void test_core_buffer__similarity_metric(void) { - git_buf_text_hashsig *a, *b; + git_hashsig *a, *b; git_buf buf = GIT_BUF_INIT; int sim; /* in the first case, we compare data to itself and expect 100% match */ - cl_git_pass(git_buf_sets(&buf, "test data\nright here\ninline\ntada")); - cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); - cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true)); + cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1)); + cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL)); + cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL)); - cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100)); + cl_assert_equal_i(100, git_hashsig_compare(a, b)); - git_buf_text_hashsig_free(a); - git_buf_text_hashsig_free(b); + git_hashsig_free(a); + git_hashsig_free(b); - /* in the second case, half of a is matched and all of b is matched, so - * we'll expect a score of around 66% to be the similarity score - */ + /* if we change just a single byte, how much does that change magnify? */ - cl_git_pass( - git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); - cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); + cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1)); + cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL)); + cl_git_pass(git_buf_sets(&buf, + "Test data\nright here\ninline\ntada\nneeds more data\nlots of data\n" + "is this enough?\nthere has to be enough data to fill the hash array!\n" + "Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n" + "Let's make sure we've got plenty to go with here.\n smile \n")); + cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL)); - cl_git_pass(git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh")); - cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true)); + sim = git_hashsig_compare(a, b); - sim = git_buf_text_hashsig_compare(a, b, 100); - cl_assert(sim > 60 && sim < 70); + cl_assert(95 < sim && sim < 100); /* expect >95% similarity */ - git_buf_text_hashsig_free(a); - git_buf_text_hashsig_free(b); + git_hashsig_free(a); + git_hashsig_free(b); - /* in the reversed case, 100% of line hashes match, but no pairwise hashes - * match, so we'll expect about a 50% match for a reversed file - */ + /* let's try comparing data to a superset of itself */ - cl_git_pass( - git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); - cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); - cl_git_pass( - git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n")); - cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true)); + cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1)); + cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL)); + cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1 + "and if I add some more, it should still be pretty similar, yes?\n")); + cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL)); - sim = git_buf_text_hashsig_compare(a, b, 100); - cl_assert(sim > 45 && sim < 55); + sim = git_hashsig_compare(a, b); - git_buf_text_hashsig_free(a); - git_buf_text_hashsig_free(b); + cl_assert(70 < sim && sim < 80); /* expect in the 70-80% similarity range */ - /* if we don't use pairwise signatures, then a reversed file should - * match 100% - */ + git_hashsig_free(a); + git_hashsig_free(b); + + /* what if we keep about half the original data and add half new */ + + cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1)); + cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL)); + cl_git_pass(git_buf_sets(&buf, + "test data\nright here\ninline\ntada\nneeds more data\nlots of data\n" + "is this enough?\nthere has to be enough data to fill the hash array!\n" + "okay, that's half the original\nwhat else can we add?\nmore data\n" + "one more line will complete this\nshort\nlines\ndon't\nmatter\n")); + cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL)); - cl_git_pass( - git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); - cl_git_pass(git_buf_text_hashsig_create(&a, &buf, false)); - cl_git_pass( - git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n")); - cl_git_pass(git_buf_text_hashsig_create(&b, &buf, false)); + sim = git_hashsig_compare(a, b); - sim = git_buf_text_hashsig_compare(a, b, 100); - cl_assert_equal_i(100, sim); + cl_assert(40 < sim && sim < 60); /* expect in the 40-60% similarity range */ - git_buf_text_hashsig_free(a); - git_buf_text_hashsig_free(b); + git_hashsig_free(a); + git_hashsig_free(b); /* lastly, let's check that we can hash file content as well */ - cl_git_pass( - git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n")); - cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true)); + cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1)); + cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL)); cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH)); - cl_git_mkfile("scratch/testdata", - "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"); - cl_git_pass(git_buf_text_hashsig_create_fromfile(&b, "scratch/testdata", true)); + cl_git_mkfile("scratch/testdata", SIMILARITY_TEST_DATA_1); + cl_git_pass(git_hashsig_create_fromfile( + &b, "scratch/testdata", GIT_HASHSIG_NORMAL)); - cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100)); + cl_assert_equal_i(100, git_hashsig_compare(a, b)); - git_buf_text_hashsig_free(a); - git_buf_text_hashsig_free(b); + git_hashsig_free(a); + git_hashsig_free(b); git_buf_free(&buf); git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES); |