Change similarity metric to sampled hashes

This moves the similarity metric code out of buf_text and into a new file. Also, this implements a different approach to similarity measurement based on a Rabin-Karp rolling hash where we only keep the top 100 and bottom 100 hashes. In theory, that should be sufficient samples to given a fairly accurate measurement while limiting the amount of data we keep for file signatures no matter how large the file is.
author: Russell Belfer <rb@github.com> 2013-02-14 17:25:10 -0800
committer: Russell Belfer <rb@github.com> 2013-02-20 15:09:40 -0800
commit: 5e5848eb15cc0dd8476d1c6882a9f770e6556586 (patch)
tree: 953fd30d6360b67c2174b6c03fd2984561c84cf6
parent: 99ba8f2322eaa2df51ace9782b8eadc8c5a6e8b8 (diff)
download: libgit2-5e5848eb15cc0dd8476d1c6882a9f770e6556586.tar.gz
6 files changed, 498 insertions, 464 deletions
diff --git a/src/buf_text.c b/src/buf_text.c
index 49ec16aaf..3a8f442b4 100644
--- a/src/buf_text.c
+++ b/src/buf_text.c
@@ -5,7 +5,6 @@
  * a Linking Exception. For full terms see the included COPYING file.
  */
 #include "buf_text.h"
-#include "fileops.h"
 
 int git_buf_text_puts_escaped(
 	git_buf *buf,
@@ -213,362 +212,3 @@ bool git_buf_text_gather_stats(
 	return (stats->nul > 0 ||
 		((stats->printable >> 7) < stats->nonprintable));
 }
-
-#define SIMILARITY_MAXRUN 256
-#define SIMILARITY_HASH_START  5381
-#define SIMILARITY_HASH_UPDATE(S,N) (((S) << 5) + (S) + (uint32_t)(N))
-
-enum {
-	SIMILARITY_FORMAT_UNKNOWN = 0,
-	SIMILARITY_FORMAT_TEXT = 1,
-	SIMILARITY_FORMAT_BINARY = 2
-};
-
-struct git_buf_text_hashsig {
-	uint32_t *hashes;
-	size_t size;
-	size_t asize;
-	unsigned int format : 2;
-	unsigned int pairs : 1;
-};
-
-static int similarity_record_hash(git_buf_text_hashsig *sig, uint32_t hash)
-{
-	if (sig->size >= sig->asize) {
-		size_t new_asize = sig->asize + 512;
-		uint32_t *new_hashes =
-			git__realloc(sig->hashes, new_asize * sizeof(uint32_t));
-		GITERR_CHECK_ALLOC(new_hashes);
-
-		sig->hashes = new_hashes;
-		sig->asize  = new_asize;
-	}
-
-	sig->hashes[sig->size++] = hash;
-	return 0;
-}
-
-static int similarity_add_hashes_text(
-	git_buf_text_hashsig *sig,
-	uint32_t *hash_start,
-	size_t *hashlen_start,
-	const char *ptr,
-	size_t len)
-{
-	int error;
-	const char *scan = ptr, *scan_end = ptr + len;
-	uint32_t hash = *hash_start;
-	size_t hashlen = *hashlen_start;
-
-	while (scan < scan_end) {
-		char ch = *scan++;
-
-		if (ch == '\r' || ch == '\n' || hashlen >= SIMILARITY_MAXRUN) {
-			if ((error = similarity_record_hash(sig, hash)) < 0)
-				break;
-
-			hash = SIMILARITY_HASH_START;
-			hashlen = 0;
-
-			/* skip all whitespace immediately after line ending */
-			while (scan < scan_end && git__isspace(*scan))
-				scan++;
-		} else {
-			hash = SIMILARITY_HASH_UPDATE(hash, ch);
-			hashlen++;
-		}
-	}
-
-	*hash_start = hash;
-	*hashlen_start = hashlen;
-
-	return error;
-}
-
-static int similarity_add_hashes_binary(
-	git_buf_text_hashsig *sig,
-	uint32_t *hash_start,
-	size_t *hashlen_start,
-	const char *ptr,
-	size_t len)
-{
-	int error;
-	const char *scan = ptr, *scan_end = ptr + len;
-	uint32_t hash = *hash_start;
-	size_t hashlen = *hashlen_start;
-
-	while (scan < scan_end) {
-		char ch = *scan++;
-
-		if (!ch || hashlen >= SIMILARITY_MAXRUN) {
-			if ((error = similarity_record_hash(sig, hash)) < 0)
-				break;
-
-			hash = SIMILARITY_HASH_START;
-			hashlen = 0;
-
-			/* skip run of terminators */
-			while (scan < scan_end && !*scan)
-				scan++;
-		} else {
-			hash = SIMILARITY_HASH_UPDATE(hash, ch);
-			hashlen++;
-		}
-	}
-
-	*hash_start = hash;
-	*hashlen_start = hashlen;
-
-	return error;
-}
-
-static int similarity_add_hashes(
-	git_buf_text_hashsig *sig,
-	uint32_t *hash_start,
-	size_t *hashlen_start,
-	const char *ptr,
-	size_t len)
-{
-	int error = 0;
-	uint32_t hash = hash_start ? *hash_start : SIMILARITY_HASH_START;
-	size_t hashlen = hashlen_start ? *hashlen_start : 0;
-
-	if (sig->format == SIMILARITY_FORMAT_TEXT)
-		error = similarity_add_hashes_text(sig, &hash, &hashlen, ptr, len);
-	else
-		error = similarity_add_hashes_binary(sig, &hash, &hashlen, ptr, len);
-
-	if (hash_start)
-		*hash_start = hash;
-	if (hashlen_start)
-		*hashlen_start = hashlen;
-
-	/* if we're not saving intermediate state, add final hash as needed */
-	if (!error && !hash_start && hashlen > 0)
-		error = similarity_record_hash(sig, hash);
-
-	return error;
-}
-
-/*
- * Decide if \0 or \n terminated runs are a better choice for hashes
- */
-static void similarity_guess_format(
-	git_buf_text_hashsig *sig, const char *ptr, size_t len)
-{
-	size_t lines = 0, line_length = 0, max_line_length = 0;
-	size_t runs = 0, run_length = 0, max_run_length = 0;
-
-	/* don't process more than 4k of data for this */
-	if (len > 4096)
-		len = 4096;
-
-	/* gather some stats */
-	while (len--) {
-		char ch = *ptr++;
-
-		if (ch == '\0') {
-			runs++;
-			if (max_run_length < run_length)
-				max_run_length = run_length;
-			run_length = 0;
-		} else if (ch == '\n') {
-			lines++;
-			if (max_line_length < line_length)
-				max_line_length = line_length;
-			line_length = 0;
-		} else {
-			run_length++;
-			line_length++;
-		}
-	}
-
-	/* the following heuristic could probably be improved */
-	if (lines > runs)
-		sig->format = SIMILARITY_FORMAT_TEXT;
-	else if (runs > 0)
-		sig->format = SIMILARITY_FORMAT_BINARY;
-	else
-		sig->format = SIMILARITY_FORMAT_UNKNOWN;
-}
-
-static int similarity_compare_score(const void *a, const void *b)
-{
-	uint32_t av = *(uint32_t *)a, bv = *(uint32_t *)b;
-	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
-}
-
-static int similarity_finalize_hashes(
-	git_buf_text_hashsig *sig, bool generate_pairs)
-{
-	if (!sig->size)
-		return 0;
-
-	/* create pairwise hashes if requested */
-
-	if (generate_pairs) {
-		size_t i, needed_size = sig->size * 2 - 1;
-
-		if (needed_size > sig->asize) {
-			uint32_t *new_hashes =
-				git__realloc(sig->hashes, needed_size * sizeof(uint32_t));
-			GITERR_CHECK_ALLOC(new_hashes);
-
-			sig->hashes = new_hashes;
-			sig->asize  = needed_size;
-		}
-
-		for (i = 1; i < sig->size; ++i)
-			sig->hashes[sig->size + i - 1] =
-				SIMILARITY_HASH_UPDATE(sig->hashes[i - 1], sig->hashes[i]);
-
-		sig->pairs = 1;
-	}
-
-	/* sort all hashes */
-
-	qsort(sig->hashes, sig->size, sizeof(uint32_t), similarity_compare_score);
-
-	if (generate_pairs)
-		qsort(&sig->hashes[sig->size], sig->size - 1, sizeof(uint32_t),
-			similarity_compare_score);
-
-	return 0;
-}
-
-int git_buf_text_hashsig_create(
-	git_buf_text_hashsig **out,
-	const git_buf *buf,
-	bool generate_pairs)
-{
-	int error;
-	git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig));
-	GITERR_CHECK_ALLOC(sig);
-
-	similarity_guess_format(sig, buf->ptr, buf->size);
-
-	error = similarity_add_hashes(sig, NULL, NULL, buf->ptr, buf->size);
-
-	if (!error)
-		error = similarity_finalize_hashes(sig, generate_pairs);
-
-	if (!error)
-		*out = sig;
-	else
-		git_buf_text_hashsig_free(sig);
-
-	return error;
-}
-
-int git_buf_text_hashsig_create_fromfile(
-	git_buf_text_hashsig **out,
-	const char *path,
-	bool generate_pairs)
-{
-	char buf[4096];
-	ssize_t buflen = 0;
-	uint32_t hash = SIMILARITY_HASH_START;
-	size_t hashlen = 0;
-	int error = 0, fd;
-	git_buf_text_hashsig *sig = git__calloc(1, sizeof(git_buf_text_hashsig));
-	GITERR_CHECK_ALLOC(sig);
-
-	if ((fd = git_futils_open_ro(path)) < 0) {
-		git__free(sig);
-		return fd;
-	}
-
-	while (!error && (buflen = p_read(fd, buf, sizeof(buf))) > 0) {
-		if (sig->format == SIMILARITY_FORMAT_UNKNOWN)
-			similarity_guess_format(sig, buf, buflen);
-
-		error = similarity_add_hashes(sig, &hash, &hashlen, buf, buflen);
-	}
-
-	if (buflen < 0) {
-		giterr_set(GITERR_OS,
-			"Read error on '%s' while calculating similarity hashes", path);
-		error = (int)buflen;
-	}
-
-	p_close(fd);
-
-	if (!error && hashlen > 0)
-		error = similarity_record_hash(sig, hash);
-
-	if (!error)
-		error = similarity_finalize_hashes(sig, generate_pairs);
-
-	if (!error)
-		*out = sig;
-	else
-		git_buf_text_hashsig_free(sig);
-
-	return error;
-}
-
-void git_buf_text_hashsig_free(git_buf_text_hashsig *sig)
-{
-	if (!sig)
-		return;
-
-	if (sig->hashes) {
-		git__free(sig->hashes);
-		sig->hashes = NULL;
-	}
-
-	git__free(sig);
-}
-
-int git_buf_text_hashsig_compare(
-	const git_buf_text_hashsig *a,
-	const git_buf_text_hashsig *b,
-	int scale)
-{
-	size_t matches = 0, pairs = 0, total = 0, i, j;
-
-	if (a->format != b->format || !a->size || !b->size)
-		return 0;
-
-	if (scale <= 0)
-		scale = 100;
-
-	/* hash lists are sorted - just look for overlap vs total */
-
-	for (i = 0, j = 0; i < a->size && j < b->size; ) {
-		uint32_t av = a->hashes[i];
-		uint32_t bv = b->hashes[j];
-
-		if (av < bv)
-			++i;
-		else if (av > bv)
-			++j;
-		else {
-			++i; ++j;
-			++matches;
-		}
-	}
-
-	total = (a->size + b->size);
-
-	if (a->pairs && b->pairs) {
-		for (i = 0, j = 0; i < a->size - 1 && j < b->size - 1; ) {
-			uint32_t av = a->hashes[i + a->size];
-			uint32_t bv = b->hashes[j + b->size];
-
-			if (av < bv)
-				++i;
-			else if (av > bv)
-				++j;
-			else {
-				++i; ++j;
-				++pairs;
-			}
-		}
-
-		total += (a->size + b->size - 2);
-	}
-
-	return (int)(scale * 2 * (matches + pairs) / total);
-}
-
diff --git a/src/buf_text.h b/src/buf_text.h
index c48c010d3..458ee33c9 100644
--- a/src/buf_text.h
+++ b/src/buf_text.h
@@ -105,52 +105,4 @@ extern int git_buf_text_detect_bom(
 extern bool git_buf_text_gather_stats(
 	git_buf_text_stats *stats, const git_buf *buf, bool skip_bom);
 
-/**
- * Similarity signature of line hashes for a buffer
- */
-typedef struct git_buf_text_hashsig git_buf_text_hashsig;
-
-/**
- * Build a similarity signature for a buffer
- *
- * This can either generate a simple array of hashed lines/runs in the
- * file, or it can also keep hashes of pairs of runs in sequence.  Adding
- * the pairwise runs means the final score will be sensitive to line
- * ordering changes as well as individual line contents.
- *
- * @param out The array of hashed runs representing the file content
- * @param buf The contents of the file to hash
- * @param generate_pairwise_hashes Should pairwise runs be hashed
- */
-extern int git_buf_text_hashsig_create(
-	git_buf_text_hashsig **out,
-	const git_buf *buf,
-	bool generate_pairwise_hashes);
-
-/**
- * Build a similarity signature from a file
- *
- * This walks through the file, only loading a maximum of 4K of file data at
- * a time.  Otherwise, it acts just like `git_buf_text_hashsig_create`.
- */
-extern int git_buf_text_hashsig_create_fromfile(
-	git_buf_text_hashsig **out,
-	const char *path,
-	bool generate_pairwise_hashes);
-
-/**
- * Release memory for a content similarity signature
- */
-extern void git_buf_text_hashsig_free(git_buf_text_hashsig *sig);
-
-/**
- * Measure similarity between two files
- *
- * @return <0 for error, [0 to scale] as similarity score
- */
-extern int git_buf_text_hashsig_compare(
-	const git_buf_text_hashsig *a,
-	const git_buf_text_hashsig *b,
-	int scale);
-
 #endif
diff --git a/src/diff_tform.c b/src/diff_tform.c
index dbcdc277f..3939230b7 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -7,7 +7,7 @@
 #include "common.h"
 #include "diff.h"
 #include "git2/config.h"
-#include "buf_text.h"
+#include "hashsig.h"
 
 static git_diff_delta *diff_delta__dup(
 	const git_diff_delta *d, git_pool *pool)
@@ -300,7 +300,7 @@ on_error:
 
 typedef struct {
 	/* array of delta index * 2 + (old_file/new_file) -> file hashes */
-	git_buf_text_hashsig *sigs;
+	git_hashsig *sigs;
 } diff_similarity_cache;
 
 static unsigned int calc_similarity(
@@ -308,6 +308,8 @@ static unsigned int calc_similarity(
 {
 	diff_similarity_cache *cache = ref;
 
+	GIT_UNUSED(cache);
+
 	if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0)
 		return 100;
 
diff --git a/src/hashsig.c b/src/hashsig.c
new file mode 100644
index 000000000..a9cd8fa53
--- /dev/null
+++ b/src/hashsig.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#include "hashsig.h"
+#include "fileops.h"
+
+typedef uint32_t hashsig_t;
+typedef uint64_t hashsig_state;
+
+#define HASHSIG_SCALE 100
+
+#define HASHSIG_HASH_WINDOW 32
+#define HASHSIG_HASH_START	0
+#define HASHSIG_HASH_SHIFT  5
+#define HASHSIG_HASH_MASK   0x7FFFFFFF
+
+#define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
+
+typedef int (*hashsig_cmp)(const void *a, const void *b);
+
+typedef struct {
+	int size, asize;
+	hashsig_cmp cmp;
+	hashsig_t values[HASHSIG_HEAP_SIZE];
+} hashsig_heap;
+
+typedef struct {
+	hashsig_state state, shift_n;
+	char window[HASHSIG_HASH_WINDOW];
+	int win_len, win_pos, saw_lf;
+} hashsig_in_progress;
+
+#define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 }
+
+struct git_hashsig {
+	hashsig_heap mins;
+	hashsig_heap maxs;
+	git_hashsig_option_t opt;
+	int considered;
+};
+
+#define HEAP_LCHILD_OF(I) (((I)*2)+1)
+#define HEAP_RCHILD_OF(I) (((I)*2)+2)
+#define HEAP_PARENT_OF(I) (((I)-1)>>1)
+
+static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
+{
+	h->size  = 0;
+	h->asize = HASHSIG_HEAP_SIZE;
+	h->cmp   = cmp;
+}
+
+static int hashsig_cmp_max(const void *a, const void *b)
+{
+	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
+	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
+}
+
+static int hashsig_cmp_min(const void *a, const void *b)
+{
+	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
+	return (av > bv) ? -1 : (av < bv) ? 1 : 0;
+}
+
+static void hashsig_heap_up(hashsig_heap *h, int el)
+{
+	int parent_el = HEAP_PARENT_OF(el);
+
+	while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el]) > 0) {
+		hashsig_t t = h->values[el];
+		h->values[el] = h->values[parent_el];
+		h->values[parent_el] = t;
+
+		el = parent_el;
+		parent_el = HEAP_PARENT_OF(el);
+	}
+}
+
+static void hashsig_heap_down(hashsig_heap *h, int el)
+{
+	hashsig_t v, lv, rv;
+
+	/* 'el < h->size / 2' tests if el is bottom row of heap */
+
+	while (el < h->size / 2) {
+		int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
+
+		v  = h->values[el];
+		lv = h->values[lel];
+		rv = h->values[rel];
+
+		if (h->cmp(&v, &lv) < 0 && h->cmp(&v, &rv) < 0)
+			break;
+
+		swapel = (h->cmp(&lv, &rv) < 0) ? lel : rel;
+
+		h->values[el] = h->values[swapel];
+		h->values[swapel] = v;
+
+		el = swapel;
+	}
+}
+
+static void hashsig_heap_sort(hashsig_heap *h)
+{
+	/* only need to do this at the end for signature comparison */
+	qsort(h->values, h->size, sizeof(hashsig_t), h->cmp);
+}
+
+static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
+{
+	/* if heap is full, pop top if new element should replace it */
+	if (h->size == h->asize && h->cmp(&val, &h->values[0]) > 0) {
+		h->size--;
+		h->values[0] = h->values[h->size];
+		hashsig_heap_down(h, 0);
+	}
+
+	/* if heap is not full, insert new element */
+	if (h->size < h->asize) {
+		h->values[h->size++] = val;
+		hashsig_heap_up(h, h->size - 1);
+	}
+}
+
+GIT_INLINE(bool) hashsig_include_char(
+	char ch, git_hashsig_option_t opt, int *saw_lf)
+{
+	if ((opt & GIT_HASHSIG_IGNORE_WHITESPACE) && git__isspace(ch))
+		return false;
+
+	if (opt & GIT_HASHSIG_SMART_WHITESPACE) {
+		if (ch == '\r' || (*saw_lf && git__isspace(ch)))
+			return false;
+
+		*saw_lf = (ch == '\n');
+	}
+
+	return true;
+}
+
+static void hashsig_initial_window(
+	git_hashsig *sig,
+	const char **data,
+	size_t size,
+	hashsig_in_progress *prog)
+{
+	hashsig_state state, shift_n;
+	int win_len;
+	const char *scan, *end;
+
+	/* init until we have processed at least HASHSIG_HASH_WINDOW data */
+
+	if (prog->win_len >= HASHSIG_HASH_WINDOW)
+		return;
+
+	state   = prog->state;
+	win_len = prog->win_len;
+	shift_n = prog->shift_n;
+
+	scan = *data;
+	end  = scan + size;
+
+	while (scan < end && win_len < HASHSIG_HASH_WINDOW) {
+		char ch = *scan++;
+
+		if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf))
+			continue;
+
+		state = (state * HASHSIG_HASH_SHIFT + ch) & HASHSIG_HASH_MASK;
+
+		if (!win_len)
+			shift_n = 1;
+		else
+			shift_n = (shift_n * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK;
+
+		prog->window[win_len++] = ch;
+	}
+
+	/* insert initial hash if we just finished */
+
+	if (win_len == HASHSIG_HASH_WINDOW) {
+		hashsig_heap_insert(&sig->mins, state);
+		hashsig_heap_insert(&sig->maxs, state);
+		sig->considered = 1;
+	}
+
+	prog->state   = state;
+	prog->win_len = win_len;
+	prog->shift_n = shift_n;
+
+	*data = scan;
+}
+
+static int hashsig_add_hashes(
+	git_hashsig *sig,
+	const char *data,
+	size_t size,
+	hashsig_in_progress *prog)
+{
+	const char *scan = data, *end = data + size;
+	hashsig_state state, shift_n, rmv;
+
+	if (prog->win_len < HASHSIG_HASH_WINDOW)
+		hashsig_initial_window(sig, &scan, size, prog);
+
+	state   = prog->state;
+	shift_n = prog->shift_n;
+
+	/* advance window, adding new chars and removing old */
+
+	for (; scan < end; ++scan) {
+		char ch = *scan;
+
+		if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf))
+			continue;
+
+		rmv = shift_n * prog->window[prog->win_pos];
+
+		state = (state - rmv) & HASHSIG_HASH_MASK;
+		state = (state * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK;
+		state = (state + ch) & HASHSIG_HASH_MASK;
+
+		hashsig_heap_insert(&sig->mins, state);
+		hashsig_heap_insert(&sig->maxs, state);
+		sig->considered++;
+
+		prog->window[prog->win_pos] = ch;
+		prog->win_pos = (prog->win_pos + 1) % HASHSIG_HASH_WINDOW;
+	}
+
+	prog->state = state;
+
+	return 0;
+}
+
+static int hashsig_finalize_hashes(git_hashsig *sig)
+{
+	if (sig->mins.size < HASHSIG_HEAP_SIZE) {
+		giterr_set(GITERR_INVALID,
+			"File too small for similarity signature calculation");
+		return GIT_EBUFS;
+	}
+
+	hashsig_heap_sort(&sig->mins);
+	hashsig_heap_sort(&sig->maxs);
+
+	return 0;
+}
+
+static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
+{
+	git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
+	if (!sig)
+		return NULL;
+
+	hashsig_heap_init(&sig->mins, hashsig_cmp_min);
+	hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
+	sig->opt = opts;
+
+	return sig;
+}
+
+int git_hashsig_create(
+	git_hashsig **out,
+	const git_buf *buf,
+	git_hashsig_option_t opts)
+{
+	int error;
+	hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT;
+	git_hashsig *sig = hashsig_alloc(opts);
+	GITERR_CHECK_ALLOC(sig);
+
+	error = hashsig_add_hashes(sig, buf->ptr, buf->size, &prog);
+
+	if (!error)
+		error = hashsig_finalize_hashes(sig);
+
+	if (!error)
+		*out = sig;
+	else
+		git_hashsig_free(sig);
+
+	return error;
+}
+
+int git_hashsig_create_fromfile(
+	git_hashsig **out,
+	const char *path,
+	git_hashsig_option_t opts)
+{
+	char buf[4096];
+	ssize_t buflen = 0;
+	int error = 0, fd;
+	hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT;
+	git_hashsig *sig = hashsig_alloc(opts);
+	GITERR_CHECK_ALLOC(sig);
+
+	if ((fd = git_futils_open_ro(path)) < 0) {
+		git__free(sig);
+		return fd;
+	}
+
+	while (!error) {
+		if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
+			if ((error = buflen) < 0)
+				giterr_set(GITERR_OS,
+					"Read error on '%s' calculating similarity hashes", path);
+			break;
+		}
+
+		error = hashsig_add_hashes(sig, buf, buflen, &prog);
+	}
+
+	p_close(fd);
+
+	if (!error)
+		error = hashsig_finalize_hashes(sig);
+
+	if (!error)
+		*out = sig;
+	else
+		git_hashsig_free(sig);
+
+	return error;
+}
+
+void git_hashsig_free(git_hashsig *sig)
+{
+	git__free(sig);
+}
+
+static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
+{
+	int matches = 0, i, j, cmp;
+
+	assert(a->cmp == b->cmp);
+
+	/* hash heaps are sorted - just look for overlap vs total */
+
+	for (i = 0, j = 0; i < a->size && j < b->size; ) {
+		cmp = a->cmp(&a->values[i], &b->values[j]);
+
+		if (cmp < 0)
+			++i;
+		else if (cmp > 0)
+			++j;
+		else {
+			++i; ++j; ++matches;
+		}
+	}
+
+	return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
+}
+
+int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
+{
+	return (hashsig_heap_compare(&a->mins, &b->mins) +
+			hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
+}
+
diff --git a/src/hashsig.h b/src/hashsig.h
new file mode 100644
index 000000000..70b47f5f3
--- /dev/null
+++ b/src/hashsig.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#ifndef INCLUDE_hashsig_h__
+#define INCLUDE_hashsig_h__
+
+#include "buffer.h"
+
+/**
+ * Similarity signature of line hashes for a buffer
+ */
+typedef struct git_hashsig git_hashsig;
+
+typedef enum {
+	GIT_HASHSIG_NORMAL = 0, /* use all data */
+	GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */
+	GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */
+} git_hashsig_option_t;
+
+/**
+ * Build a similarity signature for a buffer
+ *
+ * If you have passed a whitespace-ignoring buffer, then the whitespace
+ * will be removed from the buffer while it is being processed, modifying
+ * the buffer in place.  Sorry about that!
+ *
+ * This will return an error if the buffer doesn't contain enough data to
+ * compute a valid signature.
+ *
+ * @param out The array of hashed runs representing the file content
+ * @param buf The contents of the file to hash
+ * @param generate_pairwise_hashes Should pairwise runs be hashed
+ */
+extern int git_hashsig_create(
+	git_hashsig **out,
+	const git_buf *buf,
+	git_hashsig_option_t opts);
+
+/**
+ * Build a similarity signature from a file
+ *
+ * This walks through the file, only loading a maximum of 4K of file data at
+ * a time.  Otherwise, it acts just like `git_hashsig_create`.
+ *
+ * This will return an error if the file doesn't contain enough data to
+ * compute a valid signature.
+ */
+extern int git_hashsig_create_fromfile(
+	git_hashsig **out,
+	const char *path,
+	git_hashsig_option_t opts);
+
+/**
+ * Release memory for a content similarity signature
+ */
+extern void git_hashsig_free(git_hashsig *sig);
+
+/**
+ * Measure similarity between two files
+ *
+ * @return <0 for error, [0 to 100] as similarity score
+ */
+extern int git_hashsig_compare(
+	const git_hashsig *a,
+	const git_hashsig *b);
+
+#endif
diff --git a/tests-clar/core/buffer.c b/tests-clar/core/buffer.c
index 63753bb67..6cad05c00 100644
--- a/tests-clar/core/buffer.c
+++ b/tests-clar/core/buffer.c
@@ -1,6 +1,7 @@
 #include "clar_libgit2.h"
 #include "buffer.h"
 #include "buf_text.h"
+#include "hashsig.h"
 #include "fileops.h"
 
 #define TESTSTR "Have you seen that? Have you seeeen that??"
@@ -732,89 +733,94 @@ void test_core_buffer__classify_with_utf8(void)
 	cl_assert(git_buf_text_contains_nul(&b));
 }
 
+#define SIMILARITY_TEST_DATA_1 \
+	"test data\nright here\ninline\ntada\nneeds more data\nlots of data\n" \
+	"is this enough?\nthere has to be enough data to fill the hash array!\n" \
+	"Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n" \
+	"Let's make sure we've got plenty to go with here.\n   smile   \n"
+
 void test_core_buffer__similarity_metric(void)
 {
-	git_buf_text_hashsig *a, *b;
+	git_hashsig *a, *b;
 	git_buf buf = GIT_BUF_INIT;
 	int sim;
 
 	/* in the first case, we compare data to itself and expect 100% match */
 
-	cl_git_pass(git_buf_sets(&buf, "test data\nright here\ninline\ntada"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
 
-	cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+	cl_assert_equal_i(100, git_hashsig_compare(a, b));
 
-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);
 
-	/* in the second case, half of a is matched and all of b is matched, so
-	 * we'll expect a score of around 66% to be the similarity score
-	 */
+	/* if we change just a single byte, how much does that change magnify? */
 
-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_buf_sets(&buf,
+		"Test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
+		"is this enough?\nthere has to be enough data to fill the hash array!\n"
+		"Apparently 191 bytes is the minimum amount of data needed.\nHere goes!\n"
+		 "Let's make sure we've got plenty to go with here.\n   smile   \n"));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
 
-	cl_git_pass(git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh"));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+	sim = git_hashsig_compare(a, b);
 
-	sim = git_buf_text_hashsig_compare(a, b, 100);
-	cl_assert(sim > 60 && sim < 70);
+	cl_assert(95 < sim && sim < 100); /* expect >95% similarity */
 
-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);
 
-	/* in the reversed case, 100% of line hashes match, but no pairwise hashes
-	 * match, so we'll expect about a 50% match for a reversed file
-	 */
+	/* let's try comparing data to a superset of itself */
 
-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
-	cl_git_pass(
-		git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1
+		"and if I add some more, it should still be pretty similar, yes?\n"));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
 
-	sim = git_buf_text_hashsig_compare(a, b, 100);
-	cl_assert(sim > 45 && sim < 55);
+	sim = git_hashsig_compare(a, b);
 
-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	cl_assert(70 < sim && sim < 80); /* expect in the 70-80% similarity range */
 
-	/* if we don't use pairwise signatures, then a reversed file should
-	 * match 100%
-	 */
+	git_hashsig_free(a);
+	git_hashsig_free(b);
+
+	/* what if we keep about half the original data and add half new */
+
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
+	cl_git_pass(git_buf_sets(&buf,
+		"test data\nright here\ninline\ntada\nneeds more data\nlots of data\n"
+		"is this enough?\nthere has to be enough data to fill the hash array!\n"
+		"okay, that's half the original\nwhat else can we add?\nmore data\n"
+		 "one more line will complete this\nshort\nlines\ndon't\nmatter\n"));
+	cl_git_pass(git_hashsig_create(&b, &buf, GIT_HASHSIG_NORMAL));
 
-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, false));
-	cl_git_pass(
-		git_buf_sets(&buf, "p\no\nn\nm\nl\nk\nj\ni\nh\ng\nf\ne\nd\nc\nb\na\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&b, &buf, false));
+	sim = git_hashsig_compare(a, b);
 
-	sim = git_buf_text_hashsig_compare(a, b, 100);
-	cl_assert_equal_i(100, sim);
+	cl_assert(40 < sim && sim < 60); /* expect in the 40-60% similarity range */
 
-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);
 
 	/* lastly, let's check that we can hash file content as well */
 
-	cl_git_pass(
-		git_buf_sets(&buf, "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"));
-	cl_git_pass(git_buf_text_hashsig_create(&a, &buf, true));
+	cl_git_pass(git_buf_sets(&buf, SIMILARITY_TEST_DATA_1));
+	cl_git_pass(git_hashsig_create(&a, &buf, GIT_HASHSIG_NORMAL));
 
 	cl_git_pass(git_futils_mkdir("scratch", NULL, 0755, GIT_MKDIR_PATH));
-	cl_git_mkfile("scratch/testdata",
-		"a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n");
-	cl_git_pass(git_buf_text_hashsig_create_fromfile(&b, "scratch/testdata", true));
+	cl_git_mkfile("scratch/testdata", SIMILARITY_TEST_DATA_1);
+	cl_git_pass(git_hashsig_create_fromfile(
+		&b, "scratch/testdata", GIT_HASHSIG_NORMAL));
 
-	cl_assert_equal_i(100, git_buf_text_hashsig_compare(a, b, 100));
+	cl_assert_equal_i(100, git_hashsig_compare(a, b));
 
-	git_buf_text_hashsig_free(a);
-	git_buf_text_hashsig_free(b);
+	git_hashsig_free(a);
+	git_hashsig_free(b);
 
 	git_buf_free(&buf);
 	git_futils_rmdir_r("scratch", NULL, GIT_RMDIR_REMOVE_FILES);
author	Russell Belfer <rb@github.com>	2013-02-14 17:25:10 -0800
committer	Russell Belfer <rb@github.com>	2013-02-20 15:09:40 -0800
commit	5e5848eb15cc0dd8476d1c6882a9f770e6556586 (patch)
tree	953fd30d6360b67c2174b6c03fd2984561c84cf6
parent	99ba8f2322eaa2df51ace9782b8eadc8c5a6e8b8 (diff)
download	libgit2-5e5848eb15cc0dd8476d1c6882a9f770e6556586.tar.gz