diff options
| author | Russell Belfer <rb@github.com> | 2013-02-14 17:25:10 -0800 |
|---|---|---|
| committer | Russell Belfer <rb@github.com> | 2013-02-20 15:09:40 -0800 |
| commit | 5e5848eb15cc0dd8476d1c6882a9f770e6556586 (patch) | |
| tree | 953fd30d6360b67c2174b6c03fd2984561c84cf6 /src/hashsig.h | |
| parent | 99ba8f2322eaa2df51ace9782b8eadc8c5a6e8b8 (diff) | |
| download | libgit2-5e5848eb15cc0dd8476d1c6882a9f770e6556586.tar.gz | |
Change similarity metric to sampled hashes
This moves the similarity metric code out of buf_text and into a
new file. Also, this implements a different approach to similarity
measurement based on a Rabin-Karp rolling hash where we only keep
the top 100 and bottom 100 hashes. In theory, that should be
sufficient samples to give a fairly accurate measurement while
limiting the amount of data we keep for file signatures no matter
how large the file is.
Diffstat (limited to 'src/hashsig.h')
| -rw-r--r-- | src/hashsig.h | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/src/hashsig.h b/src/hashsig.h
new file mode 100644
index 000000000..70b47f5f3
--- /dev/null
+++ b/src/hashsig.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+#ifndef INCLUDE_hashsig_h__
+#define INCLUDE_hashsig_h__
+
+#include "buffer.h"
+
+/**
+ * Opaque similarity signature computed from buffer or file content
+ */
+typedef struct git_hashsig git_hashsig;
+
+typedef enum {
+	GIT_HASHSIG_NORMAL = 0, /* use all data */
+	GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */
+	GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */
+} git_hashsig_option_t;
+
+/**
+ * Build a similarity signature for a buffer
+ *
+ * If a whitespace-ignoring option is passed, whitespace is reportedly
+ * removed from the buffer in place while it is processed. NOTE(review):
+ * `buf` is declared const here -- confirm against the implementation.
+ *
+ * This will return an error if the buffer doesn't contain enough data to
+ * compute a valid signature.
+ *
+ * @param out Receives the new signature; release with `git_hashsig_free`
+ * @param buf The contents of the file to hash
+ * @param opts Whitespace handling to use (a `git_hashsig_option_t` value)
+ */
+extern int git_hashsig_create(
+	git_hashsig **out,
+	const git_buf *buf,
+	git_hashsig_option_t opts);
+
+/**
+ * Build a similarity signature from a file
+ *
+ * This walks through the file, only loading a maximum of 4K of file data at
+ * a time. Otherwise, it acts just like `git_hashsig_create`.
+ *
+ * This will return an error if the file doesn't contain enough data to
+ * compute a valid signature.
+ */
+extern int git_hashsig_create_fromfile(
+	git_hashsig **out,
+	const char *path,
+	git_hashsig_option_t opts);
+
+/**
+ * Release memory for a content similarity signature
+ */
+extern void git_hashsig_free(git_hashsig *sig);
+
+/**
+ * Measure similarity between the content behind two signatures
+ *
+ * @return <0 for error, [0 to 100] as similarity score
+ */
+extern int git_hashsig_compare(
+	const git_hashsig *a,
+	const git_hashsig *b);
+
+#endif |
