diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/array.h | 11 | ||||
-rw-r--r-- | src/blob.c | 89 | ||||
-rw-r--r-- | src/blob.h | 9 | ||||
-rw-r--r-- | src/diff.h | 29 | ||||
-rw-r--r-- | src/diff_file.c | 17 | ||||
-rw-r--r-- | src/diff_patch.c | 30 | ||||
-rw-r--r-- | src/diff_tform.c | 352 | ||||
-rw-r--r-- | src/hashsig.c | 214 | ||||
-rw-r--r-- | src/index.c | 34 | ||||
-rw-r--r-- | src/util.h | 5 |
10 files changed, 458 insertions, 332 deletions
diff --git a/src/array.h b/src/array.h index 248010425..c25a1b29e 100644 --- a/src/array.h +++ b/src/array.h @@ -39,25 +39,26 @@ #define GITERR_CHECK_ARRAY(a) GITERR_CHECK_ALLOC((a).ptr) -typedef git_array_t(void) git_array_generic_t; +typedef git_array_t(char) git_array_generic_t; /* use a generic array for growth so this can return the new item */ -GIT_INLINE(void *) git_array_grow(git_array_generic_t *a, size_t item_size) +GIT_INLINE(void *) git_array_grow(void *_a, size_t item_size) { + git_array_generic_t *a = _a; uint32_t new_size = (a->size < 8) ? 8 : a->asize * 3 / 2; - void *new_array = git__realloc(a->ptr, new_size * item_size); + char *new_array = git__realloc(a->ptr, new_size * item_size); if (!new_array) { git_array_clear(*a); return NULL; } else { a->ptr = new_array; a->asize = new_size; a->size++; - return (((char *)a->ptr) + (a->size - 1) * item_size); + return a->ptr + (a->size - 1) * item_size; } } #define git_array_alloc(a) \ ((a).size >= (a).asize) ? \ - git_array_grow((git_array_generic_t *)&(a), sizeof(*(a).ptr)) : \ + git_array_grow(&(a), sizeof(*(a).ptr)) : \ (a).ptr ? &(a).ptr[(a).size++] : NULL #define git_array_last(a) ((a).size ? &(a).ptr[(a).size - 1] : NULL) diff --git a/src/blob.c b/src/blob.c index 2e4d5f479..5bb51f7cf 100644 --- a/src/blob.c +++ b/src/blob.c @@ -105,6 +105,7 @@ static int write_file_stream( static int write_file_filtered( git_oid *oid, + git_off_t *size, git_odb *odb, const char *full_path, git_vector *filters) @@ -123,8 +124,11 @@ static int write_file_filtered( git_buf_free(&source); /* Write the file to disk if it was properly filtered */ - if (!error) + if (!error) { + *size = dest.size; + error = git_odb_write(oid, odb, dest.ptr, dest.size, GIT_OBJ_BLOB); + } git_buf_free(&dest); return error; @@ -152,21 +156,46 @@ static int write_symlink( return error; } -static int blob_create_internal(git_oid *oid, git_repository *repo, const char *content_path, const char *hint_path, bool try_load_filters) +int git_blob__create_from_paths( + git_oid *oid, + struct stat *out_st, + git_repository *repo, + const char *content_path, + const char *hint_path, + mode_t hint_mode, + bool try_load_filters) { int error; struct stat st; git_odb *odb = NULL; git_off_t size; + mode_t mode; + git_buf path = GIT_BUF_INIT; assert(hint_path || !try_load_filters); - if ((error = git_path_lstat(content_path, &st)) < 0 || (error = git_repository_odb__weakptr(&odb, repo)) < 0) - return error; + if (!content_path) { + if (git_repository__ensure_not_bare(repo, "create blob from file") < 0) + return GIT_EBAREREPO; + + if (git_buf_joinpath( + &path, git_repository_workdir(repo), hint_path) < 0) + return -1; + + content_path = path.ptr; + } + + if ((error = git_path_lstat(content_path, &st)) < 0 || + (error = git_repository_odb(&odb, repo)) < 0) + goto done; + + if (out_st) + memcpy(out_st, &st, sizeof(st)); size = st.st_size; + mode = hint_mode ? hint_mode : st.st_mode; - if (S_ISLNK(st.st_mode)) { + if (S_ISLNK(mode)) { error = write_symlink(oid, odb, content_path, (size_t)size); } else { git_vector write_filters = GIT_VECTOR_INIT; @@ -187,7 +216,8 @@ static int blob_create_internal(git_oid *oid, git_repository *repo, const char * error = write_file_stream(oid, odb, content_path, size); } else { /* We need to apply one or more filters */ - error = write_file_filtered(oid, odb, content_path, &write_filters); + error = write_file_filtered( + oid, &size, odb, content_path, &write_filters); } git_filters_free(&write_filters); @@ -207,34 +237,21 @@ static int blob_create_internal(git_oid *oid, git_repository *repo, const char * */ } +done: + git_odb_free(odb); + git_buf_free(&path); + return error; } -int git_blob_create_fromworkdir(git_oid *oid, git_repository *repo, const char *path) +int git_blob_create_fromworkdir( + git_oid *oid, git_repository *repo, const char *path) { - git_buf full_path = GIT_BUF_INIT; - const char *workdir; - int error; - - if ((error = git_repository__ensure_not_bare(repo, "create blob from file")) < 0) - return error; - - workdir = git_repository_workdir(repo); - - if (git_buf_joinpath(&full_path, workdir, path) < 0) { - git_buf_free(&full_path); - return -1; - } - - error = blob_create_internal( - oid, repo, git_buf_cstr(&full_path), - git_buf_cstr(&full_path) + strlen(workdir), true); - - git_buf_free(&full_path); - return error; + return git_blob__create_from_paths(oid, NULL, repo, NULL, path, 0, true); } -int git_blob_create_fromdisk(git_oid *oid, git_repository *repo, const char *path) +int git_blob_create_fromdisk( + git_oid *oid, git_repository *repo, const char *path) { int error; git_buf full_path = GIT_BUF_INIT; @@ -251,8 +268,8 @@ int git_blob_create_fromdisk(git_oid *oid, git_repository *repo, const char *pat if (workdir && !git__prefixcmp(hintpath, workdir)) hintpath += strlen(workdir); - error = blob_create_internal( - oid, repo, git_buf_cstr(&full_path), hintpath, true); + error = git_blob__create_from_paths( + oid, NULL, repo, git_buf_cstr(&full_path), hintpath, 0, true); git_buf_free(&full_path); return error; @@ -272,12 +289,9 @@ int git_blob_create_fromchunks( git_filebuf file = GIT_FILEBUF_INIT; git_buf path = GIT_BUF_INIT; - if (git_buf_join_n( - &path, '/', 3, - git_repository_path(repo), - GIT_OBJECTS_DIR, - "streamed") < 0) - goto cleanup; + if (git_buf_joinpath( + &path, git_repository_path(repo), GIT_OBJECTS_DIR "streamed") < 0) + goto cleanup; content = git__malloc(BUFFER_SIZE); GITERR_CHECK_ALLOC(content); @@ -303,7 +317,8 @@ int git_blob_create_fromchunks( if (git_filebuf_flush(&file) < 0) goto cleanup; - error = blob_create_internal(oid, repo, file.path_lock, hintpath, hintpath != NULL); + error = git_blob__create_from_paths( + oid, NULL, repo, file.path_lock, hintpath, 0, hintpath != NULL); cleanup: git_buf_free(&path); diff --git a/src/blob.h b/src/blob.h index 22e37cc3a..4cd9f1e0c 100644 --- a/src/blob.h +++ b/src/blob.h @@ -21,4 +21,13 @@ void git_blob__free(void *blob); int git_blob__parse(void *blob, git_odb_object *obj); int git_blob__getbuf(git_buf *buffer, git_blob *blob); +extern int git_blob__create_from_paths( + git_oid *out_oid, + struct stat *out_st, + git_repository *repo, + const char *full_path, + const char *hint_path, + mode_t hint_mode, + bool apply_filters); + #endif diff --git a/src/diff.h b/src/diff.h index d09a130bc..d1bec00c6 100644 --- a/src/diff.h +++ b/src/diff.h @@ -16,6 +16,7 @@ #include "iterator.h" #include "repository.h" #include "pool.h" +#include "odb.h" #define DIFF_OLD_PREFIX_DEFAULT "a/" #define DIFF_NEW_PREFIX_DEFAULT "b/" @@ -108,5 +109,33 @@ extern void git_diff_find_similar__hashsig_free(void *sig, void *payload); extern int git_diff_find_similar__calc_similarity( int *score, void *siga, void *sigb, void *payload); +/* + * Sometimes a git_diff_file will have a zero size; this attempts to + * fill in the size without loading the blob if possible. If that is + * not possible, then it will return the git_odb_object that had to be + * loaded and the caller can use it or dispose of it as needed. + */ +GIT_INLINE(int) git_diff_file__resolve_zero_size( + git_diff_file *file, git_odb_object **odb_obj, git_repository *repo) +{ + int error; + git_odb *odb; + size_t len; + git_otype type; + + if ((error = git_repository_odb(&odb, repo)) < 0) + return error; + + error = git_odb__read_header_or_object( + odb_obj, &len, &type, odb, &file->oid); + + git_odb_free(odb); + + if (!error) + file->size = (git_off_t)len; + + return error; +} + #endif diff --git a/src/diff_file.c b/src/diff_file.c index 9d06daafa..bcfef13cd 100644 --- a/src/diff_file.c +++ b/src/diff_file.c @@ -241,19 +241,9 @@ static int diff_file_content_load_blob(git_diff_file_content *fc) /* if we don't know size, try to peek at object header first */ if (!fc->file->size) { - git_odb *odb; - size_t len; - git_otype type; - - if (!(error = git_repository_odb__weakptr(&odb, fc->repo))) { - error = git_odb__read_header_or_object( - &odb_obj, &len, &type, odb, &fc->file->oid); - git_odb_free(odb); - } - if (error) + if ((error = git_diff_file__resolve_zero_size( + fc->file, &odb_obj, fc->repo)) < 0) return error; - - fc->file->size = len; } if (diff_file_content_binary_by_size(fc)) @@ -417,6 +407,9 @@ int git_diff_file_content__load(git_diff_file_content *fc) void git_diff_file_content__unload(git_diff_file_content *fc) { + if ((fc->flags & GIT_DIFF_FLAG__LOADED) == 0) + return; + if (fc->flags & GIT_DIFF_FLAG__FREE_DATA) { git__free(fc->map.data); fc->map.data = ""; diff --git a/src/diff_patch.c b/src/diff_patch.c index 1b4adac03..69bb08198 100644 --- a/src/diff_patch.c +++ b/src/diff_patch.c @@ -230,6 +230,10 @@ static int diff_patch_generate(git_diff_patch *patch, git_diff_output *output) if ((patch->flags & GIT_DIFF_PATCH_DIFFED) != 0) return 0; + /* if we are not looking at the hunks and lines, don't do the diff */ + if (!output->hunk_cb && !output->data_cb) + return 0; + if ((patch->flags & GIT_DIFF_PATCH_LOADED) == 0 && (error = diff_patch_load(patch, output)) < 0) return error; @@ -284,8 +288,8 @@ int git_diff_foreach( if (diff_required(diff, "git_diff_foreach") < 0) return -1; - diff_output_init((git_diff_output *)&xo, - &diff->opts, file_cb, hunk_cb, data_cb, payload); + diff_output_init( + &xo.output, &diff->opts, file_cb, hunk_cb, data_cb, payload); git_xdiff_init(&xo, &diff->opts); git_vector_foreach(&diff->deltas, idx, patch.delta) { @@ -296,10 +300,10 @@ int git_diff_foreach( if (!(error = diff_patch_init_from_diff(&patch, diff, idx))) { - error = diff_patch_file_callback(&patch, (git_diff_output *)&xo); + error = diff_patch_file_callback(&patch, &xo.output); if (!error) - error = diff_patch_generate(&patch, (git_diff_output *)&xo); + error = diff_patch_generate(&patch, &xo.output); git_diff_patch_free(&patch); } @@ -437,7 +441,7 @@ int git_diff_blobs( memset(&xo, 0, sizeof(xo)); diff_output_init( - (git_diff_output *)&xo, opts, file_cb, hunk_cb, data_cb, payload); + &xo.output, opts, file_cb, hunk_cb, data_cb, payload); git_xdiff_init(&xo, opts); if (!old_path && new_path) @@ -448,7 +452,7 @@ int git_diff_blobs( error = diff_patch_from_blobs( &pd, &xo, old_blob, old_path, new_blob, new_path, opts); - git_diff_patch_free((git_diff_patch *)&pd); + git_diff_patch_free(&pd.patch); return error; } @@ -473,7 +477,7 @@ int git_diff_patch_from_blobs( memset(&xo, 0, sizeof(xo)); - diff_output_to_patch((git_diff_output *)&xo, &pd->patch); + diff_output_to_patch(&xo.output, &pd->patch); git_xdiff_init(&xo, opts); error = diff_patch_from_blobs( @@ -549,7 +553,7 @@ int git_diff_blob_to_buffer( memset(&xo, 0, sizeof(xo)); diff_output_init( - (git_diff_output *)&xo, opts, file_cb, hunk_cb, data_cb, payload); + &xo.output, opts, file_cb, hunk_cb, data_cb, payload); git_xdiff_init(&xo, opts); if (!old_path && buf_path) @@ -560,7 +564,7 @@ int git_diff_blob_to_buffer( error = diff_patch_from_blob_and_buffer( &pd, &xo, old_blob, old_path, buf, buflen, buf_path, opts); - git_diff_patch_free((git_diff_patch *)&pd); + git_diff_patch_free(&pd.patch); return error; } @@ -586,7 +590,7 @@ int git_diff_patch_from_blob_and_buffer( memset(&xo, 0, sizeof(xo)); - diff_output_to_patch((git_diff_output *)&xo, &pd->patch); + diff_output_to_patch(&xo.output, &pd->patch); git_xdiff_init(&xo, opts); error = diff_patch_from_blob_and_buffer( @@ -638,13 +642,13 @@ int git_diff_get_patch( if ((error = diff_patch_alloc_from_diff(&patch, diff, idx)) < 0) return error; - diff_output_to_patch((git_diff_output *)&xo, patch); + diff_output_to_patch(&xo.output, patch); git_xdiff_init(&xo, &diff->opts); - error = diff_patch_file_callback(patch, (git_diff_output *)&xo); + error = diff_patch_file_callback(patch, &xo.output); if (!error) - error = diff_patch_generate(patch, (git_diff_output *)&xo); + error = diff_patch_generate(patch, &xo.output); if (!error) { /* if cumulative diff size is < 0.5 total size, flatten the patch */ diff --git a/src/diff_tform.c b/src/diff_tform.c index ac5356a8c..ba35d3c14 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -408,57 +408,99 @@ GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx) return (idx & 1) ? &delta->new_file : &delta->old_file; } -static int similarity_calc( - git_diff_list *diff, +typedef struct { + size_t idx; + git_iterator_type_t src; + git_repository *repo; + git_diff_file *file; + git_buf data; + git_odb_object *odb_obj; + git_blob *blob; +} similarity_info; + +static int similarity_init( + similarity_info *info, git_diff_list *diff, size_t file_idx) +{ + info->idx = file_idx; + info->src = (file_idx & 1) ? diff->new_src : diff->old_src; + info->repo = diff->repo; + info->file = similarity_get_file(diff, file_idx); + info->odb_obj = NULL; + info->blob = NULL; + git_buf_init(&info->data, 0); + + if (info->file->size > 0) + return 0; + + return git_diff_file__resolve_zero_size( + info->file, &info->odb_obj, info->repo); +} + +static int similarity_sig( + similarity_info *info, const git_diff_find_options *opts, - size_t file_idx, void **cache) { int error = 0; - git_diff_file *file = similarity_get_file(diff, file_idx); - git_iterator_type_t src = (file_idx & 1) ? diff->new_src : diff->old_src; - - if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */ - git_buf path = GIT_BUF_INIT; - - /* TODO: apply wd-to-odb filters to file data if necessary */ + git_diff_file *file = info->file; + if (info->src == GIT_ITERATOR_TYPE_WORKDIR) { if ((error = git_buf_joinpath( - &path, git_repository_workdir(diff->repo), file->path)) < 0) + &info->data, git_repository_workdir(info->repo), file->path)) < 0) return error; /* if path is not a regular file, just skip this item */ - if (git_path_isfile(path.ptr)) - error = opts->metric->file_signature( - &cache[file_idx], file, path.ptr, opts->metric->payload); + if (!git_path_isfile(info->data.ptr)) + return 0; - git_buf_free(&path); - } else { /* compute hashsig from blob buffer */ - git_blob *blob = NULL; - git_off_t blobsize; + /* TODO: apply wd-to-odb filters to file data if necessary */ - /* TODO: add max size threshold a la diff? */ + error = opts->metric->file_signature( + &cache[info->idx], info->file, + info->data.ptr, opts->metric->payload); + } else { + /* if we didn't initially know the size, we might have an odb_obj + * around from earlier, so convert that, otherwise load the blob now + */ + if (info->odb_obj != NULL) + error = git_object__from_odb_object( + (git_object **)&info->blob, info->repo, + info->odb_obj, GIT_OBJ_BLOB); + else + error = git_blob_lookup(&info->blob, info->repo, &file->oid); - if (git_blob_lookup(&blob, diff->repo, &file->oid) < 0) { + if (error < 0) { /* if lookup fails, just skip this item in similarity calc */ giterr_clear(); - return 0; - } + } else { + size_t sz; - blobsize = git_blob_rawsize(blob); - if (!git__is_sizet(blobsize)) /* ? what to do ? */ - blobsize = (size_t)-1; + /* index size may not be actual blob size if filtered */ + if (file->size != git_blob_rawsize(info->blob)) + file->size = git_blob_rawsize(info->blob); - error = opts->metric->buffer_signature( - &cache[file_idx], file, git_blob_rawcontent(blob), - (size_t)blobsize, opts->metric->payload); + sz = (size_t)(git__is_sizet(file->size) ? file->size : -1); - git_blob_free(blob); + error = opts->metric->buffer_signature( + &cache[info->idx], info->file, + git_blob_rawcontent(info->blob), sz, opts->metric->payload); + } } return error; } +static void similarity_unload(similarity_info *info) +{ + if (info->odb_obj) + git_odb_object_free(info->odb_obj); + + if (info->blob) + git_blob_free(info->blob); + else + git_buf_free(&info->data); +} + #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0) /* - score < 0 means files cannot be compared @@ -476,6 +518,8 @@ static int similarity_measure( git_diff_file *a_file = similarity_get_file(diff, a_idx); git_diff_file *b_file = similarity_get_file(diff, b_idx); bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY); + int error = 0; + similarity_info a_info, b_info; *score = -1; @@ -510,19 +554,44 @@ static int similarity_measure( return 0; } + memset(&a_info, 0, sizeof(a_info)); + memset(&b_info, 0, sizeof(b_info)); + + /* set up similarity data (will try to update missing file sizes) */ + if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0) + return error; + if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0) + goto cleanup; + + /* check if file sizes are nowhere near each other */ + if (a_file->size > 127 && + b_file->size > 127 && + (a_file->size > (b_file->size << 3) || + b_file->size > (a_file->size << 3))) + goto cleanup; + /* update signature cache if needed */ - if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0) - return -1; - if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0) - return -1; + if (!cache[a_idx]) { + if ((error = similarity_sig(&a_info, opts, cache)) < 0) + goto cleanup; + } + if (!cache[b_idx]) { + if ((error = similarity_sig(&b_info, opts, cache)) < 0) + goto cleanup; + } - /* some metrics may not wish to process this file (too big / too small) */ - if (!cache[a_idx] || !cache[b_idx]) - return 0; + /* calculate similarity provided that the metric choose to process + * both the a and b files (some may not if file is too big, etc). + */ + if (cache[a_idx] && cache[b_idx]) + error = opts->metric->similarity( + score, cache[a_idx], cache[b_idx], opts->metric->payload); + +cleanup: + similarity_unload(&a_info); + similarity_unload(&b_info); - /* compare signatures */ - return opts->metric->similarity( - score, cache[a_idx], cache[b_idx], opts->metric->payload); + return error; } static int calc_self_similarity( @@ -693,25 +762,30 @@ int git_diff_find_similar( git_diff_list *diff, git_diff_find_options *given_opts) { - size_t i, j, sigcache_size; + size_t s, t; int error = 0, similarity; - git_diff_delta *from, *to; + git_diff_delta *src, *tgt; git_diff_find_options opts; - size_t num_srcs = 0, num_tgts = 0, tried_srcs = 0, tried_tgts = 0; + size_t num_deltas, num_srcs = 0, num_tgts = 0; + size_t tried_srcs = 0, tried_tgts = 0; size_t num_rewrites = 0, num_updates = 0, num_bumped = 0; void **sigcache; /* cache of similarity metric file signatures */ - diff_find_match *match_srcs = NULL, *match_tgts = NULL, *best_match; + diff_find_match *tgt2src = NULL; + diff_find_match *src2tgt = NULL; + diff_find_match *tgt2src_copy = NULL; + diff_find_match *best_match; git_diff_file swap; if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) return error; + num_deltas = diff->deltas.length; + /* TODO: maybe abort if deltas.length > rename_limit ??? */ - if (!git__is_uint32(diff->deltas.length)) + if (!git__is_uint32(num_deltas)) return 0; - sigcache_size = diff->deltas.length * 2; /* keep size b/c diff may change */ - sigcache = git__calloc(sigcache_size, sizeof(void *)); + sigcache = git__calloc(num_deltas * 2, sizeof(void *)); GITERR_CHECK_ALLOC(sigcache); /* Label rename sources and targets @@ -719,11 +793,11 @@ int git_diff_find_similar( * This will also set self-similarity scores for MODIFIED files and * mark them for splitting if break-rewrites is enabled */ - git_vector_foreach(&diff->deltas, i, to) { - if (is_rename_source(diff, &opts, i, sigcache)) + git_vector_foreach(&diff->deltas, t, tgt) { + if (is_rename_source(diff, &opts, t, sigcache)) ++num_srcs; - if (is_rename_target(diff, &opts, i, sigcache)) + if (is_rename_target(diff, &opts, t, sigcache)) ++num_tgts; } @@ -731,10 +805,15 @@ int git_diff_find_similar( if (!num_srcs || !num_tgts) goto cleanup; - match_tgts = git__calloc(diff->deltas.length, sizeof(diff_find_match)); - GITERR_CHECK_ALLOC(match_tgts); - match_srcs = git__calloc(diff->deltas.length, sizeof(diff_find_match)); - GITERR_CHECK_ALLOC(match_srcs); + src2tgt = git__calloc(num_deltas, sizeof(diff_find_match)); + GITERR_CHECK_ALLOC(src2tgt); + tgt2src = git__calloc(num_deltas, sizeof(diff_find_match)); + GITERR_CHECK_ALLOC(tgt2src); + + if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) { + tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match)); + GITERR_CHECK_ALLOC(tgt2src_copy); + } /* * Find best-fit matches for rename / copy candidates @@ -743,47 +822,61 @@ int git_diff_find_similar( find_best_matches: tried_tgts = num_bumped = 0; - git_vector_foreach(&diff->deltas, i, to) { + git_vector_foreach(&diff->deltas, t, tgt) { /* skip things that are not rename targets */ - if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) + if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) continue; tried_srcs = 0; - git_vector_foreach(&diff->deltas, j, from) { + git_vector_foreach(&diff->deltas, s, src) { /* skip things that are not rename sources */ - if ((from->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0) + if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0) continue; /* calculate similarity for this pair and find best match */ - if (i == j) + if (s == t) similarity = -1; /* don't measure self-similarity here */ else if ((error = similarity_measure( - &similarity, diff, &opts, sigcache, 2 * j, 2 * i + 1)) < 0) + &similarity, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0) goto cleanup; - /* if this pairing is better for the src and the tgt, keep it */ - if (similarity > 0 && - match_tgts[i].similarity < (uint32_t)similarity && - match_srcs[j].similarity < (uint32_t)similarity) + if (similarity < 0) + continue; + + /* is this a better rename? */ + if (tgt2src[t].similarity < (uint32_t)similarity && + src2tgt[s].similarity < (uint32_t)similarity) { - if (match_tgts[i].similarity > 0) { - match_tgts[match_srcs[j].idx].similarity = 0; - match_srcs[match_tgts[i].idx].similarity = 0; - ++num_bumped; + /* eject old mapping */ + if (src2tgt[s].similarity > 0) { + tgt2src[src2tgt[s].idx].similarity = 0; + num_bumped++; + } + if (tgt2src[t].similarity > 0) { + src2tgt[tgt2src[t].idx].similarity = 0; + num_bumped++; } - match_tgts[i].similarity = (uint32_t)similarity; - match_tgts[i].idx = (uint32_t)j; + /* write new mapping */ + tgt2src[t].idx = s; + tgt2src[t].similarity = (uint32_t)similarity; + src2tgt[s].idx = t; + src2tgt[s].similarity = (uint32_t)similarity; + } - match_srcs[j].similarity = (uint32_t)similarity; - match_srcs[j].idx = (uint32_t)i; + /* keep best absolute match for copies */ + if (tgt2src_copy != NULL && + tgt2src_copy[t].similarity < (uint32_t)similarity) + { + tgt2src_copy[t].idx = s; + tgt2src_copy[t].similarity = (uint32_t)similarity; } if (++tried_srcs >= num_srcs) break; - /* cap on maximum targets we'll examine (per "to" file) */ + /* cap on maximum targets we'll examine (per "tgt" file) */ if (tried_srcs > opts.rename_limit) break; } @@ -801,18 +894,21 @@ find_best_matches: tried_tgts = 0; - git_vector_foreach(&diff->deltas, i, to) { + git_vector_foreach(&diff->deltas, t, tgt) { /* skip things that are not rename targets */ - if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) + if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) continue; /* check if this delta was the target of a similarity */ - best_match = &match_tgts[i]; - if (!best_match->similarity) + if (tgt2src[t].similarity) + best_match = &tgt2src[t]; + else if (tgt2src_copy && tgt2src_copy[t].similarity) + best_match = &tgt2src_copy[t]; + else continue; - j = best_match->idx; - from = GIT_VECTOR_GET(&diff->deltas, j); + s = best_match->idx; + src = GIT_VECTOR_GET(&diff->deltas, s); /* possible scenarios: * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME @@ -822,101 +918,112 @@ find_best_matches: * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY */ - if (from->status == GIT_DELTA_DELETED) { + if (src->status == GIT_DELTA_DELETED) { - if (delta_is_new_only(to)) { + if (delta_is_new_only(tgt)) { if (best_match->similarity < opts.rename_threshold) continue; - delta_make_rename(to, from, best_match->similarity); + delta_make_rename(tgt, src, best_match->similarity); - from->flags |= GIT_DIFF_FLAG__TO_DELETE; + src->flags |= GIT_DIFF_FLAG__TO_DELETE; num_rewrites++; } else { - assert(delta_is_split(to)); + assert(delta_is_split(tgt)); if (best_match->similarity < opts.rename_from_rewrite_threshold) continue; - memcpy(&swap, &to->old_file, sizeof(swap)); + memcpy(&swap, &tgt->old_file, sizeof(swap)); - delta_make_rename(to, from, best_match->similarity); + delta_make_rename(tgt, src, best_match->similarity); num_rewrites--; - from->status = GIT_DELTA_DELETED; - memcpy(&from->old_file, &swap, sizeof(from->old_file)); - memset(&from->new_file, 0, sizeof(from->new_file)); - from->new_file.path = from->old_file.path; - from->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; + src->status = GIT_DELTA_DELETED; + memcpy(&src->old_file, &swap, sizeof(src->old_file)); + memset(&src->new_file, 0, sizeof(src->new_file)); + src->new_file.path = src->old_file.path; + src->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; num_updates++; + + if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) { + /* what used to be at src t is now at src s */ + tgt2src[src2tgt[t].idx].idx = (uint32_t)s; + } } } - else if (delta_is_split(from)) { + else if (delta_is_split(src)) { - if (delta_is_new_only(to)) { + if (delta_is_new_only(tgt)) { if (best_match->similarity < opts.rename_threshold) continue; - delta_make_rename(to, from, best_match->similarity); + delta_make_rename(tgt, src, best_match->similarity); - from->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ? + src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ? GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED; - memset(&from->old_file, 0, sizeof(from->old_file)); - from->old_file.path = from->new_file.path; - from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; + memset(&src->old_file, 0, sizeof(src->old_file)); + src->old_file.path = src->new_file.path; + src->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; - from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; + src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; num_rewrites--; num_updates++; } else { - assert(delta_is_split(from)); + assert(delta_is_split(src)); if (best_match->similarity < opts.rename_from_rewrite_threshold) continue; - memcpy(&swap, &to->old_file, sizeof(swap)); + memcpy(&swap, &tgt->old_file, sizeof(swap)); - delta_make_rename(to, from, best_match->similarity); + delta_make_rename(tgt, src, best_match->similarity); num_rewrites--; num_updates++; - memcpy(&from->old_file, &swap, sizeof(from->old_file)); + memcpy(&src->old_file, &swap, sizeof(src->old_file)); /* if we've just swapped the new element into the correct * place, clear the SPLIT flag */ - if (match_tgts[j].idx == i && - match_tgts[j].similarity > + if (tgt2src[s].idx == t && + tgt2src[s].similarity > opts.rename_from_rewrite_threshold) { - - from->status = GIT_DELTA_RENAMED; - from->similarity = match_tgts[j].similarity; - match_tgts[j].similarity = 0; - from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; + src->status = GIT_DELTA_RENAMED; + src->similarity = tgt2src[s].similarity; + tgt2src[s].similarity = 0; + src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; num_rewrites--; } /* otherwise, if we just overwrote a source, update mapping */ - else if (j > i && match_srcs[i].similarity > 0) { - match_tgts[match_srcs[i].idx].idx = (uint32_t)j; + else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) { + /* what used to be at src t is now at src s */ + tgt2src[src2tgt[t].idx].idx = (uint32_t)s; } num_updates++; } } - else if (delta_is_new_only(to)) { - if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES) || - best_match->similarity < opts.copy_threshold) + else if (delta_is_new_only(tgt)) { + if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) continue; - to->status = GIT_DELTA_COPIED; - to->similarity = best_match->similarity; - memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + if (tgt2src_copy[t].similarity < opts.copy_threshold) + continue; + + /* always use best possible source for copy */ + best_match = &tgt2src_copy[t]; + src = GIT_VECTOR_GET(&diff->deltas, best_match->idx); + + tgt->status = GIT_DELTA_COPIED; + tgt->similarity = best_match->similarity; + memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file)); num_updates++; } @@ -932,12 +1039,13 @@ find_best_matches: FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES)); cleanup: - git__free(match_srcs); - git__free(match_tgts); + git__free(tgt2src); + git__free(src2tgt); + git__free(tgt2src_copy); - for (i = 0; i < sigcache_size; ++i) { - if (sigcache[i] != NULL) - opts.metric->free_signature(sigcache[i], opts.metric->payload); + for (t = 0; t < num_deltas * 2; ++t) { + if (sigcache[t] != NULL) + opts.metric->free_signature(sigcache[t], opts.metric->payload); } git__free(sigcache); diff --git a/src/hashsig.c b/src/hashsig.c index ab8d8b3f0..109f966ba 100644 --- a/src/hashsig.c +++ b/src/hashsig.c @@ -13,12 +13,15 @@ typedef uint64_t hashsig_state; #define HASHSIG_SCALE 100 -#define HASHSIG_HASH_WINDOW 32 -#define HASHSIG_HASH_START 0 +#define HASHSIG_MAX_RUN 80 +#define HASHSIG_HASH_START 0x012345678ABCDEF0LL #define HASHSIG_HASH_SHIFT 5 -#define HASHSIG_HASH_MASK 0x7FFFFFFF + +#define HASHSIG_HASH_MIX(S,CH) \ + (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH) #define HASHSIG_HEAP_SIZE ((1 << 7) - 1) +#define HASHSIG_HEAP_MIN_SIZE 4 typedef int (*hashsig_cmp)(const void *a, const void *b, void *); @@ -28,14 +31,6 @@ typedef struct { hashsig_t values[HASHSIG_HEAP_SIZE]; } hashsig_heap; -typedef struct { - hashsig_state state, shift_n; - char window[HASHSIG_HASH_WINDOW]; - int win_len, win_pos, saw_lf; -} hashsig_in_progress; - -#define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 } - struct git_hashsig { hashsig_heap mins; hashsig_heap maxs; @@ -43,8 +38,8 @@ struct git_hashsig { int considered; }; -#define HEAP_LCHILD_OF(I) (((I)*2)+1) -#define HEAP_RCHILD_OF(I) (((I)*2)+2) +#define HEAP_LCHILD_OF(I) (((I)<<1)+1) +#define HEAP_RCHILD_OF(I) (((I)<<1)+2) #define HEAP_PARENT_OF(I) (((I)-1)>>1) static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) @@ -115,134 +110,109 @@ static void hashsig_heap_sort(hashsig_heap *h) static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) { - /* if heap is full, pop top if new element should replace it */ - if (h->size == h->asize && h->cmp(&val, &h->values[0], NULL) > 0) { - h->size--; - h->values[0] = h->values[h->size]; - hashsig_heap_down(h, 0); - } - /* if heap is not full, insert new element */ if (h->size < h->asize) { h->values[h->size++] = val; hashsig_heap_up(h, h->size - 1); } -} - -GIT_INLINE(bool) hashsig_include_char( - char ch, git_hashsig_option_t opt, int *saw_lf) -{ - if ((opt & GIT_HASHSIG_IGNORE_WHITESPACE) && git__isspace(ch)) - return false; - - if (opt & GIT_HASHSIG_SMART_WHITESPACE) { - if (ch == '\r' || (*saw_lf && git__isspace(ch))) - return false; - *saw_lf = (ch == '\n'); + /* if heap is full, pop top if new element should replace it */ + else if (h->cmp(&val, &h->values[0], NULL) > 0) { + h->size--; + h->values[0] = h->values[h->size]; + hashsig_heap_down(h, 0); } - return true; } -static void hashsig_initial_window( - git_hashsig *sig, - const char **data, - size_t size, - hashsig_in_progress *prog) -{ - hashsig_state state, shift_n; - int win_len; - const char *scan, *end; - - /* init until we have processed at least HASHSIG_HASH_WINDOW data */ - - if (prog->win_len >= HASHSIG_HASH_WINDOW) - return; - - state = prog->state; - win_len = prog->win_len; - shift_n = prog->shift_n; - - scan = *data; - end = scan + size; - - while (scan < end && win_len < HASHSIG_HASH_WINDOW) { - char ch = *scan++; - - if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf)) - continue; - - state = (state * HASHSIG_HASH_SHIFT + ch) & HASHSIG_HASH_MASK; - - if (!win_len) - shift_n = 1; - else - shift_n = (shift_n * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK; - - prog->window[win_len++] = ch; - } - - /* insert initial hash if we just finished */ +typedef struct { + int use_ignores; + uint8_t ignore_ch[256]; +} hashsig_in_progress; - if (win_len == HASHSIG_HASH_WINDOW) { - hashsig_heap_insert(&sig->mins, (hashsig_t)state); - hashsig_heap_insert(&sig->maxs, (hashsig_t)state); - sig->considered = 1; +static void hashsig_in_progress_init( + hashsig_in_progress *prog, git_hashsig *sig) +{ + int i; + + switch (sig->opt) { + case GIT_HASHSIG_IGNORE_WHITESPACE: + for (i = 0; i < 256; ++i) + prog->ignore_ch[i] = git__isspace_nonlf(i); + prog->use_ignores = 1; + break; + case GIT_HASHSIG_SMART_WHITESPACE: + for (i = 0; i < 256; ++i) + prog->ignore_ch[i] = git__isspace(i); + prog->use_ignores = 1; + break; + default: + memset(prog, 0, sizeof(*prog)); + break; } - - prog->state = state; - prog->win_len = win_len; - prog->shift_n = shift_n; - - *data = scan; } +#define HASHSIG_IN_PROGRESS_INIT { 1 } + static int hashsig_add_hashes( git_hashsig *sig, - const char *data, + const uint8_t *data, size_t size, hashsig_in_progress *prog) { - const char *scan = data, *end = data + size; - hashsig_state state, shift_n, rmv; - - if (prog->win_len < HASHSIG_HASH_WINDOW) - hashsig_initial_window(sig, &scan, size, prog); - - state = prog->state; - shift_n = prog->shift_n; - - /* advance window, adding new chars and removing old */ - - for (; scan < end; ++scan) { - char ch = *scan; - - if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf)) - continue; - - rmv = shift_n * prog->window[prog->win_pos]; + const uint8_t *scan = data, *end = data + size; + hashsig_state state = HASHSIG_HASH_START; + int use_ignores = prog->use_ignores, len; + uint8_t ch; + + while (scan < end) { + state = HASHSIG_HASH_START; + + for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) { + ch = *scan; + + if (use_ignores) + for (; scan < end && git__isspace_nonlf(ch); ch = *scan) + ++scan; + else if (sig->opt != GIT_HASHSIG_NORMAL) + for (; scan < end && ch == '\r'; ch = *scan) + ++scan; + + /* peek at next character to decide what to do next */ + if (sig->opt == GIT_HASHSIG_SMART_WHITESPACE) + use_ignores = (ch == '\n'); + + if (scan >= end) + break; + ++scan; + + /* check run terminator */ + if (ch == '\n' || ch == '\0') + break; + + ++len; + HASHSIG_HASH_MIX(state, ch); + } - state = (state - rmv) & HASHSIG_HASH_MASK; - state = (state * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK; - state = (state + ch) & HASHSIG_HASH_MASK; + if (len > 0) { + hashsig_heap_insert(&sig->mins, (hashsig_t)state); + hashsig_heap_insert(&sig->maxs, (hashsig_t)state); - hashsig_heap_insert(&sig->mins, (hashsig_t)state); - hashsig_heap_insert(&sig->maxs, (hashsig_t)state); - sig->considered++; + sig->considered++; - prog->window[prog->win_pos] = ch; - prog->win_pos = (prog->win_pos + 1) % HASHSIG_HASH_WINDOW; + while (scan < end && (*scan == '\n' || !*scan)) + ++scan; + } } - prog->state = state; + prog->use_ignores = use_ignores; return 0; } static int hashsig_finalize_hashes(git_hashsig *sig) { - if (sig->mins.size < HASHSIG_HEAP_SIZE) { + if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE) { giterr_set(GITERR_INVALID, "File too small for similarity signature calculation"); return GIT_EBUFS; @@ -274,11 +244,13 @@ int git_hashsig_create( git_hashsig_option_t opts) { int error; - hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT; + hashsig_in_progress prog; git_hashsig *sig = hashsig_alloc(opts); GITERR_CHECK_ALLOC(sig); - error = hashsig_add_hashes(sig, buf, buflen, &prog); + hashsig_in_progress_init(&prog, sig); + + error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog); if (!error) error = hashsig_finalize_hashes(sig); @@ -296,10 +268,10 @@ int git_hashsig_create_fromfile( const char *path, git_hashsig_option_t opts) { - char buf[4096]; + uint8_t buf[0x1000]; ssize_t buflen = 0; int error = 0, fd; - hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT; + hashsig_in_progress prog; git_hashsig *sig = hashsig_alloc(opts); GITERR_CHECK_ALLOC(sig); @@ -308,6 +280,8 @@ int git_hashsig_create_fromfile( return fd; } + hashsig_in_progress_init(&prog, sig); + while (!error) { if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { if ((error = (int)buflen) < 0) @@ -362,6 +336,12 @@ static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) { - return (hashsig_heap_compare(&a->mins, &b->mins) + - hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; + /* if we have fewer than the maximum number of elements, then just use + * one array since the two arrays will be the same + */ + if (a->mins.size < HASHSIG_HEAP_SIZE) + return hashsig_heap_compare(&a->mins, &b->mins); + else + return (hashsig_heap_compare(&a->mins, &b->mins) + + hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; } diff --git a/src/index.c b/src/index.c index 0610eb5b9..cbdd43bdc 100644 --- a/src/index.c +++ b/src/index.c @@ -16,6 +16,7 @@ #include "iterator.h" #include "pathspec.h" #include "ignore.h" +#include "blob.h" #include "git2/odb.h" #include "git2/oid.h" @@ -604,42 +605,23 @@ int git_index_entry__cmp_icase(const void *a, const void *b) return strcasecmp(entry_a->path, entry_b->path); } -static int index_entry_init(git_index_entry **entry_out, git_index *index, const char *rel_path) +static int index_entry_init( + git_index_entry **entry_out, git_index *index, const char *rel_path) { + int error = 0; git_index_entry *entry = NULL; struct stat st; git_oid oid; - const char *workdir; - git_buf full_path = GIT_BUF_INIT; - int error; if (INDEX_OWNER(index) == NULL) return create_index_error(-1, "Could not initialize index entry. " "Index is not backed up by an existing repository."); - workdir = git_repository_workdir(INDEX_OWNER(index)); - - if (!workdir) - return create_index_error(GIT_EBAREREPO, - "Could not initialize index entry. Repository is bare"); - - if ((error = git_buf_joinpath(&full_path, workdir, rel_path)) < 0) - return error; - - if ((error = git_path_lstat(full_path.ptr, &st)) < 0) { - git_buf_free(&full_path); - return error; - } - - git_buf_free(&full_path); /* done with full path */ - - /* There is no need to validate the rel_path here, since it will be - * immediately validated by the call to git_blob_create_fromfile. - */ - - /* write the blob to disk and get the oid */ - if ((error = git_blob_create_fromworkdir(&oid, INDEX_OWNER(index), rel_path)) < 0) + /* write the blob to disk and get the oid and stat info */ + error = git_blob__create_from_paths( + &oid, &st, INDEX_OWNER(index), NULL, rel_path, 0, true); + if (error < 0) return error; entry = git__calloc(1, sizeof(git_index_entry)); diff --git a/src/util.h b/src/util.h index a97c9bf39..ed9624770 100644 --- a/src/util.h +++ b/src/util.h @@ -294,6 +294,11 @@ GIT_INLINE(bool) git__isspace(int c) return (c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == '\v' || c == 0x85 /* Unicode CR+LF */); } +GIT_INLINE(bool) git__isspace_nonlf(int c) +{ + return (c == ' ' || c == '\t' || c == '\f' || c == '\r' || c == '\v' || c == 0x85 /* Unicode CR+LF */); +} + GIT_INLINE(bool) git__iswildcard(int c) { return (c == '*' || c == '?' || c == '['); |