From 840383b2c2bd7179604f5c2595bf95e22a4e0c84 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:09:26 -0400 Subject: textconv: refactor calls to run_textconv This patch adds a fill_textconv wrapper, which centralizes some minor logic like error checking and handling the case of no-textconv. In addition to dropping the number of lines, this will make it easier in future patches to handle multiple types of textconv. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diff.c | 66 ++++++++++++++++++++++++++++++------------------------------------ 1 file changed, 30 insertions(+), 36 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index db2cd5d35f..9665d6d41f 100644 --- a/diff.c +++ b/diff.c @@ -43,7 +43,8 @@ static char diff_colors[][COLOR_MAXLEN] = { }; static void diff_filespec_load_driver(struct diff_filespec *one); -static char *run_textconv(const char *, struct diff_filespec *, size_t *); +static size_t fill_textconv(const char *cmd, + struct diff_filespec *df, char **outbuf); static int parse_diff_color_slot(const char *var, int ofs) { @@ -477,7 +478,7 @@ static void emit_rewrite_diff(const char *name_a, const char *reset = diff_get_color(color_diff, DIFF_RESET); static struct strbuf a_name = STRBUF_INIT, b_name = STRBUF_INIT; const char *a_prefix, *b_prefix; - const char *data_one, *data_two; + char *data_one, *data_two; size_t size_one, size_two; struct emit_callback ecbdata; @@ -499,26 +500,8 @@ static void emit_rewrite_diff(const char *name_a, quote_two_c_style(&a_name, a_prefix, name_a, 0); quote_two_c_style(&b_name, b_prefix, name_b, 0); - diff_populate_filespec(one, 0); - diff_populate_filespec(two, 0); - if (textconv_one) { - data_one = run_textconv(textconv_one, one, &size_one); - if (!data_one) - die("unable to read files to diff"); - } - else { - data_one = one->data; - size_one = one->size; - } - if (textconv_two) { - data_two = run_textconv(textconv_two, two, &size_two); - if (!data_two) - die("unable to read files to diff"); - } - else { - data_two = two->data; - size_two = two->size; - } + size_one = fill_textconv(textconv_one, one, &data_one); + size_two = fill_textconv(textconv_two, two, &data_two); memset(&ecbdata, 0, sizeof(ecbdata)); ecbdata.color_diff = color_diff; @@ -1717,20 +1700,8 @@ static void builtin_diff(const char *name_a, strbuf_reset(&header); } - if (textconv_one) { - size_t size; - mf1.ptr = run_textconv(textconv_one, one, &size); - if (!mf1.ptr) - die("unable to read files to diff"); - mf1.size = size; - } - if (textconv_two) { - size_t size; - mf2.ptr = run_textconv(textconv_two, two, &size); - if (!mf2.ptr) - die("unable to read files to diff"); - mf2.size = size; - } + mf1.size = fill_textconv(textconv_one, one, &mf1.ptr); + mf2.size = fill_textconv(textconv_two, two, &mf2.ptr); pe = diff_funcname_pattern(one); if (!pe) @@ -3916,3 +3887,26 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec, return strbuf_detach(&buf, outsize); } + +static size_t fill_textconv(const char *cmd, + struct diff_filespec *df, + char **outbuf) +{ + size_t size; + + if (!cmd) { + if (!DIFF_FILE_VALID(df)) { + *outbuf = ""; + return 0; + } + if (diff_populate_filespec(df, 0)) + die("unable to read files to diff"); + *outbuf = df->data; + return df->size; + } + + *outbuf = run_textconv(cmd, df, &size); + if (!*outbuf) + die("unable to read files to diff"); + return size; +} -- cgit v1.2.1 From d9bae1a178f0f8b198ea611e874975214ad6f990 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:12:15 -0400 Subject: diff: cache textconv output Running a textconv filter can take a long time. It's particularly bad for a large file which needs to be spooled to disk, but even for small files, the fork+exec overhead can add up for something like "git log -p". This patch uses the notes-cache mechanism to keep a fast cache of textconv output. Caches are stored in refs/notes/textconv/$x, where $x is the userdiff driver defined in gitattributes. Caching is enabled only if diff.$x.cachetextconv is true. In my test repo, on a commit with 45 jpg and avi files changed and a textconv to show their exif tags: [before] $ time git show >/dev/null real 0m13.724s user 0m12.057s sys 0m1.624s [after, first run] $ git config diff.mfo.cachetextconv true $ time git show >/dev/null real 0m14.252s user 0m12.197s sys 0m1.800s [after, subsequent runs] $ time git show >/dev/null real 0m0.352s user 0m0.148s sys 0m0.200s So for a slight (3.8%) cost on the first run, we achieve an almost 40x speed up on subsequent runs. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diff.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 9665d6d41f..72d8503d89 100644 --- a/diff.c +++ b/diff.c @@ -43,7 +43,7 @@ static char diff_colors[][COLOR_MAXLEN] = { }; static void diff_filespec_load_driver(struct diff_filespec *one); -static size_t fill_textconv(const char *cmd, +static size_t fill_textconv(struct userdiff_driver *driver, struct diff_filespec *df, char **outbuf); static int parse_diff_color_slot(const char *var, int ofs) @@ -466,8 +466,8 @@ static void emit_rewrite_diff(const char *name_a, const char *name_b, struct diff_filespec *one, struct diff_filespec *two, - const char *textconv_one, - const char *textconv_two, + struct userdiff_driver *textconv_one, + struct userdiff_driver *textconv_two, struct diff_options *o) { int lc_a, lc_b; @@ -1569,14 +1569,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const options->b_prefix = b; } -static const char *get_textconv(struct diff_filespec *one) +static struct userdiff_driver *get_textconv(struct diff_filespec *one) { if (!DIFF_FILE_VALID(one)) return NULL; if (!S_ISREG(one->mode)) return NULL; diff_filespec_load_driver(one); - return one->driver->textconv; + if (!one->driver->textconv) + return NULL; + + if (one->driver->textconv_want_cache && !one->driver->textconv_cache) { + struct notes_cache *c = xmalloc(sizeof(*c)); + struct strbuf name = STRBUF_INIT; + + strbuf_addf(&name, "textconv/%s", one->driver->name); + notes_cache_init(c, name.buf, one->driver->textconv); + one->driver->textconv_cache = c; + } + + return one->driver; } static void builtin_diff(const char *name_a, @@ -1593,7 +1605,8 @@ static void builtin_diff(const char *name_a, const char *set = diff_get_color_opt(o, DIFF_METAINFO); const char *reset = diff_get_color_opt(o, DIFF_RESET); const char *a_prefix, *b_prefix; - const char *textconv_one = NULL, *textconv_two = NULL; + struct userdiff_driver *textconv_one = NULL; + struct userdiff_driver *textconv_two = NULL; struct strbuf header = STRBUF_INIT; if (DIFF_OPT_TST(o, SUBMODULE_LOG) && @@ -3888,13 +3901,13 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec, return strbuf_detach(&buf, outsize); } -static size_t fill_textconv(const char *cmd, +static size_t fill_textconv(struct userdiff_driver *driver, struct diff_filespec *df, char **outbuf) { size_t size; - if (!cmd) { + if (!driver || !driver->textconv) { if (!DIFF_FILE_VALID(df)) { *outbuf = ""; return 0; @@ -3905,8 +3918,29 @@ static size_t fill_textconv(const char *cmd, return df->size; } - *outbuf = run_textconv(cmd, df, &size); + if (driver->textconv_cache) { + *outbuf = notes_cache_get(driver->textconv_cache, df->sha1, + &size); + if (*outbuf) + return size; + } + + *outbuf = run_textconv(driver->textconv, df, &size); if (!*outbuf) die("unable to read files to diff"); + + if (driver->textconv_cache) { + /* ignore errors, as we might be in a readonly repository */ + notes_cache_put(driver->textconv_cache, df->sha1, *outbuf, + size); + /* + * we could save up changes and flush them all at the end, + * but we would need an extra call after all diffing is done. + * Since generating a cache entry is the slow path anyway, + * this extra overhead probably isn't a big deal. + */ + notes_cache_write(driver->textconv_cache); + } + return size; } -- cgit v1.2.1 From b3373982667dc983b8dacf33861d25b20bafb995 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:14:24 -0400 Subject: diff: avoid useless filespec population builtin_diff calls fill_mmfile fairly early, which in turn calls diff_populate_filespec, which actually retrieves the file's blob contents into a buffer. Long ago, this was sensible as we would need to look at the blobs eventually. These days, however, we may not ever want those blobs if we end up using a textconv cache, and for large binary files (exactly the sort for which you might have a textconv cache), just retrieving the objects can be costly. This patch just pushes the fill_mmfile call a bit later, so we can avoid populating the filespec in some cases. There is one thing to note that looks like a bug but isn't. We push the fill_mmfile down into the first branch of a conditional. It seems like we would need it on the other branch, too, but we don't; fill_textconv does it for us (in fact, before this, we were just writing over the results of the fill_mmfile on that branch). Here's a timing sample on a commit with 45 changed jpgs and avis. The result is fully textconv cached, but we still wasted a lot of time just pulling the blobs from storage. The total size of the blobs (source and dest) is about 180M. [before] $ time git show >/dev/null real 0m0.352s user 0m0.148s sys 0m0.200s [after] $ time git show >/dev/null real 0m0.009s user 0m0.004s sys 0m0.004s And that's on a warm cache. On a cold cache, the "after" case is not much worse, but the "before" case has to do an extra 180M of I/O. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diff.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'diff.c') diff --git a/diff.c b/diff.c index 72d8503d89..4cb6d9a9e8 100644 --- a/diff.c +++ b/diff.c @@ -1680,12 +1680,11 @@ static void builtin_diff(const char *name_a, } } - if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) - die("unable to read files to diff"); - if (!DIFF_OPT_TST(o, TEXT) && - ( (diff_filespec_is_binary(one) && !textconv_one) || - (diff_filespec_is_binary(two) && !textconv_two) )) { + ( (!textconv_one && diff_filespec_is_binary(one)) || + (!textconv_two && diff_filespec_is_binary(two)) )) { + if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) + die("unable to read files to diff"); /* Quite common confusing case */ if (mf1.size == mf2.size && !memcmp(mf1.ptr, mf2.ptr, mf1.size)) -- cgit v1.2.1