From a6db3fbb6e4ec11695e2a3af8bc2cb9119cb1941 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Fri, 14 Apr 2017 19:12:28 +0000 Subject: read-cache: add strcmp_offset function Add strcmp_offset() function to also return the offset of the first change. Add unit test and helper to verify. Signed-off-by: Jeff Hostetler Signed-off-by: Junio C Hamano --- read-cache.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'read-cache.c') diff --git a/read-cache.c b/read-cache.c index e447751823..11823f5dbc 100644 --- a/read-cache.c +++ b/read-cache.c @@ -887,6 +887,26 @@ static int has_file_name(struct index_state *istate, return retval; } + +/* + * Like strcmp(), but also return the offset of the first change. + * If strings are equal, return the length. + */ +int strcmp_offset(const char *s1, const char *s2, size_t *first_change) +{ + size_t k; + + if (!first_change) + return strcmp(s1, s2); + + for (k = 0; s1[k] == s2[k]; k++) + if (s1[k] == '\0') + break; + + *first_change = k; + return (unsigned char)s1[k] - (unsigned char)s2[k]; +} + /* * Do we have another file with a pathname that is a proper * subset of the name we're trying to add? -- cgit v1.2.1 From e5494631ed94017da862d55eb6393a0d01d8b91d Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 19 Apr 2017 17:06:16 +0000 Subject: read-cache: speed up add_index_entry during checkout Teach add_index_entry_with_check() to see if the path of the new item is greater than the last path in the index array before attempting to search for it. During checkout, merge_working_tree() populates the new index in sorted order, so this change will save a binary lookups per file. This preserves the original behavior but simply checks the last element before starting the search. This helps performance on very large repositories. Signed-off-by: Jeff Hostetler Signed-off-by: Junio C Hamano --- read-cache.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'read-cache.c') diff --git a/read-cache.c b/read-cache.c index 11823f5dbc..836338b3ba 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1021,7 +1021,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e if (!(option & ADD_CACHE_KEEP_CACHE_TREE)) cache_tree_invalidate_path(istate, ce->name); - pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); + + /* + * If this entry's path sorts after the last entry in the index, + * we can avoid searching for it. + */ + if (istate->cache_nr > 0 && + strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) + pos = -istate->cache_nr - 1; + else + pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); /* existing match? Just replace it. */ if (pos >= 0) { -- cgit v1.2.1 From 06b6d81b7929ce6f5542ba869e7bb9d8e437e6f8 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 19 Apr 2017 17:06:17 +0000 Subject: read-cache: speed up has_dir_name (part 1) Teach has_dir_name() to see if the path of the new item is greater than the last path in the index array before attempting to search for it. has_dir_name() is looking for file/directory collisions in the index and has to consider each sub-directory prefix in turn. This can cause multiple binary searches for each path. During operations like checkout, merge_working_tree() populates the new index in sorted order, so we expect to be able to append in many cases. This commit is part 1 of 2. This commit handles the top of has_dir_name() and the trivial optimization. Signed-off-by: Jeff Hostetler Signed-off-by: Junio C Hamano --- read-cache.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'read-cache.c') diff --git a/read-cache.c b/read-cache.c index 836338b3ba..5aa097afb7 100644 --- a/read-cache.c +++ b/read-cache.c @@ -910,6 +910,9 @@ int strcmp_offset(const char *s1, const char *s2, size_t *first_change) /* * Do we have another file with a pathname that is a proper * subset of the name we're trying to add? + * + * That is, is there another file in the index with a path + * that matches a sub-directory in the given entry? */ static int has_dir_name(struct index_state *istate, const struct cache_entry *ce, int pos, int ok_to_replace) @@ -918,6 +921,48 @@ static int has_dir_name(struct index_state *istate, int stage = ce_stage(ce); const char *name = ce->name; const char *slash = name + ce_namelen(ce); + size_t len_eq_last; + int cmp_last = 0; + + /* + * We are frequently called during an iteration on a sorted + * list of pathnames and while building a new index. Therefore, + * there is a high probability that this entry will eventually + * be appended to the index, rather than inserted in the middle. + * If we can confirm that, we can avoid binary searches on the + * components of the pathname. + * + * Compare the entry's full path with the last path in the index. + */ + if (istate->cache_nr > 0) { + cmp_last = strcmp_offset(name, + istate->cache[istate->cache_nr - 1]->name, + &len_eq_last); + if (cmp_last > 0) { + if (len_eq_last == 0) { + /* + * The entry sorts AFTER the last one in the + * index and their paths have no common prefix, + * so there cannot be a F/D conflict. + */ + return retval; + } else { + /* + * The entry sorts AFTER the last one in the + * index, but has a common prefix. Fall through + * to the loop below to disect the entry's path + * and see where the difference is. + */ + } + } else if (cmp_last == 0) { + /* + * The entry exactly matches the last one in the + * index, but because of multiple stage and CE_REMOVE + * items, we fall through and let the regular search + * code handle it. + */ + } + } for (;;) { int len; -- cgit v1.2.1 From b986df5c35a930858c6bba5f66c18ce6760d84e9 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 19 Apr 2017 17:06:18 +0000 Subject: read-cache: speed up has_dir_name (part 2) Teach has_dir_name() to see if the path of the new item is greater than the last path in the index array before attempting to search for it. has_dir_name() is looking for file/directory collisions in the index and has to consider each sub-directory prefix in turn. This can cause multiple binary searches for each path. During operations like checkout, merge_working_tree() populates the new index in sorted order, so we expect to be able to append in many cases. This commit is part 2 of 2. This commit handles the additional possible short-cuts as we look at each sub-directory prefix. The net-net gains for add_index_entry_with_check() and both had_dir_name() commits are best seen for very large repos. Here are results for an INFLATED version of linux.git with 1M files. $ GIT_PERF_REPO=/mnt/test/linux_inflated.git/ ./run upstream/base HEAD ./p0006-read-tree-checkout.sh Test upstream/base HEAD 0006.2: read-tree br_base br_ballast (1043893) 3.79(3.63+0.15) 2.68(2.52+0.15) -29.3% 0006.3: switch between br_base br_ballast (1043893) 7.55(6.58+0.44) 6.03(4.60+0.43) -20.1% 0006.4: switch between br_ballast br_ballast_plus_1 (1043893) 10.84(9.26+0.59) 8.44(7.06+0.65) -22.1% 0006.5: switch between aliases (1043893) 10.93(9.39+0.58) 10.24(7.04+0.63) -6.3% Here are results for a synthetic repo with 4.2M files. $ GIT_PERF_REPO=~/work/gfw/t/perf/repos/gen-many-files-10.4.3.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh Test HEAD~3 HEAD 0006.2: read-tree br_base br_ballast (4194305) 29.96(19.26+10.50) 23.76(13.42+10.12) -20.7% 0006.3: switch between br_base br_ballast (4194305) 56.95(36.08+16.83) 45.54(25.94+15.68) -20.0% 0006.4: switch between br_ballast br_ballast_plus_1 (4194305) 90.94(51.50+31.52) 78.22(39.39+30.70) -14.0% 0006.5: switch between aliases (4194305) 93.72(51.63+34.09) 77.94(39.00+30.88) -16.8% Results for medium repos (like linux.git) are mixed and have more variance (probably do to disk IO unrelated to this test. $ GIT_PERF_REPO=/mnt/test/linux.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh Test HEAD~3 HEAD 0006.2: read-tree br_base br_ballast (57994) 0.25(0.21+0.03) 0.20(0.17+0.02) -20.0% 0006.3: switch between br_base br_ballast (57994) 10.67(6.06+2.92) 10.51(5.94+2.91) -1.5% 0006.4: switch between br_ballast br_ballast_plus_1 (57994) 0.59(0.47+0.16) 0.52(0.40+0.13) -11.9% 0006.5: switch between aliases (57994) 0.59(0.44+0.17) 0.51(0.38+0.14) -13.6% $ GIT_PERF_REPO=/mnt/test/linux.git/ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh Test HEAD~3 HEAD 0006.2: read-tree br_base br_ballast (57994) 0.24(0.21+0.02) 0.21(0.18+0.02) -12.5% 0006.3: switch between br_base br_ballast (57994) 10.42(5.98+2.91) 10.66(5.86+3.09) +2.3% 0006.4: switch between br_ballast br_ballast_plus_1 (57994) 0.59(0.49+0.13) 0.53(0.37+0.16) -10.2% 0006.5: switch between aliases (57994) 0.59(0.43+0.17) 0.50(0.37+0.14) -15.3% Results for smaller repos (like git.git) are not significant. $ ./run HEAD~3 HEAD ./p0006-read-tree-checkout.sh Test HEAD~3 HEAD 0006.2: read-tree br_base br_ballast (3043) 0.01(0.00+0.00) 0.01(0.00+0.00) +0.0% 0006.3: switch between br_base br_ballast (3043) 0.31(0.17+0.11) 0.29(0.19+0.08) -6.5% 0006.4: switch between br_ballast br_ballast_plus_1 (3043) 0.03(0.02+0.00) 0.03(0.02+0.00) +0.0% 0006.5: switch between aliases (3043) 0.03(0.02+0.00) 0.03(0.02+0.00) +0.0% Signed-off-by: Jeff Hostetler Signed-off-by: Junio C Hamano --- read-cache.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'read-cache.c') diff --git a/read-cache.c b/read-cache.c index 5aa097afb7..5d47ba8bdd 100644 --- a/read-cache.c +++ b/read-cache.c @@ -965,7 +965,7 @@ static int has_dir_name(struct index_state *istate, } for (;;) { - int len; + size_t len; for (;;) { if (*--slash == '/') @@ -975,6 +975,67 @@ static int has_dir_name(struct index_state *istate, } len = slash - name; + if (cmp_last > 0) { + /* + * (len + 1) is a directory boundary (including + * the trailing slash). And since the loop is + * decrementing "slash", the first iteration is + * the longest directory prefix; subsequent + * iterations consider parent directories. + */ + + if (len + 1 <= len_eq_last) { + /* + * The directory prefix (including the trailing + * slash) also appears as a prefix in the last + * entry, so the remainder cannot collide (because + * strcmp said the whole path was greater). + * + * EQ: last: xxx/A + * this: xxx/B + * + * LT: last: xxx/file_A + * this: xxx/file_B + */ + return retval; + } + + if (len > len_eq_last) { + /* + * This part of the directory prefix (excluding + * the trailing slash) is longer than the known + * equal portions, so this sub-directory cannot + * collide with a file. + * + * GT: last: xxxA + * this: xxxB/file + */ + return retval; + } + + if (istate->cache_nr > 0 && + ce_namelen(istate->cache[istate->cache_nr - 1]) > len) { + /* + * The directory prefix lines up with part of + * a longer file or directory name, but sorts + * after it, so this sub-directory cannot + * collide with a file. + * + * last: xxx/yy-file (because '-' sorts before '/') + * this: xxx/yy/abc + */ + return retval; + } + + /* + * This is a possible collision. Fall through and + * let the regular search code handle it. + * + * last: xxx + * this: xxx/file + */ + } + pos = index_name_stage_pos(istate, name, len, stage); if (pos >= 0) { /* -- cgit v1.2.1