Remove bypassed checksums in --inplace to improve speed.

When we check a checksum that refers to a part of an --inplace file that has already been overwritten without SUMFLG_SAME_OFFSET being set, we remove that checksum from the list. This speeds up the handling of files that have a lot of identical checksum blocks (e.g. sequences of zeros) that we can't use because they were not marked as being at the same offset. Patch provided by Michael Chapman.
author: Wayne Davison <wayned@samba.org> 2013-08-03 09:44:13 -0700
committer: Wayne Davison <wayned@samba.org> 2013-08-03 09:59:38 -0700
commit: de94193353864221280be9fbb6193d92eb133000 (patch)
tree: 76bee964a0f7424bc8eb2b259e5168850d63a674
parent: 05fce6582a9192c58b2107153ec00056fd120d14 (diff)
download: rsync-de94193353864221280be9fbb6193d92eb133000.tar.gz
2 files changed, 20 insertions, 9 deletions
diff --git a/NEWS b/NEWS
index 040ac2d7..eec631d3 100644
--- a/NEWS
+++ b/NEWS
@@ -154,6 +154,9 @@ Changes since 3.0.9:
       file for one way to package the resulting files.  (Suggestions for
       how to make this even easier to install & use are welcomed.)
 
+    - Improved the speed of some --inplace updates when there are lots of
+      identical checksum blocks that end up being unusable.
+
     - Added the --outbuf=N|L|B option for chosing the output buffering.
 
     - Repating the --fuzzy option now causes the code to look for fuzzy matches
diff --git a/match.c b/match.c
index bafab9f3..a8bd1f30 100644
--- a/match.c
+++ b/match.c
@@ -178,7 +178,8 @@ static void hash_search(int f,struct sum_struct *s,
 
 	do {
 		int done_csum2 = 0;
-		int32 i;
+		uint32 hash_entry;
+		int32 i, *prev;
 
 		if (DEBUG_GTE(DELTASUM, 4)) {
 			rprintf(FINFO, "offset=%s sum=%04x%04x\n",
@@ -186,19 +187,32 @@ static void hash_search(int f,struct sum_struct *s,
 		}
 
 		if (tablesize == TRADITIONAL_TABLESIZE) {
-			if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0)
+			hash_entry = SUM2HASH2(s1,s2);
+			if ((i = hash_table[hash_entry]) < 0)
 				goto null_hash;
 			sum = (s1 & 0xffff) | (s2 << 16);
 		} else {
 			sum = (s1 & 0xffff) | (s2 << 16);
-			if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0)
+			hash_entry = BIG_SUM2HASH(sum);
+			if ((i = hash_table[hash_entry]) < 0)
 				goto null_hash;
 		}
+		prev = &hash_table[hash_entry];
 
 		hash_hits++;
 		do {
 			int32 l;
 
+			/* When updating in-place, the chunk's offset must be
+			 * either >= our offset or identical data at that offset.
+			 * Remove any bypassed entries that we can never use. */
+			if (updating_basis_file && s->sums[i].offset < offset
+			    && !(s->sums[i].flags & SUMFLG_SAME_OFFSET)) {
+				*prev = s->sums[i].chain;
+				continue;
+			}
+			prev = &s->sums[i].chain;
+
 			if (sum != s->sums[i].sum1)
 				continue;
 
@@ -207,12 +221,6 @@ static void hash_search(int f,struct sum_struct *s,
 			if (l != s->sums[i].len)
 				continue;
 
-			/* in-place: ensure chunk's offset is either >= our
-			 * offset or that the data didn't move. */
-			if (updating_basis_file && s->sums[i].offset < offset
-			    && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
-				continue;
-
 			if (DEBUG_GTE(DELTASUM, 3)) {
 				rprintf(FINFO,
 					"potential match at %s i=%ld sum=%08x\n",
author	Wayne Davison <wayned@samba.org>	2013-08-03 09:44:13 -0700
committer	Wayne Davison <wayned@samba.org>	2013-08-03 09:59:38 -0700
commit	de94193353864221280be9fbb6193d92eb133000 (patch)
tree	76bee964a0f7424bc8eb2b259e5168850d63a674
parent	05fce6582a9192c58b2107153ec00056fd120d14 (diff)
download	rsync-de94193353864221280be9fbb6193d92eb133000.tar.gz