improve diff-delta with sparse and/or repetitive data

It is useless to preserve multiple hash entries for consecutive blocks with the same hash. Keeping only the first one will allow for matching the longest string of identical bytes while subsequent blocks will only allow for shorter matches. The backward matching code will match the end of it as necessary. This improves both performances (no repeated string compare with long successions of identical bytes, or even small group of bytes), as well as compression (less likely to need random hash bucket entry culling), especially with sparse files. With well behaved data sets this patch doesn't change much. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Junio C Hamano <junkio@cox.net>
author: Nicolas Pitre <nico@cam.org> 2006-05-02 23:31:00 -0400
committer: Junio C Hamano <junkio@cox.net> 2006-05-02 21:32:39 -0700
commit: 06a9f9203570d21f9ef5fe219cdde527dcdf0990 (patch)
tree: 7a06db8f14535b45ba0c34c2d252a2b833314554 /diff-delta.c
parent: 2d08e5dd730680f7f8645a6326ec653435e032df (diff)
download: git-06a9f9203570d21f9ef5fe219cdde527dcdf0990.tar.gz
1 files changed, 27 insertions, 13 deletions
diff --git a/diff-delta.c b/diff-delta.c
index 45df786b0a..c618875188 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -136,11 +136,12 @@ struct delta_index {
 
 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 {
-	unsigned int i, hsize, hmask, entries, *hash_count;
+	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
 	struct index_entry *entry, **hash;
 	void *mem;
+	unsigned long memsize;
 
 	if (!buf || !bufsize)
 		return NULL;
@@ -155,9 +156,10 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
-	mem = malloc(sizeof(*index) +
-		     sizeof(*hash) * hsize +
-		     sizeof(*entry) * entries);
+	memsize = sizeof(*index) +
+		  sizeof(*hash) * hsize +
+		  sizeof(*entry) * entries;
+	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
 	index = mem;
@@ -179,18 +181,26 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	}
 
 	/* then populate the index */
-	data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
-	while (data >= buffer) {
+	prev_val = ~0;
+	for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+	     data >= buffer;
+	     data -= RABIN_WINDOW) {
 		unsigned int val = 0;
 		for (i = 1; i <= RABIN_WINDOW; i++)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
-		i = val & hmask;
-		entry->ptr = data + RABIN_WINDOW;
-		entry->val = val;
-		entry->next = hash[i];
-		hash[i] = entry++;
-		hash_count[i]++;
-		data -= RABIN_WINDOW;
+		if (val == prev_val) {
+			/* keep the lowest of consecutive identical blocks */
+			entry[-1].ptr = data + RABIN_WINDOW;
+		} else {
+			prev_val = val;
+			i = val & hmask;
+			entry->ptr = data + RABIN_WINDOW;
+			entry->val = val;
+			entry->next = hash[i];
+			hash[i] = entry++;
+			hash_count[i]++;
+			entries--;
+		}
 	}
 
 	/*
@@ -220,6 +230,10 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 	}
 	free(hash_count);
 
+	/* If we didn't use all hash entries, free the unused memory. */
+	if (entries)
+		index = realloc(index, memsize - entries * sizeof(*entry));
+
 	return index;
 }
author	Nicolas Pitre <nico@cam.org>	2006-05-02 23:31:00 -0400
committer	Junio C Hamano <junkio@cox.net>	2006-05-02 21:32:39 -0700
commit	06a9f9203570d21f9ef5fe219cdde527dcdf0990 (patch)
tree	7a06db8f14535b45ba0c34c2d252a2b833314554 /diff-delta.c
parent	2d08e5dd730680f7f8645a6326ec653435e032df (diff)
download	git-06a9f9203570d21f9ef5fe219cdde527dcdf0990.tar.gz