summary refs log tree commit diff
path: root/ewah/ewah_io.c
diff options
context:
space:
mode:
author Vicent Marti <tanoku@gmail.com> 2013-11-14 07:43:51 -0500
committer Junio C Hamano <gitster@pobox.com> 2013-12-30 12:17:20 -0800
commit e1273106f62927e3efdb1cfa107cb1a9f913274c (patch)
tree 9a23af0dbec1791e1a0d8b3137e614744ee14f19 /ewah/ewah_io.c
parent 7e3dae494370b5596a6ea76af1191829ce11bce2 (diff)
download git-e1273106f62927e3efdb1cfa107cb1a9f913274c.tar.gz
ewah: compressed bitmap implementation
EWAH is a word-aligned compressed variant of a bitset (i.e. a data structure that acts as a 0-indexed boolean array for many entries). It uses a 64-bit run-length encoding (RLE) compression scheme, trading some compression for better processing speed. The goal of this word-aligned implementation is not to achieve the best compression, but rather to improve query processing time. As it stands right now, this EWAH implementation will always be more efficient storage-wise than its uncompressed alternative. EWAH arrays will be used as the on-disk format to store reachability bitmaps for all objects in a repository while keeping reasonable sizes, in the same way that JGit does. This EWAH implementation is a mostly straightforward port of the original `javaewah` library that JGit currently uses. The library is self-contained and has been embedded whole (4 files) inside the `ewah` folder to ease redistribution. The library is re-licensed under the GPLv2 with the permission of Daniel Lemire, the original author. The source code for the C version can be found on GitHub: https://github.com/vmg/libewok The original Java implementation can also be found on GitHub: https://github.com/lemire/javaewah [jc: stripped debug-only code per Peff's $gmane/239768] Signed-off-by: Vicent Marti <tanoku@gmail.com> Signed-off-by: Jeff King <peff@peff.net> Helped-by: Ramsay Jones <ramsay@ramsay1.demon.co.uk> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'ewah/ewah_io.c')
-rw-r--r-- ewah/ewah_io.c 193
1 files changed, 193 insertions, 0 deletions
diff --git a/ewah/ewah_io.c b/ewah/ewah_io.c
new file mode 100644
index 0000000000..aed0da6866
--- /dev/null
+++ b/ewah/ewah_io.c
@@ -0,0 +1,193 @@
+/**
+ * Copyright 2013, GitHub, Inc
+ * Copyright 2009-2013, Daniel Lemire, Cliff Moon,
+ * David McIntosh, Robert Becho, Google Inc. and Veronika Zenz
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "git-compat-util.h"
+#include "ewok.h"
+
+/*
+ * Write exactly `len` bytes from `buf` to `fd`, resuming after short writes.
+ * Returns 0 once everything has been written, -1 if write() fails or
+ * reports no progress.
+ */
+static int native_write_all(int fd, const void *buf, size_t len)
+{
+	const char *pos = buf;
+
+	while (len > 0) {
+		ssize_t written = write(fd, pos, len);
+
+		if (written <= 0)
+			return -1;
+		pos += written;
+		len -= (size_t)written;
+	}
+	return 0;
+}
+
+/*
+ * Dump `self` to the file descriptor `fd` without any byte swapping, i.e.
+ * in the byte order of the running host.
+ *
+ * Layout, in order:
+ *   - 32 bit: bit size of the map
+ *   - 32 bit: number of compressed words
+ *   - the compressed words themselves, copied straight from self->buffer
+ *   - 32 bit: index of the RLW word inside the buffer
+ *
+ * bit_size and buffer_size are truncated to 32 bits when written.
+ *
+ * Returns the number of bytes written, or -1 if any write fails.
+ */
+int ewah_serialize_native(struct ewah_bitmap *self, int fd)
+{
+	uint32_t write32;
+	size_t to_write = self->buffer_size * sizeof(eword_t);
+
+	/* 32 bit -- bit size for the map */
+	write32 = (uint32_t)self->bit_size;
+	if (native_write_all(fd, &write32, sizeof(write32)) < 0)
+		return -1;
+
+	/* 32 bit -- number of compressed 64-bit words */
+	write32 = (uint32_t)self->buffer_size;
+	if (native_write_all(fd, &write32, sizeof(write32)) < 0)
+		return -1;
+
+	/* 64 bit x N -- compressed words, host byte order */
+	if (native_write_all(fd, self->buffer, to_write) < 0)
+		return -1;
+
+	/* 32 bit -- position for the RLW, as a word index */
+	write32 = (uint32_t)(self->rlw - self->buffer);
+	if (native_write_all(fd, &write32, sizeof(write32)) < 0)
+		return -1;
+
+	return (3 * 4) + to_write;
+}
+
+/*
+ * Serialize `self` in network byte order through a caller-supplied sink.
+ *
+ * `write_fun(data, buf, len)` is called for each piece of output and must
+ * return `len` on success; any other return value aborts serialization.
+ *
+ * Layout, in order:
+ *   - 32 bit: bit size of the map
+ *   - 32 bit: number of compressed words
+ *   - the compressed words, each converted with htonll()
+ *   - 32 bit: index of the RLW word inside the buffer
+ *
+ * Returns the number of bytes emitted, or -1 if the sink reports a failure.
+ */
+int ewah_serialize_to(struct ewah_bitmap *self,
+		      int (*write_fun)(void *, const void *, size_t),
+		      void *data)
+{
+	size_t i;
+	eword_t dump[2048];
+	const size_t words_per_dump = sizeof(dump) / sizeof(eword_t);
+	uint32_t bitsize, word_count, rlw_pos;
+
+	const eword_t *buffer;
+	size_t words_left;
+
+	/* 32 bit -- bit size for the map */
+	bitsize = htonl((uint32_t)self->bit_size);
+	if (write_fun(data, &bitsize, 4) != 4)
+		return -1;
+
+	/* 32 bit -- number of compressed 64-bit words */
+	word_count = htonl((uint32_t)self->buffer_size);
+	if (write_fun(data, &word_count, 4) != 4)
+		return -1;
+
+	/*
+	 * 64 bit x N -- compressed words, byte-swapped through the bounce
+	 * buffer `dump` one chunk at a time; the last chunk may be partial.
+	 */
+	buffer = self->buffer;
+	words_left = self->buffer_size;
+
+	while (words_left > 0) {
+		size_t chunk = words_left < words_per_dump ?
+			       words_left : words_per_dump;
+		size_t chunk_bytes = chunk * sizeof(eword_t);
+
+		for (i = 0; i < chunk; ++i)
+			dump[i] = htonll(buffer[i]);
+
+		if (write_fun(data, dump, chunk_bytes) != chunk_bytes)
+			return -1;
+
+		buffer += chunk;
+		words_left -= chunk;
+	}
+
+	/*
+	 * 32 bit -- position for the RLW.
+	 *
+	 * Take the difference in words directly. Storing the byte offset
+	 * in a 32-bit variable first would truncate it (and so corrupt the
+	 * index) once the buffer is larger than 4 GiB.
+	 */
+	rlw_pos = htonl((uint32_t)(self->rlw - self->buffer));
+
+	if (write_fun(data, &rlw_pos, 4) != 4)
+		return -1;
+
+	return (3 * 4) + (self->buffer_size * 8);
+}
+
+/*
+ * Sink for ewah_serialize_to() that writes to a file descriptor.
+ *
+ * `fd` is not a real pointer: it carries the descriptor number, cast
+ * through intptr_t by the caller.
+ *
+ * write() may accept fewer bytes than requested, and the serializer treats
+ * any return other than `len` as fatal, so keep writing until the whole
+ * buffer is out. Returns `len` on success, -1 if write() fails or makes no
+ * progress.
+ */
+static int write_helper(void *fd, const void *buf, size_t len)
+{
+	const char *pos = buf;
+	size_t remaining = len;
+
+	while (remaining > 0) {
+		ssize_t written = write((intptr_t)fd, pos, remaining);
+
+		if (written <= 0)
+			return -1;
+		pos += written;
+		remaining -= (size_t)written;
+	}
+
+	return len;
+}
+
+/*
+ * Serialize `self` to the file descriptor `fd` by handing write_helper to
+ * ewah_serialize_to(). Returns whatever ewah_serialize_to() returns.
+ */
+int ewah_serialize(struct ewah_bitmap *self, int fd)
+{
+	/* The descriptor travels through the opaque callback argument. */
+	void *fd_cookie = (void *)(intptr_t)fd;
+
+	return ewah_serialize_to(self, write_helper, fd_cookie);
+}
+
+/* Decode a 32-bit big-endian value from a possibly unaligned address. */
+static uint32_t mmap_load_be32(const uint8_t *p)
+{
+	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
+	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
+}
+
+/*
+ * Load a bitmap from `len` bytes at `map`, in the network-byte-order layout:
+ * 32-bit bit size, 32-bit word count, the compressed words, and a 32-bit
+ * index of the RLW word.
+ *
+ * The input is validated before `self` is touched: the region must be large
+ * enough for the fixed fields plus the advertised number of words, and a
+ * non-zero RLW index must fall inside the buffer. All values are decoded
+ * byte by byte, so `map` does not need any particular alignment.
+ *
+ * At least one word is always allocated, so alloc_size is 1 (not 0) when the
+ * serialized word count is 0.
+ *
+ * Returns the number of bytes consumed from `map`, or -1 if the data is
+ * truncated or inconsistent, or if the allocation fails. On failure `self`
+ * is left unmodified.
+ */
+int ewah_read_mmap(struct ewah_bitmap *self, void *map, size_t len)
+{
+	const uint8_t *ptr = map;
+	const size_t fixed_len = 3 * sizeof(uint32_t);
+	uint32_t bit_size, word_count, rlw_pos;
+	size_t alloc_words;
+	eword_t *words;
+	size_t i, j;
+
+	if (len < fixed_len)
+		return -1;
+
+	/* 32 bit -- bit size for the map */
+	bit_size = mmap_load_be32(ptr);
+	ptr += sizeof(uint32_t);
+
+	/* 32 bit -- number of compressed 64-bit words */
+	word_count = mmap_load_be32(ptr);
+	ptr += sizeof(uint32_t);
+
+	/*
+	 * Dividing rather than multiplying keeps this check free of
+	 * overflow; once it passes, word_count * sizeof(eword_t) fits in
+	 * a size_t.
+	 */
+	if ((len - fixed_len) / sizeof(eword_t) < word_count)
+		return -1;
+
+	/* 32 bit -- position for the RLW, stored right after the words */
+	rlw_pos = mmap_load_be32(ptr + (size_t)word_count * sizeof(eword_t));
+	if (rlw_pos > 0 && rlw_pos >= word_count)
+		return -1;
+
+	/* Never request zero bytes; the result of that is not portable. */
+	alloc_words = word_count ? word_count : 1;
+	words = ewah_realloc(self->buffer, alloc_words * sizeof(eword_t));
+	if (!words)
+		return -1;
+
+	/* 64 bit x N -- compressed words */
+	for (i = 0; i < word_count; ++i) {
+		eword_t word = 0;
+
+		for (j = 0; j < sizeof(eword_t); ++j)
+			word = (word << 8) | *ptr++;
+		words[i] = word;
+	}
+
+	self->bit_size = bit_size;
+	self->buffer_size = word_count;
+	self->alloc_size = alloc_words;
+	self->buffer = words;
+	self->rlw = words + rlw_pos;
+
+	return (3 * 4) + (self->buffer_size * 8);
+}
+
+/*
+ * Read exactly `len` bytes from `fd` into `buf`, resuming after short
+ * reads. Returns 0 on success, -1 on a read error or premature end of file.
+ */
+static int deserialize_read_all(int fd, void *buf, size_t len)
+{
+	char *pos = buf;
+
+	while (len > 0) {
+		ssize_t got = read(fd, pos, len);
+
+		if (got <= 0)
+			return -1;
+		pos += got;
+		len -= (size_t)got;
+	}
+	return 0;
+}
+
+/*
+ * Load a bitmap from the file descriptor `fd`, in the network-byte-order
+ * layout: 32-bit bit size, 32-bit word count, the compressed words, and a
+ * 32-bit index of the RLW word.
+ *
+ * `self` is reset with ewah_clear() first. A non-zero RLW index that does
+ * not fall inside the buffer is rejected. At least one word is always
+ * allocated, so alloc_size is 1 (not 0) when the serialized word count is 0.
+ *
+ * Returns 0 on success, -1 on a read error, premature end of file,
+ * allocation failure or an out-of-range RLW index. On failure the contents
+ * of `self` are unspecified, but its pointers remain valid.
+ */
+int ewah_deserialize(struct ewah_bitmap *self, int fd)
+{
+	size_t i;
+	eword_t dump[2048];
+	const size_t words_per_dump = sizeof(dump) / sizeof(eword_t);
+	uint32_t bitsize, word_count, rlw_pos;
+
+	eword_t *buffer;
+	size_t words_left, alloc_words;
+
+	ewah_clear(self);
+
+	/* 32 bit -- bit size for the map */
+	if (deserialize_read_all(fd, &bitsize, 4) < 0)
+		return -1;
+
+	self->bit_size = (size_t)ntohl(bitsize);
+
+	/* 32 bit -- number of compressed 64-bit words */
+	if (deserialize_read_all(fd, &word_count, 4) < 0)
+		return -1;
+
+	word_count = ntohl(word_count);
+
+	/* Guard the allocation size against overflow on narrow size_t. */
+	if (word_count > (size_t)-1 / sizeof(eword_t))
+		return -1;
+
+	/* Never request zero bytes; the result of that is not portable. */
+	alloc_words = word_count ? word_count : 1;
+	buffer = ewah_realloc(self->buffer, alloc_words * sizeof(eword_t));
+
+	if (!buffer)
+		return -1;
+
+	/*
+	 * The allocation may have moved, and rlw may still point into the
+	 * previous one; re-anchor it so it is never stale if a read below
+	 * fails.
+	 */
+	self->buffer = buffer;
+	self->rlw = buffer;
+	self->alloc_size = alloc_words;
+	self->buffer_size = word_count;
+
+	/*
+	 * 64 bit x N -- compressed words, read through the bounce buffer
+	 * `dump` one chunk at a time; the last chunk may be partial.
+	 */
+	words_left = self->buffer_size;
+
+	while (words_left > 0) {
+		size_t chunk = words_left < words_per_dump ?
+			       words_left : words_per_dump;
+
+		if (deserialize_read_all(fd, dump, chunk * sizeof(eword_t)) < 0)
+			return -1;
+
+		for (i = 0; i < chunk; ++i, ++buffer)
+			*buffer = ntohll(dump[i]);
+
+		words_left -= chunk;
+	}
+
+	/* 32 bit -- position for the RLW */
+	if (deserialize_read_all(fd, &rlw_pos, 4) < 0)
+		return -1;
+
+	rlw_pos = ntohl(rlw_pos);
+	if (rlw_pos > 0 && rlw_pos >= word_count)
+		return -1;
+
+	self->rlw = self->buffer + rlw_pos;
+	return 0;
+}