summaryrefslogtreecommitdiff
path: root/src/midx.c
diff options
context:
space:
mode:
authorlhchavez <lhchavez@lhchavez.com>2020-02-23 22:28:52 +0000
committerlhchavez <lhchavez@lhchavez.com>2020-10-05 05:08:38 -0700
commit005e77157d5eef9d9c0765ff201e6ec07e7f5d00 (patch)
treea65b1b054b394fca1237d5f8c001d21547ffcee1 /src/midx.c
parent6d1f19269f6bdce126689535e86819f704f25d1a (diff)
downloadlibgit2-005e77157d5eef9d9c0765ff201e6ec07e7f5d00.tar.gz
multipack: Introduce a parser for multi-pack-index files
This change is the first in a series to add support for git's multi-pack-index. This should speed up large repositories significantly. Part of: #5399
Diffstat (limited to 'src/midx.c')
-rw-r--r--src/midx.c418
1 files changed, 418 insertions, 0 deletions
diff --git a/src/midx.c b/src/midx.c
new file mode 100644
index 000000000..21cfff497
--- /dev/null
+++ b/src/midx.c
@@ -0,0 +1,418 @@
+/*
+ * Copyright (C) the libgit2 contributors. All rights reserved.
+ *
+ * This file is part of libgit2, distributed under the GNU GPL v2 with
+ * a Linking Exception. For full terms see the included COPYING file.
+ */
+
+#include "midx.h"
+
+#include "buffer.h"
+#include "futils.h"
+#include "hash.h"
+#include "odb.h"
+#include "pack.h"
+
+#define GIT_MIDX_FILE_MODE 0444 /* octal permission bits; not referenced elsewhere in this file */
+
+#define MIDX_SIGNATURE 0x4d494458 /* "MIDX"; git_midx_parse() compares the header signature against htonl() of this */
+#define MIDX_VERSION 1 /* the only header version git_midx_parse() accepts */
+#define MIDX_OBJECT_ID_VERSION 1 /* the only object_id_version git_midx_parse() accepts */
+struct git_midx_header { /* fixed-size record at offset 0; git_midx_parse() casts the start of the file data to it */
+ uint32_t signature; /* must equal htonl(MIDX_SIGNATURE) */
+ uint8_t version; /* must equal MIDX_VERSION */
+ uint8_t object_id_version; /* must equal MIDX_OBJECT_ID_VERSION */
+ uint8_t chunks; /* number of 12-byte chunk table entries read right after the header; zero is rejected */
+ uint8_t base_midx_files; /* not read anywhere in this file */
+ uint32_t packfiles; /* passed through ntohl() and used as the number of packfile names to parse */
+};
+
+#define MIDX_PACKFILE_NAMES_ID 0x504e414d /* "PNAM"; consumed by midx_parse_packfile_names() */
+#define MIDX_OID_FANOUT_ID 0x4f494446 /* "OIDF"; consumed by midx_parse_oid_fanout() */
+#define MIDX_OID_LOOKUP_ID 0x4f49444c /* "OIDL"; consumed by midx_parse_oid_lookup() */
+#define MIDX_OBJECT_OFFSETS_ID 0x4f4f4646 /* "OOFF"; consumed by midx_parse_object_offsets() */
+#define MIDX_OBJECT_LARGE_OFFSETS_ID 0x4c4f4646 /* "LOFF"; the only chunk whose absence is not an error */
+
+struct git_midx_chunk { /* location of one chunk inside the file data, filled in by git_midx_parse() */
+ off64_t offset; /* byte offset from the start of the file data; the parsers treat 0 as "chunk not present" */
+ size_t length; /* distance to the next chunk's offset, or to the trailer for the last chunk */
+};
+
+/*
+ * Record an ODB-class error explaining why the multi-pack-index was rejected.
+ *
+ * Always returns -1, so callers can write `return midx_error("...");`.
+ */
+static int midx_error(const char *message)
+{
+	const int failure = -1;
+
+	git_error_set(GIT_ERROR_ODB, "invalid multi-pack-index file - %s", message);
+
+	return failure;
+}
+
+/*
+ * Parse the Packfile Names chunk into idx->packfile_names.
+ *
+ * The chunk is read as `packfiles` consecutive NUL-terminated strings. Every
+ * name must be non-empty, terminated inside the chunk, strictly greater than
+ * the previous name (strcmp order), longer than ".idx" and ending in it, and
+ * free of '/' and '\\'. The pointers inserted into the vector point into
+ * `data`.
+ *
+ * idx        index being populated
+ * data       start of the multi-pack-index file data
+ * packfiles  number of names the header announces
+ * chunk      offset/length of the chunk; offset 0 means it was not present
+ *
+ * Returns 0 on success or a negative value on error.
+ */
+static int midx_parse_packfile_names(
+	git_midx_file *idx,
+	const unsigned char *data,
+	uint32_t packfiles,
+	struct git_midx_chunk *chunk)
+{
+	int error;
+	uint32_t i;
+	char *packfile_name;
+	size_t chunk_size, len;
+
+	if (chunk->offset == 0)
+		return midx_error("missing Packfile Names chunk");
+	if (chunk->length == 0)
+		return midx_error("empty Packfile Names chunk");
+	if ((error = git_vector_init(&idx->packfile_names, packfiles, git__strcmp_cb)) < 0)
+		return error;
+
+	packfile_name = (char *)(data + chunk->offset);
+	chunk_size = chunk->length;
+
+	for (i = 0; i < packfiles; ++i) {
+		/* Bounded scan: never look past the bytes left in the chunk. */
+		len = p_strnlen(packfile_name, chunk_size);
+		if (len == 0)
+			return midx_error("empty packfile name");
+		if (len + 1 > chunk_size)
+			return midx_error("unterminated packfile name");
+
+		/*
+		 * Propagate insertion failures. If one were ignored, the vector
+		 * would end up shorter than `i` and the sortedness check below
+		 * would read an entry that does not exist.
+		 */
+		if ((error = git_vector_insert(&idx->packfile_names, packfile_name)) < 0)
+			return error;
+
+		if (i && strcmp(git_vector_get(&idx->packfile_names, i - 1), packfile_name) >= 0)
+			return midx_error("packfile names are not sorted");
+		/* The name is known to be NUL-terminated here, so len == strlen(name). */
+		if (len <= strlen(".idx") || git__suffixcmp(packfile_name, ".idx") != 0)
+			return midx_error("non-.idx packfile name");
+		if (strchr(packfile_name, '/') != NULL || strchr(packfile_name, '\\') != NULL)
+			return midx_error("non-local packfile");
+
+		/* Step over the name and its terminator. */
+		packfile_name += len + 1;
+		chunk_size -= len + 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the OID Fanout chunk and record it in idx->oid_fanout.
+ *
+ * The chunk must be exactly 256 entries of 4 bytes. The entries are read with
+ * ntohl() and must never decrease; the last entry becomes idx->num_objects.
+ *
+ * Returns 0 on success or -1 (via midx_error) on a malformed chunk.
+ */
+static int midx_parse_oid_fanout(
+	git_midx_file *idx,
+	const unsigned char *data,
+	struct git_midx_chunk *chunk_oid_fanout)
+{
+	uint32_t bucket, previous = 0;
+
+	if (chunk_oid_fanout->offset == 0)
+		return midx_error("missing OID Fanout chunk");
+	if (chunk_oid_fanout->length == 0)
+		return midx_error("empty OID Fanout chunk");
+	if (chunk_oid_fanout->length != 256 * 4)
+		return midx_error("OID Fanout chunk has wrong length");
+
+	idx->oid_fanout = (const uint32_t *)(data + chunk_oid_fanout->offset);
+
+	for (bucket = 0; bucket < 256; ++bucket) {
+		uint32_t cumulative = ntohl(idx->oid_fanout[bucket]);
+
+		if (cumulative < previous)
+			return midx_error("index is non-monotonic");
+		previous = cumulative;
+	}
+
+	/* The final fanout entry is the total object count. */
+	idx->num_objects = previous;
+	return 0;
+}
+
+/*
+ * Validate the OID Lookup chunk and record it in idx->oid_lookup.
+ *
+ * The chunk must hold exactly idx->num_objects entries of 20 bytes each, in
+ * strictly increasing order. The first entry is compared against an all-zero
+ * OID, so an all-zero first entry is rejected as well.
+ *
+ * Must run after midx_parse_oid_fanout(), which sets idx->num_objects.
+ *
+ * Returns 0 on success or -1 (via midx_error) on a malformed chunk.
+ */
+static int midx_parse_oid_lookup(
+	git_midx_file *idx,
+	const unsigned char *data,
+	struct git_midx_chunk *chunk_oid_lookup)
+{
+	uint32_t i;
+	git_oid *oid, *prev_oid, zero_oid = {{0}};
+
+	if (chunk_oid_lookup->offset == 0)
+		return midx_error("missing OID Lookup chunk");
+	if (chunk_oid_lookup->length == 0)
+		return midx_error("empty OID Lookup chunk");
+	/*
+	 * Multiply in 64 bits. num_objects is taken from a 32-bit fanout entry,
+	 * and a 32-bit product could wrap around, letting a chunk that is far
+	 * too small pass this check before the loop below walks num_objects
+	 * entries.
+	 */
+	if ((uint64_t)chunk_oid_lookup->length != (uint64_t)idx->num_objects * 20)
+		return midx_error("OID Lookup chunk has wrong length");
+
+	idx->oid_lookup = oid = (git_oid *)(data + chunk_oid_lookup->offset);
+	prev_oid = &zero_oid;
+	for (i = 0; i < idx->num_objects; ++i, ++oid) {
+		if (git_oid_cmp(prev_oid, oid) >= 0)
+			return midx_error("OID Lookup index is non-monotonic");
+		prev_oid = oid;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the Object Offsets chunk and record it in idx->object_offsets.
+ *
+ * The chunk must hold exactly idx->num_objects entries of 8 bytes each.
+ * git_midx_entry_find() reads each entry as two 32-bit values: the pack
+ * index followed by the offset.
+ *
+ * Must run after midx_parse_oid_fanout(), which sets idx->num_objects.
+ *
+ * Returns 0 on success or -1 (via midx_error) on a malformed chunk.
+ */
+static int midx_parse_object_offsets(
+	git_midx_file *idx,
+	const unsigned char *data,
+	struct git_midx_chunk *chunk_object_offsets)
+{
+	if (chunk_object_offsets->offset == 0)
+		return midx_error("missing Object Offsets chunk");
+	if (chunk_object_offsets->length == 0)
+		return midx_error("empty Object Offsets chunk");
+	/*
+	 * Multiply in 64 bits. num_objects is taken from a 32-bit fanout entry,
+	 * and a 32-bit product could wrap around and accept an undersized chunk
+	 * that lookups would later read past.
+	 */
+	if ((uint64_t)chunk_object_offsets->length != (uint64_t)idx->num_objects * 8)
+		return midx_error("Object Offsets chunk has wrong length");
+
+	idx->object_offsets = data + chunk_object_offsets->offset;
+
+	return 0;
+}
+
+/*
+ * Record the Object Large Offsets chunk when one is present.
+ *
+ * A zero-length chunk is accepted and leaves idx untouched. Otherwise the
+ * length must be a multiple of 8, and the entry count is length / 8.
+ *
+ * Returns 0 on success or -1 (via midx_error) on a malformed chunk.
+ */
+static int midx_parse_object_large_offsets(
+	git_midx_file *idx,
+	const unsigned char *data,
+	struct git_midx_chunk *chunk_object_large_offsets)
+{
+	const size_t length = chunk_object_large_offsets->length;
+
+	/* This chunk is optional: nothing to record when it is absent. */
+	if (length == 0)
+		return 0;
+
+	if ((length % 8) != 0)
+		return midx_error("malformed Object Large Offsets chunk");
+
+	idx->num_object_large_offsets = length / 8;
+	idx->object_large_offsets = data + chunk_object_large_offsets->offset;
+
+	return 0;
+}
+
+/*
+ * Parse an in-memory multi-pack-index and fill in `idx`.
+ *
+ * Steps, in order:
+ *   1. check the header (signature, version, object id version, chunk count);
+ *   2. copy the trailing 20 bytes into idx->checksum and compare them with
+ *      the hash of everything before them;
+ *   3. walk the chunk table, recording the offset of each known chunk and
+ *      deriving each chunk's length from where the next one starts;
+ *   4. hand every chunk to its dedicated parser.
+ *
+ * idx   index to populate; must not be NULL
+ * data  start of the file contents
+ * size  number of bytes available at `data`
+ *
+ * Returns 0 on success or a negative value on error.
+ */
+int git_midx_parse(
+	git_midx_file *idx,
+	const unsigned char *data,
+	size_t size)
+{
+	struct git_midx_header *hdr;
+	const unsigned char *entry;
+	struct git_midx_chunk *open_chunk = NULL;
+	uint32_t i;
+	off64_t prev_offset, chunk_offset, trailer_offset;
+	git_oid computed_checksum = {{0}};
+	int error;
+	struct git_midx_chunk chunk_packfile_names = {0},
+		chunk_oid_fanout = {0},
+		chunk_oid_lookup = {0},
+		chunk_object_offsets = {0},
+		chunk_object_large_offsets = {0};
+
+	assert(idx);
+
+	if (size < sizeof(struct git_midx_header) + 20)
+		return midx_error("multi-pack index is too short");
+
+	hdr = (struct git_midx_header *)data;
+
+	if (hdr->signature != htonl(MIDX_SIGNATURE) ||
+	    hdr->version != MIDX_VERSION ||
+	    hdr->object_id_version != MIDX_OBJECT_ID_VERSION)
+		return midx_error("unsupported multi-pack index version");
+
+	if (hdr->chunks == 0)
+		return midx_error("no chunks in multi-pack index");
+
+	/*
+	 * No chunk may start before the end of the header plus the chunk
+	 * table, which holds one 12-byte entry per chunk and one extra
+	 * terminating entry.
+	 */
+	prev_offset =
+		sizeof(struct git_midx_header) +
+		(1 + hdr->chunks) * 12;
+	trailer_offset = size - 20;
+	if (trailer_offset < prev_offset)
+		return midx_error("wrong index size");
+
+	/* The last 20 bytes are the checksum of everything that precedes them. */
+	git_oid_cpy(&idx->checksum, (git_oid *)(data + trailer_offset));
+	if (git_hash_buf(&computed_checksum, data, (size_t)trailer_offset) < 0)
+		return midx_error("could not calculate signature");
+	if (!git_oid_equal(&computed_checksum, &idx->checksum))
+		return midx_error("index signature mismatch");
+
+	entry = data + sizeof(struct git_midx_header);
+	for (i = 0; i < hdr->chunks; ++i, entry += 12) {
+		struct git_midx_chunk *target;
+		uint32_t offset_hi = ntohl(*((uint32_t *)(entry + 4)));
+		uint32_t offset_lo = ntohl(*((uint32_t *)(entry + 8)));
+
+		chunk_offset = (((off64_t)offset_hi) << 32) | ((off64_t)offset_lo);
+		if (chunk_offset < prev_offset)
+			return midx_error("chunks are non-monotonic");
+		if (chunk_offset >= trailer_offset)
+			return midx_error("chunks extend beyond the trailer");
+
+		/* The previous chunk ends where this one begins. */
+		if (open_chunk != NULL)
+			open_chunk->length = (size_t)(chunk_offset - prev_offset);
+		prev_offset = chunk_offset;
+
+		switch (ntohl(*((uint32_t *)(entry + 0)))) {
+		case MIDX_PACKFILE_NAMES_ID:
+			target = &chunk_packfile_names;
+			break;
+
+		case MIDX_OID_FANOUT_ID:
+			target = &chunk_oid_fanout;
+			break;
+
+		case MIDX_OID_LOOKUP_ID:
+			target = &chunk_oid_lookup;
+			break;
+
+		case MIDX_OBJECT_OFFSETS_ID:
+			target = &chunk_object_offsets;
+			break;
+
+		case MIDX_OBJECT_LARGE_OFFSETS_ID:
+			target = &chunk_object_large_offsets;
+			break;
+
+		default:
+			return midx_error("unrecognized chunk ID");
+		}
+
+		target->offset = chunk_offset;
+		open_chunk = target;
+	}
+	/* The final chunk runs up to the trailer. */
+	open_chunk->length = (size_t)(trailer_offset - prev_offset);
+
+	/* The fanout parser sets num_objects, which the later parsers rely on. */
+	if ((error = midx_parse_packfile_names(
+			idx, data, ntohl(hdr->packfiles), &chunk_packfile_names)) < 0)
+		return error;
+	if ((error = midx_parse_oid_fanout(idx, data, &chunk_oid_fanout)) < 0)
+		return error;
+	if ((error = midx_parse_oid_lookup(idx, data, &chunk_oid_lookup)) < 0)
+		return error;
+	if ((error = midx_parse_object_offsets(idx, data, &chunk_object_offsets)) < 0)
+		return error;
+	if ((error = midx_parse_object_large_offsets(idx, data, &chunk_object_large_offsets)) < 0)
+		return error;
+
+	return 0;
+}
+
+/*
+ * Open, map and parse the multi-pack-index file at `path`.
+ *
+ * idx_out  receives the newly allocated index on success; it is not written
+ *          on failure
+ * path     location of the multi-pack-index file
+ *
+ * The file descriptor is closed on every path before returning; only the
+ * mapping is kept. Release the result with git_midx_free().
+ *
+ * Returns 0 on success or a negative value on error.
+ */
+int git_midx_open(
+	git_midx_file **idx_out,
+	const char *path)
+{
+	git_midx_file *idx;
+	git_file fd = -1;
+	size_t idx_size;
+	struct stat st;
+	int error;
+
+	/* TODO: properly open the file without access time using O_NOATIME */
+	fd = git_futils_open_ro(path);
+	if (fd < 0)
+		return fd;
+
+	if (p_fstat(fd, &st) < 0) {
+		p_close(fd);
+		git_error_set(GIT_ERROR_ODB, "multi-pack-index file not found - '%s'", path);
+		return -1;
+	}
+
+	if (!S_ISREG(st.st_mode) || !git__is_sizet(st.st_size)) {
+		p_close(fd);
+		git_error_set(GIT_ERROR_ODB, "invalid pack index '%s'", path);
+		return -1;
+	}
+	idx_size = (size_t)st.st_size;
+
+	idx = git__calloc(1, sizeof(git_midx_file));
+	/*
+	 * Close the descriptor before the allocation check bails out;
+	 * otherwise it would leak on allocation failure.
+	 */
+	if (idx == NULL)
+		p_close(fd);
+	GIT_ERROR_CHECK_ALLOC(idx);
+
+	error = git_futils_mmap_ro(&idx->index_map, fd, 0, idx_size);
+	/* The descriptor is no longer used after the mmap call. */
+	p_close(fd);
+	if (error < 0) {
+		git_midx_free(idx);
+		return error;
+	}
+
+	if ((error = git_midx_parse(idx, idx->index_map.data, idx_size)) < 0) {
+		git_midx_free(idx);
+		return error;
+	}
+
+	*idx_out = idx;
+	return 0;
+}
+
+/*
+ * Look up an object in a parsed multi-pack-index by full or abbreviated OID.
+ *
+ * e          receives pack_index, offset and the matched OID on success
+ * idx        parsed index; must not be NULL
+ * short_oid  OID (or OID prefix) to search for
+ * len        prefix length handed to git_oid_ncmp(); the ambiguity check is
+ *            skipped when it equals GIT_OID_HEXSZ
+ *
+ * Returns 0 on success. Returns the result of git_odb__error_notfound() when
+ * nothing matches or the large-offset index is out of range, the result of
+ * git_odb__error_ambiguous() when the next entry shares the same prefix, and
+ * -1 (via midx_error) when the stored pack index is out of range.
+ */
+int git_midx_entry_find(
+	git_midx_entry *e,
+	git_midx_file *idx,
+	const git_oid *short_oid,
+	size_t len)
+{
+	int pos, found = 0;
+	size_t pack_index;
+	uint32_t hi, lo;
+	const git_oid *current = NULL;
+	const unsigned char *object_offset;
+	off64_t offset;
+
+	assert(idx);
+
+	/* The fanout table narrows the search to OIDs sharing the first byte. */
+	hi = ntohl(idx->oid_fanout[(int)short_oid->id[0]]);
+	lo = ((short_oid->id[0] == 0x0) ? 0 : ntohl(idx->oid_fanout[(int)short_oid->id[0] - 1]));
+
+	pos = git_pack__lookup_sha1(idx->oid_lookup, 20, lo, hi, short_oid->id);
+
+	if (pos >= 0) {
+		/* An object matching exactly the oid was found */
+		found = 1;
+		current = idx->oid_lookup + pos;
+	} else {
+		/* No object was found */
+		/* pos refers to the object with the "closest" oid to short_oid */
+		pos = -1 - pos;
+		if (pos < (int)idx->num_objects) {
+			current = idx->oid_lookup + pos;
+
+			if (!git_oid_ncmp(short_oid, current, len))
+				found = 1;
+		}
+	}
+
+	if (found && len != GIT_OID_HEXSZ && pos + 1 < (int)idx->num_objects) {
+		/* Check for ambiguity */
+		const git_oid *next = current + 1;
+
+		if (!git_oid_ncmp(short_oid, next, len)) {
+			found = 2;
+		}
+	}
+
+	if (!found)
+		return git_odb__error_notfound("failed to find offset for multi-pack index entry", short_oid, len);
+	if (found > 1)
+		return git_odb__error_ambiguous("found multiple offsets for multi-pack index entry");
+
+	/*
+	 * pos is non-negative here. Widen it before scaling so the byte offset
+	 * is not computed in signed int, which overflows once pos reaches 2^28.
+	 */
+	object_offset = idx->object_offsets + (size_t)pos * 8;
+	offset = ntohl(*((uint32_t *)(object_offset + 4)));
+	if (offset & 0x80000000) {
+		/* High bit set: the low 31 bits index the large offsets table. */
+		uint32_t object_large_offsets_pos = offset & 0x7fffffff;
+		const unsigned char *object_large_offsets_index = idx->object_large_offsets;
+
+		/* Make sure we're not being sent out of bounds */
+		if (object_large_offsets_pos >= idx->num_object_large_offsets)
+			return git_odb__error_notfound("invalid index into the object large offsets table", short_oid, len);
+
+		/* Scale in size_t so the byte offset cannot wrap at 32 bits. */
+		object_large_offsets_index += (size_t)8 * object_large_offsets_pos;
+
+		offset = (((uint64_t)ntohl(*((uint32_t *)(object_large_offsets_index + 0)))) << 32) |
+			ntohl(*((uint32_t *)(object_large_offsets_index + 4)));
+	}
+	pack_index = ntohl(*((uint32_t *)(object_offset + 0)));
+	if (pack_index >= git_vector_length(&idx->packfile_names))
+		return midx_error("invalid index into the packfile names table");
+	e->pack_index = pack_index;
+	e->offset = offset;
+	git_oid_cpy(&e->sha1, current);
+	return 0;
+}
+
+/*
+ * Release what `idx` holds (the file mapping, if any, and the packfile name
+ * vector) without freeing `idx` itself. `idx` must not be NULL.
+ */
+void git_midx_close(git_midx_file *idx)
+{
+	assert(idx);
+
+	/* Only unmap when a mapping exists. */
+	if (idx->index_map.data != NULL) {
+		git_futils_mmap_free(&idx->index_map);
+	}
+
+	git_vector_free(&idx->packfile_names);
+}
+
+/*
+ * Close `idx` and free its memory. Passing NULL is a no-op.
+ */
+void git_midx_free(git_midx_file *idx)
+{
+	if (idx != NULL) {
+		git_midx_close(idx);
+		git__free(idx);
+	}
+}