diff options
author | lhchavez <lhchavez@lhchavez.com> | 2020-02-23 22:28:52 +0000 |
---|---|---|
committer | lhchavez <lhchavez@lhchavez.com> | 2020-10-05 05:08:38 -0700 |
commit | 005e77157d5eef9d9c0765ff201e6ec07e7f5d00 (patch) | |
tree | a65b1b054b394fca1237d5f8c001d21547ffcee1 /src/midx.c | |
parent | 6d1f19269f6bdce126689535e86819f704f25d1a (diff) | |
download | libgit2-005e77157d5eef9d9c0765ff201e6ec07e7f5d00.tar.gz |
multipack: Introduce a parser for multi-pack-index files
This change is the first in a series to add support for git's
multi-pack-index. This should speed up large repositories significantly.
Part of: #5399
Diffstat (limited to 'src/midx.c')
-rw-r--r-- | src/midx.c | 418 |
1 files changed, 418 insertions, 0 deletions
diff --git a/src/midx.c b/src/midx.c new file mode 100644 index 000000000..21cfff497 --- /dev/null +++ b/src/midx.c @@ -0,0 +1,418 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ + +#include "midx.h" + +#include "buffer.h" +#include "futils.h" +#include "hash.h" +#include "odb.h" +#include "pack.h" + +#define GIT_MIDX_FILE_MODE 0444 + +#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ +#define MIDX_VERSION 1 +#define MIDX_OBJECT_ID_VERSION 1 +struct git_midx_header { + uint32_t signature; + uint8_t version; + uint8_t object_id_version; + uint8_t chunks; + uint8_t base_midx_files; + uint32_t packfiles; +}; + +#define MIDX_PACKFILE_NAMES_ID 0x504e414d /* "PNAM" */ +#define MIDX_OID_FANOUT_ID 0x4f494446 /* "OIDF" */ +#define MIDX_OID_LOOKUP_ID 0x4f49444c /* "OIDL" */ +#define MIDX_OBJECT_OFFSETS_ID 0x4f4f4646 /* "OOFF" */ +#define MIDX_OBJECT_LARGE_OFFSETS_ID 0x4c4f4646 /* "LOFF" */ + +struct git_midx_chunk { + off64_t offset; + size_t length; +}; + +static int midx_error(const char *message) +{ + git_error_set(GIT_ERROR_ODB, "invalid multi-pack-index file - %s", message); + return -1; +} + +static int midx_parse_packfile_names( + git_midx_file *idx, + const unsigned char *data, + uint32_t packfiles, + struct git_midx_chunk *chunk) +{ + int error; + uint32_t i; + char *packfile_name = (char *)(data + chunk->offset); + size_t chunk_size = chunk->length, len; + if (chunk->offset == 0) + return midx_error("missing Packfile Names chunk"); + if (chunk->length == 0) + return midx_error("empty Packfile Names chunk"); + if ((error = git_vector_init(&idx->packfile_names, packfiles, git__strcmp_cb)) < 0) + return error; + for (i = 0; i < packfiles; ++i) { + len = p_strnlen(packfile_name, chunk_size); + if (len == 0) + return midx_error("empty packfile name"); + if (len + 1 > chunk_size) + return midx_error("unterminated packfile name"); + git_vector_insert(&idx->packfile_names, packfile_name); + if (i && strcmp(git_vector_get(&idx->packfile_names, i - 1), packfile_name) >= 0) + return midx_error("packfile names are not sorted"); + if (strlen(packfile_name) <= strlen(".idx") || git__suffixcmp(packfile_name, ".idx") != 0) + return midx_error("non-.idx packfile name"); + if (strchr(packfile_name, '/') != NULL || strchr(packfile_name, '\\') != NULL) + return midx_error("non-local packfile"); + packfile_name += len + 1; + chunk_size -= len + 1; + } + return 0; +} + +static int midx_parse_oid_fanout( + git_midx_file *idx, + const unsigned char *data, + struct git_midx_chunk *chunk_oid_fanout) +{ + uint32_t i, nr; + if (chunk_oid_fanout->offset == 0) + return midx_error("missing OID Fanout chunk"); + if (chunk_oid_fanout->length == 0) + return midx_error("empty OID Fanout chunk"); + if (chunk_oid_fanout->length != 256 * 4) + return midx_error("OID Fanout chunk has wrong length"); + + idx->oid_fanout = (const uint32_t *)(data + chunk_oid_fanout->offset); + nr = 0; + for (i = 0; i < 256; ++i) { + uint32_t n = ntohl(idx->oid_fanout[i]); + if (n < nr) + return midx_error("index is non-monotonic"); + nr = n; + } + idx->num_objects = nr; + return 0; +} + +static int midx_parse_oid_lookup( + git_midx_file *idx, + const unsigned char *data, + struct git_midx_chunk *chunk_oid_lookup) +{ + uint32_t i; + git_oid *oid, *prev_oid, zero_oid = {{0}}; + + if (chunk_oid_lookup->offset == 0) + return midx_error("missing OID Lookup chunk"); + if (chunk_oid_lookup->length == 0) + return midx_error("empty OID Lookup chunk"); + if (chunk_oid_lookup->length != idx->num_objects * 20) + return midx_error("OID Lookup chunk has wrong length"); + + idx->oid_lookup = oid = (git_oid *)(data + chunk_oid_lookup->offset); + prev_oid = &zero_oid; + for (i = 0; i < idx->num_objects; ++i, ++oid) { + if (git_oid_cmp(prev_oid, oid) >= 0) + return midx_error("OID Lookup index is non-monotonic"); + prev_oid = oid; + } + + return 0; +} + +static int midx_parse_object_offsets( + git_midx_file *idx, + const unsigned char *data, + struct git_midx_chunk *chunk_object_offsets) +{ + if (chunk_object_offsets->offset == 0) + return midx_error("missing Object Offsets chunk"); + if (chunk_object_offsets->length == 0) + return midx_error("empty Object Offsets chunk"); + if (chunk_object_offsets->length != idx->num_objects * 8) + return midx_error("Object Offsets chunk has wrong length"); + + idx->object_offsets = data + chunk_object_offsets->offset; + + return 0; +} + +static int midx_parse_object_large_offsets( + git_midx_file *idx, + const unsigned char *data, + struct git_midx_chunk *chunk_object_large_offsets) +{ + if (chunk_object_large_offsets->length == 0) + return 0; + if (chunk_object_large_offsets->length % 8 != 0) + return midx_error("malformed Object Large Offsets chunk"); + + idx->object_large_offsets = data + chunk_object_large_offsets->offset; + idx->num_object_large_offsets = chunk_object_large_offsets->length / 8; + + return 0; +} + +int git_midx_parse( + git_midx_file *idx, + const unsigned char *data, + size_t size) +{ + struct git_midx_header *hdr; + const unsigned char *chunk_hdr; + struct git_midx_chunk *last_chunk; + uint32_t i; + off64_t last_chunk_offset, chunk_offset, trailer_offset; + git_oid idx_checksum = {{0}}; + int error; + struct git_midx_chunk chunk_packfile_names = {0}, + chunk_oid_fanout = {0}, + chunk_oid_lookup = {0}, + chunk_object_offsets = {0}, + chunk_object_large_offsets = {0}; + + assert(idx); + + if (size < sizeof(struct git_midx_header) + 20) + return midx_error("multi-pack index is too short"); + + hdr = ((struct git_midx_header *)data); + + if (hdr->signature != htonl(MIDX_SIGNATURE) || + hdr->version != MIDX_VERSION || + hdr->object_id_version != MIDX_OBJECT_ID_VERSION) { + return midx_error("unsupported multi-pack index version"); + } + if (hdr->chunks == 0) + return midx_error("no chunks in multi-pack index"); + + /* + * The very first chunk's offset should be after the header, all the chunk + * headers, and a special zero chunk. + */ + last_chunk_offset = + sizeof(struct git_midx_header) + + (1 + hdr->chunks) * 12; + trailer_offset = size - 20; + if (trailer_offset < last_chunk_offset) + return midx_error("wrong index size"); + git_oid_cpy(&idx->checksum, (git_oid *)(data + trailer_offset)); + + if (git_hash_buf(&idx_checksum, data, (size_t)trailer_offset) < 0) + return midx_error("could not calculate signature"); + if (!git_oid_equal(&idx_checksum, &idx->checksum)) + return midx_error("index signature mismatch"); + + chunk_hdr = data + sizeof(struct git_midx_header); + last_chunk = NULL; + for (i = 0; i < hdr->chunks; ++i, chunk_hdr += 12) { + chunk_offset = ((off64_t)ntohl(*((uint32_t *)(chunk_hdr + 4)))) << 32 | + ((off64_t)ntohl(*((uint32_t *)(chunk_hdr + 8)))); + if (chunk_offset < last_chunk_offset) + return midx_error("chunks are non-monotonic"); + if (chunk_offset >= trailer_offset) + return midx_error("chunks extend beyond the trailer"); + if (last_chunk != NULL) + last_chunk->length = (size_t)(chunk_offset - last_chunk_offset); + last_chunk_offset = chunk_offset; + + switch (ntohl(*((uint32_t *)(chunk_hdr + 0)))) { + case MIDX_PACKFILE_NAMES_ID: + chunk_packfile_names.offset = last_chunk_offset; + last_chunk = &chunk_packfile_names; + break; + + case MIDX_OID_FANOUT_ID: + chunk_oid_fanout.offset = last_chunk_offset; + last_chunk = &chunk_oid_fanout; + break; + + case MIDX_OID_LOOKUP_ID: + chunk_oid_lookup.offset = last_chunk_offset; + last_chunk = &chunk_oid_lookup; + break; + + case MIDX_OBJECT_OFFSETS_ID: + chunk_object_offsets.offset = last_chunk_offset; + last_chunk = &chunk_object_offsets; + break; + + case MIDX_OBJECT_LARGE_OFFSETS_ID: + chunk_object_large_offsets.offset = last_chunk_offset; + last_chunk = &chunk_object_large_offsets; + break; + + default: + return midx_error("unrecognized chunk ID"); + } + } + last_chunk->length = (size_t)(trailer_offset - last_chunk_offset); + + error = midx_parse_packfile_names( + idx, data, ntohl(hdr->packfiles), &chunk_packfile_names); + if (error < 0) + return error; + error = midx_parse_oid_fanout(idx, data, &chunk_oid_fanout); + if (error < 0) + return error; + error = midx_parse_oid_lookup(idx, data, &chunk_oid_lookup); + if (error < 0) + return error; + error = midx_parse_object_offsets(idx, data, &chunk_object_offsets); + if (error < 0) + return error; + error = midx_parse_object_large_offsets(idx, data, &chunk_object_large_offsets); + if (error < 0) + return error; + + return 0; +} + +int git_midx_open( + git_midx_file **idx_out, + const char *path) +{ + git_midx_file *idx; + git_file fd = -1; + size_t idx_size; + struct stat st; + int error; + + /* TODO: properly open the file without access time using O_NOATIME */ + fd = git_futils_open_ro(path); + if (fd < 0) + return fd; + + if (p_fstat(fd, &st) < 0) { + p_close(fd); + git_error_set(GIT_ERROR_ODB, "multi-pack-index file not found - '%s'", path); + return -1; + } + + if (!S_ISREG(st.st_mode) || !git__is_sizet(st.st_size)) { + p_close(fd); + git_error_set(GIT_ERROR_ODB, "invalid pack index '%s'", path); + return -1; + } + idx_size = (size_t)st.st_size; + + idx = git__calloc(1, sizeof(git_midx_file)); + GIT_ERROR_CHECK_ALLOC(idx); + + error = git_futils_mmap_ro(&idx->index_map, fd, 0, idx_size); + p_close(fd); + if (error < 0) { + git_midx_free(idx); + return error; + } + + if ((error = git_midx_parse(idx, idx->index_map.data, idx_size)) < 0) { + git_midx_free(idx); + return error; + } + + *idx_out = idx; + return 0; +} + +int git_midx_entry_find( + git_midx_entry *e, + git_midx_file *idx, + const git_oid *short_oid, + size_t len) +{ + int pos, found = 0; + size_t pack_index; + uint32_t hi, lo; + const git_oid *current = NULL; + const unsigned char *object_offset; + off64_t offset; + + assert(idx); + + hi = ntohl(idx->oid_fanout[(int)short_oid->id[0]]); + lo = ((short_oid->id[0] == 0x0) ? 0 : ntohl(idx->oid_fanout[(int)short_oid->id[0] - 1])); + + pos = git_pack__lookup_sha1(idx->oid_lookup, 20, lo, hi, short_oid->id); + + if (pos >= 0) { + /* An object matching exactly the oid was found */ + found = 1; + current = idx->oid_lookup + pos; + } else { + /* No object was found */ + /* pos refers to the object with the "closest" oid to short_oid */ + pos = -1 - pos; + if (pos < (int)idx->num_objects) { + current = idx->oid_lookup + pos; + + if (!git_oid_ncmp(short_oid, current, len)) + found = 1; + } + } + + if (found && len != GIT_OID_HEXSZ && pos + 1 < (int)idx->num_objects) { + /* Check for ambiguousity */ + const git_oid *next = current + 1; + + if (!git_oid_ncmp(short_oid, next, len)) { + found = 2; + } + } + + if (!found) + return git_odb__error_notfound("failed to find offset for multi-pack index entry", short_oid, len); + if (found > 1) + return git_odb__error_ambiguous("found multiple offsets for multi-pack index entry"); + + object_offset = idx->object_offsets + pos * 8; + offset = ntohl(*((uint32_t *)(object_offset + 4))); + if (offset & 0x80000000) { + uint32_t object_large_offsets_pos = offset & 0x7fffffff; + const unsigned char *object_large_offsets_index = idx->object_large_offsets; + + /* Make sure we're not being sent out of bounds */ + if (object_large_offsets_pos >= idx->num_object_large_offsets) + return git_odb__error_notfound("invalid index into the object large offsets table", short_oid, len); + + object_large_offsets_index += 8 * object_large_offsets_pos; + + offset = (((uint64_t)ntohl(*((uint32_t *)(object_large_offsets_index + 0)))) << 32) | + ntohl(*((uint32_t *)(object_large_offsets_index + 4))); + } + pack_index = ntohl(*((uint32_t *)(object_offset + 0))); + if (pack_index >= git_vector_length(&idx->packfile_names)) + return midx_error("invalid index into the packfile names table"); + e->pack_index = pack_index; + e->offset = offset; + git_oid_cpy(&e->sha1, current); + return 0; +} + +void git_midx_close(git_midx_file *idx) +{ + assert(idx); + + if (idx->index_map.data) + git_futils_mmap_free(&idx->index_map); + git_vector_free(&idx->packfile_names); +} + +void git_midx_free(git_midx_file *idx) +{ + if (!idx) + return; + + git_midx_close(idx); + git__free(idx); +} |