// Copyright (C) 2009-2016 Joel Rosdahl // // This program is free software; you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3 of the License, or (at your option) // any later version. // // This program is distributed in the hope that it will be useful, but WITHOUT // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for // more details. // // You should have received a copy of the GNU General Public License along with // this program; if not, write to the Free Software Foundation, Inc., 51 // Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ccache.h" #include "hashtable_itr.h" #include "hashutil.h" #include "manifest.h" #include "murmurhashneutral2.h" #include // Sketchy specification of the manifest disk format: // // magic number (4 bytes) // file format version (1 byte unsigned int) // size of the hash fields (in bytes) (1 byte unsigned int) // reserved for future use (2 bytes) // ---------------------------------------------------------------------------- // number of include file paths (4 bytes unsigned int) // path to include file (NUL-terminated string, // ... at most 1024 bytes) // // ---------------------------------------------------------------------------- // number of include file hash entries (4 bytes unsigned int) // index of include file path (4 bytes unsigned int) // hash of include file ( bytes) // size of include file (4 bytes unsigned int) // mtime of include file (8 bytes signed int) // ctime of include file (8 bytes signed int) // ... // // // // // // ---------------------------------------------------------------------------- // number of object name entries (4 bytes unsigned int) // number of include file hash indexes (4 bytes unsigned int) // include file hash index (4 bytes unsigned int) // ... // // hash part of object name ( bytes) // size part of object name (4 bytes unsigned int) // ... // number of include file hash indexes // include file hash index // ... // // // static const uint32_t MAGIC = 0x63436d46U; static const uint32_t MAX_MANIFEST_ENTRIES = 100; static const uint32_t MAX_MANIFEST_FILE_INFO_ENTRIES = 10000; #define ccache_static_assert(e) \ do { enum { ccache_static_assert__ = 1/(e) }; } while (false) struct file_info { // Index to n_files. uint32_t index; // Hash of referenced file. uint8_t hash[16]; // Size of referenced file. uint32_t size; // mtime of referenced file. int64_t mtime; // ctime of referenced file. int64_t ctime; }; struct object { // Number of entries in file_info_indexes. uint32_t n_file_info_indexes; // Indexes to file_infos. uint32_t *file_info_indexes; // Hash of the object itself. struct file_hash hash; }; struct manifest { // Version of decoded file. uint8_t version; // Reserved for future use. uint16_t reserved; // Size of hash fields (in bytes). uint8_t hash_size; // Referenced include files. uint32_t n_files; char **files; // Information about referenced include files. uint32_t n_file_infos; struct file_info *file_infos; // Object names plus references to include file hashes. uint32_t n_objects; struct object *objects; }; struct file_stats { uint32_t size; int64_t mtime; int64_t ctime; }; static unsigned int hash_from_file_info(void *key) { ccache_static_assert(sizeof(struct file_info) == 40); // No padding. return murmurhashneutral2(key, sizeof(struct file_info), 0); } static int file_infos_equal(void *key1, void *key2) { struct file_info *fi1 = (struct file_info *)key1; struct file_info *fi2 = (struct file_info *)key2; return fi1->index == fi2->index && memcmp(fi1->hash, fi2->hash, 16) == 0 && fi1->size == fi2->size && fi1->mtime == fi2->mtime && fi1->ctime == fi2->ctime; } static void free_manifest(struct manifest *mf) { for (uint32_t i = 0; i < mf->n_files; i++) { free(mf->files[i]); } free(mf->files); free(mf->file_infos); for (uint32_t i = 0; i < mf->n_objects; i++) { free(mf->objects[i].file_info_indexes); } free(mf->objects); free(mf); } #define READ_BYTE(var) \ do { \ int ch_ = gzgetc(f); \ if (ch_ == EOF) { \ goto error; \ } \ (var) = ch_ & 0xFF; \ } while (false) #define READ_INT(size, var) \ do { \ uint64_t u_ = 0; \ for (size_t i_ = 0; i_ < (size); i_++) { \ int ch_ = gzgetc(f); \ if (ch_ == EOF) { \ goto error; \ } \ u_ <<= 8; \ u_ |= ch_ & 0xFF; \ } \ (var) = u_; \ } while (false) #define READ_STR(var) \ do { \ char buf_[1024]; \ size_t i_; \ for (i_ = 0; i_ < sizeof(buf_); i_++) { \ int ch_ = gzgetc(f); \ if (ch_ == EOF) { \ goto error; \ } \ buf_[i_] = ch_; \ if (ch_ == '\0') { \ break; \ } \ } \ if (i_ == sizeof(buf_)) { \ goto error; \ } \ (var) = x_strdup(buf_); \ } while (false) #define READ_BYTES(n, var) \ do { \ for (size_t i_ = 0; i_ < (n); i_++) { \ int ch_ = gzgetc(f); \ if (ch_ == EOF) { \ goto error; \ } \ (var)[i_] = ch_; \ } \ } while (false) static struct manifest * create_empty_manifest(void) { struct manifest *mf = x_malloc(sizeof(*mf)); mf->hash_size = 16; mf->n_files = 0; mf->files = NULL; mf->n_file_infos = 0; mf->file_infos = NULL; mf->n_objects = 0; mf->objects = NULL; return mf; } static struct manifest * read_manifest(gzFile f) { struct manifest *mf = create_empty_manifest(); uint32_t magic; READ_INT(4, magic); if (magic != MAGIC) { cc_log("Manifest file has bad magic number %u", magic); goto error; } READ_BYTE(mf->version); if (mf->version != MANIFEST_VERSION) { cc_log("Manifest file has unknown version %u", mf->version); goto error; } READ_BYTE(mf->hash_size); if (mf->hash_size != 16) { // Temporary measure until we support different hash algorithms. cc_log("Manifest file has unsupported hash size %u", mf->hash_size); goto error; } READ_INT(2, mf->reserved); READ_INT(4, mf->n_files); mf->files = x_calloc(mf->n_files, sizeof(*mf->files)); for (uint32_t i = 0; i < mf->n_files; i++) { READ_STR(mf->files[i]); } READ_INT(4, mf->n_file_infos); mf->file_infos = x_calloc(mf->n_file_infos, sizeof(*mf->file_infos)); for (uint32_t i = 0; i < mf->n_file_infos; i++) { READ_INT(4, mf->file_infos[i].index); READ_BYTES(mf->hash_size, mf->file_infos[i].hash); READ_INT(4, mf->file_infos[i].size); READ_INT(8, mf->file_infos[i].mtime); READ_INT(8, mf->file_infos[i].ctime); } READ_INT(4, mf->n_objects); mf->objects = x_calloc(mf->n_objects, sizeof(*mf->objects)); for (uint32_t i = 0; i < mf->n_objects; i++) { READ_INT(4, mf->objects[i].n_file_info_indexes); mf->objects[i].file_info_indexes = x_calloc(mf->objects[i].n_file_info_indexes, sizeof(*mf->objects[i].file_info_indexes)); for (uint32_t j = 0; j < mf->objects[i].n_file_info_indexes; j++) { READ_INT(4, mf->objects[i].file_info_indexes[j]); } READ_BYTES(mf->hash_size, mf->objects[i].hash.hash); READ_INT(4, mf->objects[i].hash.size); } return mf; error: cc_log("Corrupt manifest file"); free_manifest(mf); return NULL; } #define WRITE_INT(size, var) \ do { \ uint64_t u_ = (var); \ uint8_t ch_; \ size_t i_; \ for (i_ = 0; i_ < (size); i_++) { \ ch_ = (u_ >> (8 * ((size) - i_ - 1))); \ if (gzputc(f, ch_) == EOF) { \ goto error; \ } \ } \ } while (false) #define WRITE_STR(var) \ do { \ if (gzputs(f, var) == EOF || gzputc(f, '\0') == EOF) { \ goto error; \ } \ } while (false) #define WRITE_BYTES(n, var) \ do { \ size_t i_; \ for (i_ = 0; i_ < (n); i_++) { \ if (gzputc(f, (var)[i_]) == EOF) { \ goto error; \ } \ } \ } while (false) static int write_manifest(gzFile f, const struct manifest *mf) { WRITE_INT(4, MAGIC); WRITE_INT(1, MANIFEST_VERSION); WRITE_INT(1, 16); WRITE_INT(2, 0); WRITE_INT(4, mf->n_files); for (uint32_t i = 0; i < mf->n_files; i++) { WRITE_STR(mf->files[i]); } WRITE_INT(4, mf->n_file_infos); for (uint32_t i = 0; i < mf->n_file_infos; i++) { WRITE_INT(4, mf->file_infos[i].index); WRITE_BYTES(mf->hash_size, mf->file_infos[i].hash); WRITE_INT(4, mf->file_infos[i].size); WRITE_INT(8, mf->file_infos[i].mtime); WRITE_INT(8, mf->file_infos[i].ctime); } WRITE_INT(4, mf->n_objects); for (uint32_t i = 0; i < mf->n_objects; i++) { WRITE_INT(4, mf->objects[i].n_file_info_indexes); for (uint32_t j = 0; j < mf->objects[i].n_file_info_indexes; j++) { WRITE_INT(4, mf->objects[i].file_info_indexes[j]); } WRITE_BYTES(mf->hash_size, mf->objects[i].hash.hash); WRITE_INT(4, mf->objects[i].hash.size); } return 1; error: cc_log("Error writing to manifest file"); return 0; } static int verify_object(struct conf *conf, struct manifest *mf, struct object *obj, struct hashtable *stated_files, struct hashtable *hashed_files) { for (uint32_t i = 0; i < obj->n_file_info_indexes; i++) { struct file_info *fi = &mf->file_infos[obj->file_info_indexes[i]]; char *path = mf->files[fi->index]; struct file_stats *st = hashtable_search(stated_files, path); if (!st) { struct stat file_stat; if (x_stat(path, &file_stat) != 0) { return 0; } st = x_malloc(sizeof(*st)); st->size = file_stat.st_size; st->mtime = file_stat.st_mtime; st->ctime = file_stat.st_ctime; hashtable_insert(stated_files, x_strdup(path), st); } if (conf->sloppiness & SLOPPY_FILE_STAT_MATCHES) { // st->ctime is sometimes 0, so we can't check that both st->ctime and // st->mtime are greater than time_of_compilation. But it's sufficient to // check that either is. if (fi->size == st->size && fi->mtime == st->mtime && fi->ctime == st->ctime && MAX(st->mtime, st->ctime) >= time_of_compilation) { cc_log("size/mtime/ctime hit for %s", path); continue; } else { cc_log("size/mtime/ctime miss for %s", path); } } struct file_hash *actual = hashtable_search(hashed_files, path); if (!actual) { struct mdfour hash; hash_start(&hash); int result = hash_source_code_file(conf, &hash, path); if (result & HASH_SOURCE_CODE_ERROR) { cc_log("Failed hashing %s", path); return 0; } if (result & HASH_SOURCE_CODE_FOUND_TIME) { return 0; } actual = x_malloc(sizeof(*actual)); hash_result_as_bytes(&hash, actual->hash); actual->size = hash.totalN; hashtable_insert(hashed_files, x_strdup(path), actual); } if (memcmp(fi->hash, actual->hash, mf->hash_size) != 0 || fi->size != actual->size) { return 0; } } return 1; } static struct hashtable * create_string_index_map(char **strings, uint32_t len) { struct hashtable *h = create_hashtable(1000, hash_from_string, strings_equal); for (uint32_t i = 0; i < len; i++) { uint32_t *index = x_malloc(sizeof(*index)); *index = i; hashtable_insert(h, x_strdup(strings[i]), index); } return h; } static struct hashtable * create_file_info_index_map(struct file_info *infos, uint32_t len) { struct hashtable *h = create_hashtable(1000, hash_from_file_info, file_infos_equal); for (uint32_t i = 0; i < len; i++) { struct file_info *fi = x_malloc(sizeof(*fi)); *fi = infos[i]; uint32_t *index = x_malloc(sizeof(*index)); *index = i; hashtable_insert(h, fi, index); } return h; } static uint32_t get_include_file_index(struct manifest *mf, char *path, struct hashtable *mf_files) { uint32_t *index = hashtable_search(mf_files, path); if (index) { return *index; } uint32_t n = mf->n_files; mf->files = x_realloc(mf->files, (n + 1) * sizeof(*mf->files)); mf->n_files++; mf->files[n] = x_strdup(path); return n; } static uint32_t get_file_hash_index(struct manifest *mf, char *path, struct file_hash *file_hash, struct hashtable *mf_files, struct hashtable *mf_file_infos) { struct file_info fi; fi.index = get_include_file_index(mf, path, mf_files); memcpy(fi.hash, file_hash->hash, sizeof(fi.hash)); fi.size = file_hash->size; // file_stat.st_{m,c}time has a resolution of 1 second, so we can cache the // file's mtime and ctime only if they're at least one second older than // time_of_compilation. // // st->ctime may be 0, so we have to check time_of_compilation against // MAX(mtime, ctime). struct stat file_stat; if (stat(path, &file_stat) != -1 && time_of_compilation > MAX(file_stat.st_mtime, file_stat.st_ctime)) { fi.mtime = file_stat.st_mtime; fi.ctime = file_stat.st_ctime; } else { fi.mtime = -1; fi.ctime = -1; } uint32_t *fi_index = hashtable_search(mf_file_infos, &fi); if (fi_index) { return *fi_index; } uint32_t n = mf->n_file_infos; mf->file_infos = x_realloc(mf->file_infos, (n + 1) * sizeof(*mf->file_infos)); mf->n_file_infos++; mf->file_infos[n] = fi; return n; } static void add_file_info_indexes(uint32_t *indexes, uint32_t size, struct manifest *mf, struct hashtable *included_files) { if (size == 0) { return; } // path --> index struct hashtable *mf_files = create_string_index_map(mf->files, mf->n_files); // struct file_info --> index struct hashtable *mf_file_infos = create_file_info_index_map(mf->file_infos, mf->n_file_infos); struct hashtable_itr *iter = hashtable_iterator(included_files); uint32_t i = 0; do { char *path = hashtable_iterator_key(iter); struct file_hash *file_hash = hashtable_iterator_value(iter); indexes[i] = get_file_hash_index(mf, path, file_hash, mf_files, mf_file_infos); i++; } while (hashtable_iterator_advance(iter)); assert(i == size); hashtable_destroy(mf_file_infos, 1); hashtable_destroy(mf_files, 1); } static void add_object_entry(struct manifest *mf, struct file_hash *object_hash, struct hashtable *included_files) { uint32_t n_objs = mf->n_objects; mf->objects = x_realloc(mf->objects, (n_objs + 1) * sizeof(*mf->objects)); mf->n_objects++; struct object *obj = &mf->objects[n_objs]; uint32_t n_fii = hashtable_count(included_files); obj->n_file_info_indexes = n_fii; obj->file_info_indexes = x_malloc(n_fii * sizeof(*obj->file_info_indexes)); add_file_info_indexes(obj->file_info_indexes, n_fii, mf, included_files); memcpy(obj->hash.hash, object_hash->hash, mf->hash_size); obj->hash.size = object_hash->size; } // Try to get the object hash from a manifest file. Caller frees. Returns NULL // on failure. struct file_hash * manifest_get(struct conf *conf, const char *manifest_path) { gzFile f = NULL; struct manifest *mf = NULL; struct hashtable *hashed_files = NULL; // path --> struct file_hash struct hashtable *stated_files = NULL; // path --> struct file_stats struct file_hash *fh = NULL; int fd = open(manifest_path, O_RDONLY | O_BINARY); if (fd == -1) { // Cache miss. cc_log("No such manifest file"); goto out; } f = gzdopen(fd, "rb"); if (!f) { close(fd); cc_log("Failed to gzdopen manifest file"); goto out; } mf = read_manifest(f); if (!mf) { cc_log("Error reading manifest file"); goto out; } hashed_files = create_hashtable(1000, hash_from_string, strings_equal); stated_files = create_hashtable(1000, hash_from_string, strings_equal); // Check newest object first since it's a bit more likely to match. for (uint32_t i = mf->n_objects; i > 0; i--) { if (verify_object(conf, mf, &mf->objects[i - 1], stated_files, hashed_files)) { fh = x_malloc(sizeof(*fh)); *fh = mf->objects[i - 1].hash; goto out; } } out: if (hashed_files) { hashtable_destroy(hashed_files, 1); } if (stated_files) { hashtable_destroy(stated_files, 1); } if (f) { gzclose(f); } if (mf) { free_manifest(mf); } return fh; } // Put the object name into a manifest file given a set of included files. // Returns true on success, otherwise false. bool manifest_put(const char *manifest_path, struct file_hash *object_hash, struct hashtable *included_files) { int ret = 0; gzFile f2 = NULL; struct manifest *mf = NULL; char *tmp_file = NULL; // We don't bother to acquire a lock when writing the manifest to disk. A // race between two processes will only result in one lost entry, which is // not a big deal, and it's also very unlikely. int fd1 = open(manifest_path, O_RDONLY | O_BINARY); if (fd1 == -1) { // New file. mf = create_empty_manifest(); } else { gzFile f1 = gzdopen(fd1, "rb"); if (!f1) { cc_log("Failed to gzdopen manifest file"); close(fd1); goto out; } mf = read_manifest(f1); gzclose(f1); if (!mf) { cc_log("Failed to read manifest file; deleting it"); x_unlink(manifest_path); mf = create_empty_manifest(); } } if (mf->n_objects > MAX_MANIFEST_ENTRIES) { // Normally, there shouldn't be many object entries in the manifest since // new entries are added only if an include file has changed but not the // source file, and you typically change source files more often than // header files. However, it's certainly possible to imagine cases where // the manifest will grow large (for instance, a generated header file that // changes for every build), and this must be taken care of since // processing an ever growing manifest eventually will take too much time. // A good way of solving this would be to maintain the object entries in // LRU order and discarding the old ones. An easy way is to throw away all // entries when there are too many. Let's do that for now. cc_log("More than %u entries in manifest file; discarding", MAX_MANIFEST_ENTRIES); free_manifest(mf); mf = create_empty_manifest(); } else if (mf->n_file_infos > MAX_MANIFEST_FILE_INFO_ENTRIES) { // Rarely, file_info entries can grow large in pathological cases where // many included files change, but the main file does not. This also puts // an upper bound on the number of file_info entries. cc_log("More than %u file_info entries in manifest file; discarding", MAX_MANIFEST_FILE_INFO_ENTRIES); free_manifest(mf); mf = create_empty_manifest(); } tmp_file = format("%s.tmp", manifest_path); int fd2 = create_tmp_fd(&tmp_file); f2 = gzdopen(fd2, "wb"); if (!f2) { cc_log("Failed to gzdopen %s", tmp_file); goto out; } add_object_entry(mf, object_hash, included_files); if (write_manifest(f2, mf)) { gzclose(f2); f2 = NULL; if (x_rename(tmp_file, manifest_path) == 0) { ret = 1; } else { cc_log("Failed to rename %s to %s", tmp_file, manifest_path); goto out; } } else { cc_log("Failed to write manifest file"); goto out; } out: if (mf) { free_manifest(mf); } if (tmp_file) { free(tmp_file); } if (f2) { gzclose(f2); } return ret; } bool manifest_dump(const char *manifest_path, FILE *stream) { struct manifest *mf = NULL; gzFile f = NULL; bool ret = false; int fd = open(manifest_path, O_RDONLY | O_BINARY); if (fd == -1) { fprintf(stderr, "No such manifest file: %s\n", manifest_path); goto out; } f = gzdopen(fd, "rb"); if (!f) { fprintf(stderr, "Failed to dzopen manifest file\n"); close(fd); goto out; } mf = read_manifest(f); if (!mf) { fprintf(stderr, "Error reading manifest file\n"); goto out; } fprintf(stream, "Magic: %c%c%c%c\n", (MAGIC >> 24) & 0xFF, (MAGIC >> 16) & 0xFF, (MAGIC >> 8) & 0xFF, MAGIC & 0xFF); fprintf(stream, "Version: %u\n", mf->version); fprintf(stream, "Hash size: %u\n", (unsigned)mf->hash_size); fprintf(stream, "Reserved field: %u\n", (unsigned)mf->reserved); fprintf(stream, "File paths (%u):\n", (unsigned)mf->n_files); for (unsigned i = 0; i < mf->n_files; ++i) { fprintf(stream, " %u: %s\n", i, mf->files[i]); } fprintf(stream, "File infos (%u):\n", (unsigned)mf->n_file_infos); for (unsigned i = 0; i < mf->n_file_infos; ++i) { char *hash; fprintf(stream, " %u:\n", i); fprintf(stream, " Path index: %u\n", mf->file_infos[i].index); hash = format_hash_as_string(mf->file_infos[i].hash, -1); fprintf(stream, " Hash: %s\n", hash); free(hash); fprintf(stream, " Size: %u\n", mf->file_infos[i].size); fprintf(stream, " Mtime: %lld\n", (long long)mf->file_infos[i].mtime); fprintf(stream, " Ctime: %lld\n", (long long)mf->file_infos[i].ctime); } fprintf(stream, "Results (%u):\n", (unsigned)mf->n_objects); for (unsigned i = 0; i < mf->n_objects; ++i) { char *hash; fprintf(stream, " %u:\n", i); fprintf(stream, " File info indexes:"); for (unsigned j = 0; j < mf->objects[i].n_file_info_indexes; ++j) { fprintf(stream, " %u", mf->objects[i].file_info_indexes[j]); } fprintf(stream, "\n"); hash = format_hash_as_string(mf->objects[i].hash.hash, -1); fprintf(stream, " Hash: %s\n", hash); free(hash); fprintf(stream, " Size: %u\n", (unsigned)mf->objects[i].hash.size); } ret = true; out: if (mf) { free_manifest(mf); } if (f) { gzclose(f); } return ret; }