From 75b97fec171dbbf7ec73960cefb50c265cfb7af7 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Tue, 5 Dec 2017 16:58:43 +0000 Subject: extension.partialclone: introduce partial clone extension Introduce new repository extension option: `extensions.partialclone` See the update to Documentation/technical/repository-version.txt in this patch for more information. Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'cache.h') diff --git a/cache.h b/cache.h index 6440e2bf21..35e3f5ed5f 100644 --- a/cache.h +++ b/cache.h @@ -860,10 +860,12 @@ extern int grafts_replace_parents; #define GIT_REPO_VERSION 0 #define GIT_REPO_VERSION_READ 1 extern int repository_format_precious_objects; +extern char *repository_format_partial_clone; struct repository_format { int version; int precious_objects; + char *partial_clone; /* value of extensions.partialclone */ int is_bare; char *work_tree; struct string_list unknown_extensions; -- cgit v1.2.1 From 498f1f61f123fd66eccc05c1d19356b25b4225b2 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Tue, 5 Dec 2017 16:58:44 +0000 Subject: fsck: introduce partialclone extension Currently, Git does not support repos with very large numbers of objects or repos that wish to minimize manipulation of certain blobs (for example, because they are very large) very well, even if the user operates mostly on part of the repo, because Git is designed on the assumption that every referenced object is available somewhere in the repo storage. In such an arrangement, the full set of objects is usually available in remote storage, ready to be lazily downloaded. Teach fsck about the new state of affairs. In this commit, teach fsck that missing promisor objects referenced from the reflog are not an error case; in future commits, fsck will be taught about other cases. Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- cache.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'cache.h') diff --git a/cache.h b/cache.h index 35e3f5ed5f..c76f2e91d6 100644 --- a/cache.h +++ b/cache.h @@ -1587,7 +1587,8 @@ extern struct packed_git { unsigned pack_local:1, pack_keep:1, freshened:1, - do_not_close:1; + do_not_close:1, + pack_promisor:1; unsigned char sha1[20]; struct revindex_entry *revindex; /* something like ".git/objects/pack/xxxxx.pack" */ -- cgit v1.2.1 From 8b4c0103a98239287176a537f7a04d9b07b49125 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Fri, 8 Dec 2017 15:27:14 +0000 Subject: sha1_file: support lazily fetching missing objects Teach sha1_file to fetch objects from the remote configured in extensions.partialclone whenever an object is requested but missing. The fetching of objects can be suppressed through a global variable. This is used by fsck and index-pack. However, by default, such fetching is not suppressed. This is meant as a temporary measure to ensure that all Git commands work in such a situation. Future patches will update some commands to either tolerate missing objects (without fetching them) or be more efficient in fetching them. In order to determine the code changes in sha1_file.c necessary, I investigated the following: (1) functions in sha1_file.c that take in a hash, without the user regarding how the object is stored (loose or packed) (2) functions in packfile.c (because I need to check callers that know about the loose/packed distinction and operate on both differently, and ensure that they can handle the concept of objects that are neither loose nor packed) (1) is handled by the modification to sha1_object_info_extended(). For (2), I looked at for_each_packed_object and others. For for_each_packed_object, the callers either already work or are fixed in this patch: - reachable - only to find recent objects - builtin/fsck - already knows about missing objects - builtin/cat-file - warning message added in this commit Callers of the other functions do not need to be changed: - parse_pack_index - http - indirectly from http_get_info_packs - find_pack_entry_one - this searches a single pack that is provided as an argument; the caller already knows (through other means) that the sought object is in a specific pack - find_sha1_pack - fast-import - appears to be an optimization to not store a file if it is already in a pack - http-walker - to search through a struct alt_base - http-push - to search through remote packs - has_sha1_pack - builtin/fsck - already knows about promisor objects - builtin/count-objects - informational purposes only (check if loose object is also packed) - builtin/prune-packed - check if object to be pruned is packed (if not, don't prune it) - revision - used to exclude packed objects if requested by user - diff - just for optimization Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- cache.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'cache.h') diff --git a/cache.h b/cache.h index c76f2e91d6..6980072132 100644 --- a/cache.h +++ b/cache.h @@ -1727,6 +1727,14 @@ struct object_info { #define OBJECT_INFO_QUICK 8 extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags); +/* + * Set this to 0 to prevent sha1_object_info_extended() from fetching missing + * blobs. This has a difference only if extensions.partialClone is set. + * + * Its default value is 1. + */ +extern int fetch_if_missing; + /* Dumb servers support */ extern int update_server_info(int); -- cgit v1.2.1