diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2022-01-06 14:49:30 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2022-01-06 15:36:10 +0200 |
commit | b8c3d13fd821e90a190cc5cfad3a9e17f18aa416 (patch) | |
tree | 1beb84078d59d5800ade4de7972671c2b823b8b7 | |
parent | 42276af5bd0a48512a23f83db021b6e832c3fb92 (diff) | |
download | gdbm-b8c3d13fd821e90a190cc5cfad3a9e17f18aa416.tar.gz |
Speed up flushing the bucket cache on disk
The implementation of _gdbm_cache_flush becomes prohibitively
inefficient during extensive updates of large databases. The
bug was reported at https://github.com/Perl/perl5/issues/19306.
To fix it, make sure that all changed cache entries are placed at
the head of the cache_mru list, forming a contiguous sequence.
This way a potentially long iteration over all cache entries can be
cut off at the first entry with ca_changed == FALSE.
This commit also gets rid of several superfluous fields in
struct gdbm_file_info:
- cache_entry
Not needed, because the most recently used cache entry
(cache_mru) is always the current one.
- bucket_changed
dbf->cache_mru->ca_changed reflects the status of the current
bucket.
- second_changed
Not needed because _gdbm_cache_flush, which flushes all changed
buckets, is now invoked unconditionally by _gdbm_end_update (and
also whenever dbf->cache_mru changes).
* src/gdbmdefs.h (struct gdbm_file_info): Remove cache_entry. The
current cache entry is cache_mru.
Remove bucket_changed, and second_changed.
All uses changed.
* src/proto.h (_gdbm_current_bucket_changed): New inline function.
* src/bucket.c (_gdbm_cache_flush): Assume all changed elements form
a contiguous sequence beginning with dbf->cache_mru.
(set_cache_entry): Remove. All callers changed.
(lru_link_elem,lru_unlink_elem): Update dbf->bucket as necessary.
(cache_lookup): If the obtained bucket is not changed and is going
to become current, flush all changed cache elements.
* src/update.c (_gdbm_end_update): Call _gdbm_cache_flush unconditionally.
* src/findkey.c: Use dbf->cache_mru instead of the removed dbf->cache_entry.
* src/gdbmseq.c: Likewise.
* tools/gdbmshell.c (_gdbm_print_bucket_cache): Likewise.
* src/falloc.c: Use _gdbm_current_bucket_changed to mark the current
bucket as changed.
* src/gdbmstore.c: Likewise.
* src/gdbmdelete.c: Likewise. Use _gdbm_current_bucket_changed.
* tests/gtcacheopt.c: Fix typo.
* tests/gtload.c: New option: -cachesize
-rw-r--r-- | src/bucket.c | 88 | ||||
-rw-r--r-- | src/falloc.c | 4 | ||||
-rw-r--r-- | src/findkey.c | 20 | ||||
-rw-r--r-- | src/gdbmdefs.h | 7 | ||||
-rw-r--r-- | src/gdbmdelete.c | 8 | ||||
-rw-r--r-- | src/gdbmopen.c | 2 | ||||
-rw-r--r-- | src/gdbmseq.c | 2 | ||||
-rw-r--r-- | src/gdbmstore.c | 3 | ||||
-rw-r--r-- | src/proto.h | 7 | ||||
-rw-r--r-- | src/recover.c | 3 | ||||
-rw-r--r-- | src/update.c | 16 | ||||
-rw-r--r-- | tests/gtcacheopt.c | 2 | ||||
-rw-r--r-- | tests/gtload.c | 15 | ||||
-rw-r--r-- | tools/gdbmshell.c | 2 |
14 files changed, 97 insertions, 82 deletions
diff --git a/src/bucket.c b/src/bucket.c index 6ce08ad..289ae11 100644 --- a/src/bucket.c +++ b/src/bucket.c @@ -41,14 +41,6 @@ _gdbm_new_bucket (GDBM_FILE dbf, hash_bucket *bucket, int bits) for (index = 0; index < dbf->header->bucket_elems; index++) bucket->h_table[index].hash_value = -1; } - -static void -set_cache_entry (GDBM_FILE dbf, cache_elem *elem) -{ - dbf->cache_entry = elem; - dbf->bucket = dbf->cache_entry->ca_bucket; -} - /* Bucket cache table functions */ @@ -90,7 +82,10 @@ cache_tab_lookup_slot (GDBM_FILE dbf, off_t adr) /* LRU list management */ -/* Link ELEM after REF in DBF cache. If REF is NULL, link at head */ +/* + * Link ELEM after REF in DBF cache. If REF is NULL, link at head and + * set DBF->bucket to point to the ca_bucket of ELEM. + */ static void lru_link_elem (GDBM_FILE dbf, cache_elem *elem, cache_elem *ref) { @@ -103,6 +98,7 @@ lru_link_elem (GDBM_FILE dbf, cache_elem *elem, cache_elem *ref) else dbf->cache_lru = elem; dbf->cache_mru = elem; + dbf->bucket = dbf->cache_mru->ca_bucket; } else { @@ -118,7 +114,10 @@ lru_link_elem (GDBM_FILE dbf, cache_elem *elem, cache_elem *ref) } } -/* Unlink ELEM from the list of cache elements in DBF. */ +/* + * Unlink ELEM from the list of cache elements in DBF. + * If cache_mru gets updated, update DBF->bucket accordingly. + */ static void lru_unlink_elem (GDBM_FILE dbf, cache_elem *elem) { @@ -127,7 +126,10 @@ lru_unlink_elem (GDBM_FILE dbf, cache_elem *elem) if ((x = elem->ca_prev)) x->ca_next = elem->ca_next; else - dbf->cache_mru = elem->ca_next; + { + dbf->cache_mru = elem->ca_next; + dbf->bucket = dbf->cache_mru->ca_bucket; + } if ((x = elem->ca_next)) x->ca_prev = elem->ca_prev; else @@ -334,19 +336,34 @@ cache_lookup (GDBM_FILE dbf, off_t adr, cache_elem *ref, cache_elem **ret_elem) dbf->cache_num++; } } + + /* + * If the obtained bucket is not changed and is going to become current, + * flush all changed cache elements. This ensures that changed cache + * elements form a contiguous sequence at the head of the cache list (see + * _gdbm_cache_flush). + */ + if (ref == NULL && !elem->ca_changed) + _gdbm_cache_flush (dbf); + lru_link_elem (dbf, elem, ref); if (rc != cache_failure) *ret_elem = elem; return rc; } -/* Find a bucket for DBF that is pointed to by the bucket directory from - location DIR_INDEX. The bucket cache is first checked to see if it - is already in memory. If not, a bucket may be tossed to read the new - bucket. On success, the requested bucket becomes the "current" bucket - and dbf->bucket points to the correct bucket. On error, the current - bucket remains unchanged. */ - +/* + * Find a bucket for DBF that is pointed to by the bucket directory from + * location DIR_INDEX. The bucket cache is first checked to see if it + * is already in memory. If not, the last recently used bucket may be + * tossed (if the cache is full) to read the new bucket. + * + * On success, the cached entry with the requested bucket is placed at + * the head of the cache list (cache_mru) and the requested bucket becomes + * "current". + * + * On error, the current bucket remains unchanged. + */ int _gdbm_get_bucket (GDBM_FILE dbf, int dir_index) { @@ -367,9 +384,6 @@ _gdbm_get_bucket (GDBM_FILE dbf, int dir_index) dbf->bucket_dir = dir_index; bucket_adr = dbf->dir[dir_index]; - if (dbf->cache_entry && dbf->cache_entry->ca_adr == bucket_adr) - return 0; - switch (cache_lookup (dbf, bucket_adr, NULL, &elem)) { case cache_found: @@ -427,7 +441,6 @@ _gdbm_get_bucket (GDBM_FILE dbf, int dir_index) case cache_failure: return -1; } - set_cache_entry (dbf, elem); return 0; } @@ -462,7 +475,14 @@ _gdbm_split_bucket (GDBM_FILE dbf, int next_insert) new_bits = dbf->bucket->bucket_bits + 1; - /* Allocate two new buckets */ + /* + * Allocate two new buckets. They will be populated with the entries + * from the current bucket (cache_mru->bucket), so make sure that + * cache_mru remains unchanged until both buckets are fully formed. + * Newly allocated buckets must be linked right after cache_mru, so + * that all changed buckets form a contiguous sequence at the beginning + * of the cache list (this is needed by _gdbm_cache_flush). + */ adr_0 = _gdbm_alloc (dbf, dbf->header->bucket_size); switch (cache_lookup (dbf, adr_0, dbf->cache_mru, &newcache[0])) { @@ -603,17 +623,15 @@ _gdbm_split_bucket (GDBM_FILE dbf, int next_insert) /* Set changed flags. */ newcache[0]->ca_changed = TRUE; newcache[1]->ca_changed = TRUE; - dbf->bucket_changed = TRUE; dbf->directory_changed = TRUE; - dbf->second_changed = TRUE; /* Update the cache! */ dbf->bucket_dir = _gdbm_bucket_dir (dbf, next_insert); /* Invalidate old cache entry. */ - old_bucket.av_adr = dbf->cache_entry->ca_adr; + old_bucket.av_adr = dbf->cache_mru->ca_adr; old_bucket.av_size = dbf->header->bucket_size; - cache_elem_free (dbf, dbf->cache_entry); + cache_elem_free (dbf, dbf->cache_mru); /* Set dbf->bucket to the proper bucket. */ if (dbf->dir[dbf->bucket_dir] != adr_0) @@ -630,7 +648,6 @@ _gdbm_split_bucket (GDBM_FILE dbf, int next_insert) lru_unlink_elem (dbf, newcache[0]); lru_link_elem (dbf, newcache[0], NULL); - set_cache_entry (dbf, newcache[0]); } /* Get rid of old directories. */ @@ -725,18 +742,19 @@ _gdbm_cache_free (GDBM_FILE dbf) } } -/* Flush cache content to disk. */ +/* + * Flush cache content to disk. + * All cache elements with the changed buckets form a contiguous sequence + * at the head of the cache list (starting with cache_mru). + */ int _gdbm_cache_flush (GDBM_FILE dbf) { cache_elem *elem; - for (elem = dbf->cache_lru; elem; elem = elem->ca_prev) + for (elem = dbf->cache_mru; elem && elem->ca_changed; elem = elem->ca_next) { - if (elem->ca_changed) - { - if (_gdbm_write_bucket (dbf, elem)) - return -1; - } + if (_gdbm_write_bucket (dbf, elem)) + return -1; } return 0; } diff --git a/src/falloc.c b/src/falloc.c index f1cad4f..1c3115b 100644 --- a/src/falloc.c +++ b/src/falloc.c @@ -515,7 +515,7 @@ adjust_bucket_avail (GDBM_FILE dbf) av_el = dbf->avail->av_table[dbf->avail->count]; _gdbm_put_av_elem (av_el, dbf->bucket->bucket_avail, &dbf->bucket->av_count, dbf->coalesce_blocks); - dbf->bucket_changed = TRUE; + _gdbm_current_bucket_changed (dbf); } return 0; } @@ -533,7 +533,7 @@ adjust_bucket_avail (GDBM_FILE dbf) _gdbm_put_av_elem (av_el, dbf->avail->av_table, &dbf->avail->count, dbf->coalesce_blocks); - dbf->bucket_changed = TRUE; + _gdbm_current_bucket_changed (dbf); } return 0; } diff --git a/src/findkey.c b/src/findkey.c index d4c0d01..de8cbbf 100644 --- a/src/findkey.c +++ b/src/findkey.c @@ -67,8 +67,8 @@ _gdbm_read_entry (GDBM_FILE dbf, int elem_loc) data_cache_elem *data_ca; /* Is it already in the cache? */ - if (dbf->cache_entry->ca_data.elem_loc == elem_loc) - return dbf->cache_entry->ca_data.dptr; + if (dbf->cache_mru->ca_data.elem_loc == elem_loc) + return dbf->cache_mru->ca_data.dptr; if (!gdbm_bucket_element_valid_p (dbf, elem_loc)) { @@ -80,7 +80,7 @@ _gdbm_read_entry (GDBM_FILE dbf, int elem_loc) key_size = dbf->bucket->h_table[elem_loc].key_size; data_size = dbf->bucket->h_table[elem_loc].data_size; dsize = key_size + data_size; - data_ca = &dbf->cache_entry->ca_data; + data_ca = &dbf->cache_mru->ca_data; /* Make sure data_ca has sufficient space to accommodate both key and content. */ @@ -179,17 +179,17 @@ _gdbm_findkey (GDBM_FILE dbf, datum key, char **ret_dptr, int *ret_hash_val) return -1; /* Is the element the last one found for this bucket? */ - if (dbf->cache_entry->ca_data.elem_loc != -1 - && new_hash_val == dbf->cache_entry->ca_data.hash_val - && dbf->cache_entry->ca_data.key_size == key.dsize - && dbf->cache_entry->ca_data.dptr != NULL - && memcmp (dbf->cache_entry->ca_data.dptr, key.dptr, key.dsize) == 0) + if (dbf->cache_mru->ca_data.elem_loc != -1 + && new_hash_val == dbf->cache_mru->ca_data.hash_val + && dbf->cache_mru->ca_data.key_size == key.dsize + && dbf->cache_mru->ca_data.dptr != NULL + && memcmp (dbf->cache_mru->ca_data.dptr, key.dptr, key.dsize) == 0) { GDBM_DEBUG (GDBM_DEBUG_LOOKUP, "%s: found in cache", dbf->name); /* This is it. Return the cache pointer. */ if (ret_dptr) - *ret_dptr = dbf->cache_entry->ca_data.dptr + key.dsize; - return dbf->cache_entry->ca_data.elem_loc; + *ret_dptr = dbf->cache_mru->ca_data.dptr + key.dsize; + return dbf->cache_mru->ca_data.elem_loc; } /* It is not the cached value, search for element in the bucket. */ diff --git a/src/gdbmdefs.h b/src/gdbmdefs.h index b651f92..3348f9d 100644 --- a/src/gdbmdefs.h +++ b/src/gdbmdefs.h @@ -277,10 +277,7 @@ struct gdbm_file_info cache_elem *cache_lru; /* Last recently used element - tail of the list */ cache_elem *cache_avail; /* Pool of available elements (linked by prev, next) */ - /* Pointer to the current bucket's cache entry. */ - cache_elem *cache_entry; - - /* Points to the current hash bucket in the cache. */ + /* Points to dbf->cache_mru.ca_bucket -- the current hash bucket */ hash_bucket *bucket; /* The directory entry used to get the current hash bucket. */ @@ -294,8 +291,6 @@ struct gdbm_file_info end of an update. */ unsigned header_changed :1; unsigned directory_changed :1; - unsigned bucket_changed :1; - unsigned second_changed :1; off_t file_size; /* Cached value of the current disk file size. If -1, fstat will be used to retrieve it. */ diff --git a/src/gdbmdelete.c b/src/gdbmdelete.c index 948c8a1..9206658 100644 --- a/src/gdbmdelete.c +++ b/src/gdbmdelete.c @@ -85,12 +85,12 @@ gdbm_delete (GDBM_FILE dbf, datum key) return -1; /* Set the flags. */ - dbf->bucket_changed = TRUE; + _gdbm_current_bucket_changed (dbf); /* Invalidate data cache for the current bucket. */ - dbf->cache_entry->ca_data.hash_val = -1; - dbf->cache_entry->ca_data.key_size = 0; - dbf->cache_entry->ca_data.elem_loc = -1; + dbf->cache_mru->ca_data.hash_val = -1; + dbf->cache_mru->ca_data.key_size = 0; + dbf->cache_mru->ca_data.elem_loc = -1; /* Do the writes. */ return _gdbm_end_update (dbf); diff --git a/src/gdbmopen.c b/src/gdbmopen.c index 08d42bc..183159c 100644 --- a/src/gdbmopen.c +++ b/src/gdbmopen.c @@ -673,8 +673,6 @@ gdbm_fd_open (int fd, const char *file_name, int block_size, dbf->bucket_dir = 0; dbf->header_changed = FALSE; dbf->directory_changed = FALSE; - dbf->bucket_changed = FALSE; - dbf->second_changed = FALSE; if (flags & GDBM_XVERIFY) { diff --git a/src/gdbmseq.c b/src/gdbmseq.c index f938c5a..52bddbd 100644 --- a/src/gdbmseq.c +++ b/src/gdbmseq.c @@ -68,7 +68,7 @@ get_next_key (GDBM_FILE dbf, int elem_loc, datum *return_val) /* Find the next bucket. It is possible several entries in the bucket directory point to the same bucket. */ while (dbf->bucket_dir < GDBM_DIR_COUNT (dbf) - && dbf->cache_entry->ca_adr == dbf->dir[dbf->bucket_dir]) + && dbf->cache_mru->ca_adr == dbf->dir[dbf->bucket_dir]) dbf->bucket_dir++; /* Check to see if there was a next bucket. */ diff --git a/src/gdbmstore.c b/src/gdbmstore.c index f860d80..7491ffc 100644 --- a/src/gdbmstore.c +++ b/src/gdbmstore.c @@ -190,8 +190,7 @@ gdbm_store (GDBM_FILE dbf, datum key, datum content, int flags) } /* Current bucket has changed. */ - dbf->cache_entry->ca_changed = TRUE; - dbf->bucket_changed = TRUE; + _gdbm_current_bucket_changed (dbf); /* Write everything that is needed to the disk. */ return _gdbm_end_update (dbf); diff --git a/src/proto.h b/src/proto.h index 3b03c73..40a396a 100644 --- a/src/proto.h +++ b/src/proto.h @@ -27,6 +27,13 @@ int _gdbm_cache_init (GDBM_FILE, size_t); void _gdbm_cache_free (GDBM_FILE dbf); int _gdbm_cache_flush (GDBM_FILE dbf); +/* Mark current bucket as changed. */ +static inline void +_gdbm_current_bucket_changed (GDBM_FILE dbf) +{ + dbf->cache_mru->ca_changed = TRUE; +} + /* Return true if the directory entry at DIR_INDEX can be considered valid. This means that DIR_INDEX is in the valid range for addressing the dir array, and the offset stored in dir[DIR_INDEX] points past diff --git a/src/recover.c b/src/recover.c index dbc187b..9ef43d1 100644 --- a/src/recover.c +++ b/src/recover.c @@ -168,12 +168,9 @@ _gdbm_finish_transfer (GDBM_FILE dbf, GDBM_FILE new_dbf, dbf->cache_mru = new_dbf->cache_mru; dbf->cache_lru = new_dbf->cache_lru; dbf->cache_avail = new_dbf->cache_avail; - dbf->cache_entry = new_dbf->cache_entry; dbf->header_changed = new_dbf->header_changed; dbf->directory_changed = new_dbf->directory_changed; - dbf->bucket_changed = new_dbf->bucket_changed; - dbf->second_changed = new_dbf->second_changed; dbf->file_size = -1; diff --git a/src/update.c b/src/update.c index 7e16eeb..2faa116 100644 --- a/src/update.c +++ b/src/update.c @@ -63,20 +63,8 @@ _gdbm_end_update (GDBM_FILE dbf) off_t file_pos; /* Return value for lseek. */ int rc; - /* Write the current bucket. */ - if (dbf->bucket_changed && (dbf->cache_entry != NULL)) - { - if (_gdbm_write_bucket (dbf, dbf->cache_entry)) - return -1; - dbf->bucket_changed = FALSE; - } - - /* Write the other changed buckets if there are any. */ - if (dbf->second_changed) - { - _gdbm_cache_flush (dbf); - dbf->second_changed = FALSE; - } + /* Write the changed buckets if there are any. */ + _gdbm_cache_flush (dbf); /* Write the directory. */ if (dbf->directory_changed) diff --git a/tests/gtcacheopt.c b/tests/gtcacheopt.c index 3f23714..088e6d8 100644 --- a/tests/gtcacheopt.c +++ b/tests/gtcacheopt.c @@ -202,7 +202,7 @@ main (int argc, char **argv) i = CACHE_SIZE; if (gdbm_setopt (dbf, GDBM_SETCACHESIZE, &i, sizeof (i))) { - fprintf (stderr, "GDBM_GETCACHESIZE: %s\n", gdbm_strerror (gdbm_errno)); + fprintf (stderr, "GDBM_SETCACHESIZE: %s\n", gdbm_strerror (gdbm_errno)); return 1; } diff --git a/tests/gtload.c b/tests/gtload.c index d843111..1fcafb2 100644 --- a/tests/gtload.c +++ b/tests/gtload.c @@ -96,6 +96,7 @@ main (int argc, char **argv) int recover = 0; gdbm_recovery rcvr; int rcvr_flags = 0; + size_t cache_size = 0; progname = canonical_progname (argv[0]); #ifdef GDBM_DEBUG_ENABLE @@ -135,6 +136,8 @@ main (int argc, char **argv) delim = arg[7]; else if (strcmp (arg, "-recover") == 0) recover = 1; + else if (strncmp (arg, "-cachesize=", 11) == 0) + cache_size = read_size (arg + 11); else if (strcmp (arg, "-verbose") == 0) { verbose = 1; @@ -213,7 +216,17 @@ main (int argc, char **argv) gdbm_strerror (gdbm_errno)); exit (1); } - } + } + if (cache_size) + { + if (gdbm_setopt (dbf, GDBM_SETCACHESIZE, &cache_size, + sizeof (cache_size))) + { + fprintf (stderr, "GDBM_SETCACHESIZE failed: %s\n", + gdbm_strerror (gdbm_errno)); + exit (1); + } + } if (verbose) { diff --git a/tools/gdbmshell.c b/tools/gdbmshell.c index 3a85211..22c4938 100644 --- a/tools/gdbmshell.c +++ b/tools/gdbmshell.c @@ -373,7 +373,7 @@ _gdbm_print_bucket_cache (FILE *fp, GDBM_FILE dbf) fprintf (fp, _("Bucket Cache (size %zu/%zu):\n Index: Address Changed Data_Hash \n"), dbf->cache_num, dbf->cache_size); - for (elem = dbf->cache_entry, i = 0; elem; elem = elem->ca_next, i++) + for (elem = dbf->cache_mru, i = 0; elem; elem = elem->ca_next, i++) { fprintf (fp, " %5d: %15lu %7s %x\n", i, |