summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRusty Russell <rusty@rustcorp.com.au>2011-06-20 18:40:33 +0930
committerRusty Russell <rusty@rustcorp.com.au>2011-06-20 11:18:35 +0200
commitd24ddb0350ddb402bd9d219e129439cdbd77ecfe (patch)
tree7d1d7a632788ae10de780d0e664baa01264bdba3
parentd925b327f4703cc141c0a7f3eec912dba8440880 (diff)
downloadsamba-d24ddb0350ddb402bd9d219e129439cdbd77ecfe.tar.gz
tdb2: add lib/tdb2 (from CCAN init-1161-g661d41f)
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
-rw-r--r--lib/tdb2/LICENSE165
-rw-r--r--lib/tdb2/_info91
-rw-r--r--lib/tdb2/check.c835
-rw-r--r--lib/tdb2/doc/TDB1_porting.txt44
-rw-r--r--lib/tdb2/doc/design-1.3.txt1049
-rw-r--r--lib/tdb2/doc/design.lyx2689
-rw-r--r--lib/tdb2/doc/design.lyx,v4679
-rw-r--r--lib/tdb2/doc/design.pdfbin0 -> 240440 bytes
-rw-r--r--lib/tdb2/doc/design.txt1258
-rw-r--r--lib/tdb2/free.c968
-rw-r--r--lib/tdb2/hash.c881
-rw-r--r--lib/tdb2/io.c615
-rw-r--r--lib/tdb2/lock.c875
-rw-r--r--lib/tdb2/open.c661
-rw-r--r--lib/tdb2/private.h614
-rw-r--r--lib/tdb2/summary.c282
-rw-r--r--lib/tdb2/tdb.c484
-rw-r--r--lib/tdb2/tdb2.h846
-rw-r--r--lib/tdb2/test/external-agent.c250
-rw-r--r--lib/tdb2/test/external-agent.h43
-rw-r--r--lib/tdb2/test/failtest_helper.c117
-rw-r--r--lib/tdb2/test/failtest_helper.h17
-rw-r--r--lib/tdb2/test/layout.c348
-rw-r--r--lib/tdb2/test/layout.h68
-rw-r--r--lib/tdb2/test/lock-tracking.c147
-rw-r--r--lib/tdb2/test/lock-tracking.h25
-rw-r--r--lib/tdb2/test/logging.c24
-rw-r--r--lib/tdb2/test/logging.h15
-rw-r--r--lib/tdb2/test/run-001-encode.c48
-rw-r--r--lib/tdb2/test/run-001-fls.c40
-rw-r--r--lib/tdb2/test/run-01-new_database.c42
-rw-r--r--lib/tdb2/test/run-02-expand.c80
-rw-r--r--lib/tdb2/test/run-03-coalesce.c170
-rw-r--r--lib/tdb2/test/run-04-basichash.c267
-rw-r--r--lib/tdb2/test/run-05-readonly-open.c88
-rw-r--r--lib/tdb2/test/run-10-simple-store.c76
-rw-r--r--lib/tdb2/test/run-11-simple-fetch.c76
-rw-r--r--lib/tdb2/test/run-12-store.c58
-rw-r--r--lib/tdb2/test/run-13-delete.c207
-rw-r--r--lib/tdb2/test/run-14-exists.c57
-rw-r--r--lib/tdb2/test/run-15-append.c135
-rw-r--r--lib/tdb2/test/run-16-wipe_all.c50
-rw-r--r--lib/tdb2/test/run-20-growhash.c144
-rw-r--r--lib/tdb2/test/run-21-parse_record.c70
-rw-r--r--lib/tdb2/test/run-25-hashoverload.c121
-rw-r--r--lib/tdb2/test/run-30-exhaust-before-expand.c79
-rw-r--r--lib/tdb2/test/run-50-multiple-freelists.c71
-rw-r--r--lib/tdb2/test/run-55-transaction.c75
-rw-r--r--lib/tdb2/test/run-56-open-during-transaction.c175
-rw-r--r--lib/tdb2/test/run-57-die-during-transaction.c275
-rw-r--r--lib/tdb2/test/run-64-bit-tdb.c80
-rw-r--r--lib/tdb2/test/run-80-tdb_fd.c35
-rw-r--r--lib/tdb2/test/run-81-seqnum.c71
-rw-r--r--lib/tdb2/test/run-82-lockattr.c263
-rw-r--r--lib/tdb2/test/run-83-openhook.c98
-rw-r--r--lib/tdb2/test/run-90-get-set-attributes.c165
-rw-r--r--lib/tdb2/test/run-91-get-stats.c59
-rw-r--r--lib/tdb2/test/run-add-remove-flags.c93
-rw-r--r--lib/tdb2/test/run-check-callback.c90
-rw-r--r--lib/tdb2/test/run-expand-in-transaction.c45
-rw-r--r--lib/tdb2/test/run-features.c70
-rw-r--r--lib/tdb2/test/run-firstkey-nextkey.c162
-rw-r--r--lib/tdb2/test/run-fork-test.c180
-rw-r--r--lib/tdb2/test/run-lockall.c80
-rw-r--r--lib/tdb2/test/run-locktimeout.c192
-rw-r--r--lib/tdb2/test/run-missing-entries.c48
-rw-r--r--lib/tdb2/test/run-open-multiple-times.c84
-rw-r--r--lib/tdb2/test/run-record-expand.c53
-rw-r--r--lib/tdb2/test/run-remap-in-read_traverse.c65
-rw-r--r--lib/tdb2/test/run-seed.c67
-rw-r--r--lib/tdb2/test/run-simple-delete.c42
-rw-r--r--lib/tdb2/test/run-summary.c60
-rw-r--r--lib/tdb2/test/run-tdb_errorstr.c59
-rw-r--r--lib/tdb2/test/run-traverse.c211
-rw-r--r--lib/tdb2/tools/Makefile16
-rw-r--r--lib/tdb2/tools/growtdb-bench.c112
-rw-r--r--lib/tdb2/tools/mktdb2.c29
-rw-r--r--lib/tdb2/tools/speed.c440
-rw-r--r--lib/tdb2/tools/tdb2dump.c115
-rw-r--r--lib/tdb2/tools/tdb2restore.c227
-rw-r--r--lib/tdb2/tools/tdb2tool.c798
-rw-r--r--lib/tdb2/tools/tdb2torture.c494
-rw-r--r--lib/tdb2/transaction.c1308
-rw-r--r--lib/tdb2/traverse.c99
84 files changed, 26504 insertions, 0 deletions
diff --git a/lib/tdb2/LICENSE b/lib/tdb2/LICENSE
new file mode 100644
index 00000000000..cca7fc278f5
--- /dev/null
+++ b/lib/tdb2/LICENSE
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/lib/tdb2/_info b/lib/tdb2/_info
new file mode 100644
index 00000000000..7213d67a22a
--- /dev/null
+++ b/lib/tdb2/_info
@@ -0,0 +1,91 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * tdb2 - [[WORK IN PROGRESS!]] The trivial (64bit transactional) database
+ *
+ * The tdb2 module provides an efficient keyword data mapping (usually
+ * within a file). It supports transactions, so the contents of the
+ * database are reliable even across crashes.
+ *
+ * Example:
+ * #include <ccan/tdb2/tdb2.h>
+ * #include <ccan/str/str.h>
+ * #include <err.h>
+ * #include <stdio.h>
+ *
+ * static void usage(const char *argv0)
+ * {
+ * errx(1, "Usage: %s fetch <dbfile> <key>\n"
+ * "OR %s store <dbfile> <key> <data>", argv0, argv0);
+ * }
+ *
+ * int main(int argc, char *argv[])
+ * {
+ * struct tdb_context *tdb;
+ * TDB_DATA key, value;
+ * enum TDB_ERROR error;
+ *
+ * if (argc < 4)
+ * usage(argv[0]);
+ *
+ * tdb = tdb_open(argv[2], TDB_DEFAULT, O_CREAT|O_RDWR,0600, NULL);
+ * if (!tdb)
+ * err(1, "Opening %s", argv[2]);
+ *
+ * key.dptr = (void *)argv[3];
+ * key.dsize = strlen(argv[3]);
+ *
+ * if (streq(argv[1], "fetch")) {
+ * if (argc != 4)
+ * usage(argv[0]);
+ * error = tdb_fetch(tdb, key, &value);
+ * if (error)
+ * errx(1, "fetch %s: %s",
+ * argv[3], tdb_errorstr(error));
+ * printf("%.*s\n", value.dsize, (char *)value.dptr);
+ * free(value.dptr);
+ * } else if (streq(argv[1], "store")) {
+ * if (argc != 5)
+ * usage(argv[0]);
+ * value.dptr = (void *)argv[4];
+ * value.dsize = strlen(argv[4]);
+ * error = tdb_store(tdb, key, value, 0);
+ * if (error)
+ * errx(1, "store %s: %s",
+ * argv[3], tdb_errorstr(error));
+ * } else
+ * usage(argv[0]);
+ *
+ * return 0;
+ * }
+ *
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * Author: Rusty Russell
+ *
+ * License: LGPLv3 (or later)
+ */
+/*
+ * Answers the "depends" query by printing one required CCAN module per
+ * line.  Returns 0 on success, 1 for a wrong argument count or any
+ * other query.
+ */
+int main(int argc, char *argv[])
+{
+	/* Exactly one argument (the query name) is expected. */
+	if (argc != 2)
+		return 1;
+
+	/* "depends" is the only query this module answers. */
+	if (strcmp(argv[1], "depends") != 0)
+		return 1;
+
+	printf("ccan/asprintf\n");
+	printf("ccan/hash\n");
+	printf("ccan/likely\n");
+	printf("ccan/asearch\n");
+	printf("ccan/compiler\n");
+	printf("ccan/build_assert\n");
+	printf("ccan/ilog\n");
+	printf("ccan/failtest\n");
+	printf("ccan/tally\n");
+	printf("ccan/typesafe_cb\n");
+	printf("ccan/cast\n");
+	printf("ccan/endian\n");
+	return 0;
+}
diff --git a/lib/tdb2/check.c b/lib/tdb2/check.c
new file mode 100644
index 00000000000..52fb188764d
--- /dev/null
+++ b/lib/tdb2/check.c
@@ -0,0 +1,835 @@
+ /*
+ Trivial Database 2: database consistency checking
+ Copyright (C) Rusty Russell 2010
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/asearch/asearch.h>
+
+/*
+ * We keep an ordered array of offsets: this grows *arr by one slot and
+ * stores off in it.  Returns false, leaving *arr and *num untouched,
+ * if realloc() fails.
+ */
+static bool append(tdb_off_t **arr, size_t *num, tdb_off_t off)
+{
+	size_t count = *num;
+	tdb_off_t *grown;
+
+	grown = realloc(*arr, (count + 1) * sizeof(tdb_off_t));
+	if (grown == NULL)
+		return false;
+
+	grown[count] = off;
+	*num = count + 1;
+	*arr = grown;
+	return true;
+}
+
+/*
+ * Read the header at offset 0 and validate its hash test value, magic
+ * string, feature bits and recovery offset.
+ *
+ * On return *features holds the offered feature bits and *recovery the
+ * recovery offset from the header (both are stored before the recovery
+ * offset itself is range-checked).
+ */
+static enum TDB_ERROR check_header(struct tdb_context *tdb, tdb_off_t *recovery,
+				   uint64_t *features)
+{
+	struct tdb_header hdr;
+	uint64_t expected_hash = TDB_HASH_MAGIC;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_read_convert(tdb, 0, &hdr, sizeof(hdr));
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	/* magic food should not be converted, so convert back. */
+	tdb_convert(tdb, hdr.magic_food, sizeof(hdr.magic_food));
+
+	/* Hashing the magic constant must reproduce the stored test value. */
+	expected_hash = tdb_hash(tdb, &expected_hash, sizeof(expected_hash));
+	if (hdr.hash_test != expected_hash)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "check: hash test %llu should be %llu",
+				  (long long)hdr.hash_test,
+				  (long long)expected_hash);
+
+	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "check: bad magic '%.*s'",
+				  (unsigned)sizeof(hdr.magic_food),
+				  hdr.magic_food);
+
+	/* Features which are used must be a subset of features offered. */
+	if (hdr.features_used & ~hdr.features_offered)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "check: features used (0x%llx) which"
+				  " are not offered (0x%llx)",
+				  (long long)hdr.features_used,
+				  (long long)hdr.features_offered);
+
+	*features = hdr.features_offered;
+	*recovery = hdr.recovery;
+
+	/* Don't check reserved: they *can* be used later. */
+	if (*recovery == 0)
+		return TDB_SUCCESS;
+
+	/* A recovery area must lie after the header and within the file. */
+	if (*recovery >= sizeof(hdr) && *recovery <= tdb->file->map_size)
+		return TDB_SUCCESS;
+
+	return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+			  "tdb_check:"
+			  " invalid recovery offset %zu",
+			  (size_t)*recovery);
+}
+
+/* Forward declaration: check_hash_chain() and check_hash_record() below
+ * call check_hash_tree(), which in turn calls check_hash_record(). */
+static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
+ tdb_off_t off, unsigned int group_bits,
+ uint64_t hprefix,
+ unsigned hprefix_bits,
+ tdb_off_t used[],
+ size_t num_used,
+ size_t *num_found,
+ enum TDB_ERROR (*check)(TDB_DATA,
+ TDB_DATA, void *),
+ void *data);
+
+/*
+ * Walk a hash chain starting at the record at off.  Each link's header
+ * is validated, its entries are checked with check_hash_tree() (all 64
+ * hash bits consumed, so every entry must hash to exactly `hash`), and
+ * the walk follows the `next` field until it reads zero.  Every link
+ * after the first bumps *num_found.
+ */
+static enum TDB_ERROR check_hash_chain(struct tdb_context *tdb,
+				       tdb_off_t off,
+				       uint64_t hash,
+				       tdb_off_t used[],
+				       size_t num_used,
+				       size_t *num_found,
+				       enum TDB_ERROR (*check)(TDB_DATA,
+							       TDB_DATA,
+							       void *),
+				       void *data)
+{
+	for (;;) {
+		struct tdb_used_record link;
+		enum TDB_ERROR ecode;
+
+		ecode = tdb_read_convert(tdb, off, &link, sizeof(link));
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+
+		if (rec_magic(&link) != TDB_CHAIN_MAGIC)
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "tdb_check: Bad hash chain magic %llu",
+					  (long long)rec_magic(&link));
+
+		if (rec_data_length(&link) != sizeof(struct tdb_chain))
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "tdb_check:"
+					  " Bad hash chain length %llu vs %zu",
+					  (long long)rec_data_length(&link),
+					  sizeof(struct tdb_chain));
+
+		if (rec_key_length(&link) != 0)
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "tdb_check: Bad hash chain key length %llu",
+					  (long long)rec_key_length(&link));
+
+		if (rec_hash(&link) != 0)
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "tdb_check: Bad hash chain hash value %llu",
+					  (long long)rec_hash(&link));
+
+		/* Step over the record header to the chain body. */
+		off += sizeof(link);
+		ecode = check_hash_tree(tdb, off, 0, hash, 64,
+					used, num_used, num_found, check, data);
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+
+		off = tdb_read_off(tdb, off + offsetof(struct tdb_chain, next));
+		if (TDB_OFF_IS_ERR(off))
+			return off;
+
+		/* A zero next pointer ends the chain. */
+		if (off == 0)
+			return TDB_SUCCESS;
+
+		(*num_found)++;
+	}
+}
+
+/*
+ * Validate the sub-level hash table record at off and then check the
+ * table it contains.  Once hprefix_bits reaches 64 there are no hash
+ * bits left to index with, so the record is treated as a chain and
+ * handed to check_hash_chain() instead.
+ */
+static enum TDB_ERROR check_hash_record(struct tdb_context *tdb,
+					tdb_off_t off,
+					uint64_t hprefix,
+					unsigned hprefix_bits,
+					tdb_off_t used[],
+					size_t num_used,
+					size_t *num_found,
+					enum TDB_ERROR (*check)(TDB_DATA,
+								TDB_DATA,
+								void *),
+					void *data)
+{
+	/* Size in bytes of one sub-level table of offsets. */
+	const size_t table_bytes = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
+	struct tdb_used_record hdr;
+	enum TDB_ERROR ecode;
+
+	if (hprefix_bits >= 64)
+		return check_hash_chain(tdb, off, hprefix, used, num_used,
+					num_found, check, data);
+
+	ecode = tdb_read_convert(tdb, off, &hdr, sizeof(hdr));
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	if (rec_magic(&hdr) != TDB_HTABLE_MAGIC)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash table magic %llu",
+				  (long long)rec_magic(&hdr));
+
+	if (rec_data_length(&hdr) != table_bytes)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check:"
+				  " Bad hash table length %llu vs %llu",
+				  (long long)rec_data_length(&hdr),
+				  (long long)table_bytes);
+
+	if (rec_key_length(&hdr) != 0)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash table key length %llu",
+				  (long long)rec_key_length(&hdr));
+
+	if (rec_hash(&hdr) != 0)
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: Bad hash table hash value %llu",
+				  (long long)rec_hash(&hdr));
+
+	/* The table itself starts right after the record header. */
+	return check_hash_tree(tdb, off + sizeof(hdr),
+			       TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
+			       hprefix, hprefix_bits,
+			       used, num_used, num_found, check, data);
+}
+
+/* Three-way comparison of two offsets, for use with asearch(). */
+static int off_cmp(const tdb_off_t *a, const tdb_off_t *b)
+{
+	/* Compare instead of subtracting: the difference can overflow an int. */
+	if (*a > *b)
+		return 1;
+	if (*a < *b)
+		return -1;
+	return 0;
+}
+
+/*
+ * Return the next num bits of h, taken from the most significant end.
+ *
+ * *used counts how many top bits earlier calls have consumed; it is
+ * advanced by num, so successive calls peel successive fields off h.
+ * Callers must keep *used at or below 64.
+ *
+ * The mask is built in 64 bits because check_hash_tree() passes its
+ * hash prefix width here, and that width grows towards 64 as the tree
+ * deepens; a 32-bit "1U << num" is undefined from 32 bits upwards.
+ */
+static uint64_t get_bits(uint64_t h, unsigned num, unsigned *used)
+{
+	uint64_t mask;
+
+	*used += num;
+
+	/* Nothing requested: also avoids an undefined shift by 64 when
+	 * *used is still 0. */
+	if (num == 0)
+		return 0;
+
+	mask = num >= 64 ? ~(uint64_t)0 : ((uint64_t)1 << num) - 1;
+	return (h >> (64 - *used)) & mask;
+}
+
+/*
+ * Check one hash table: the array of offsets at off, made up of
+ * (1 << group_bits) groups of (1 << TDB_HASH_GROUP_BITS) buckets.
+ *
+ * Every non-zero entry must point at an offset present in used[]; the
+ * matching used[] slot is altered (low bit flipped) and *num_found is
+ * incremented.  Then, depending on the entry:
+ *  - hprefix_bits == 64 (we are inside a chain): the entry must not be
+ *    a subhash and its full hash must equal hprefix;
+ *  - subhash entry: recurse through check_hash_record() with the prefix
+ *    extended by this group and bucket;
+ *  - normal entry: the record's hash must match the prefix, the group
+ *    and the bucket bits, there must be no empty bucket between its
+ *    home bucket and the slot it occupies, and the low 11 bits of the
+ *    hash must equal the value stored in the record header.
+ * If check is non-NULL it is called with the record's key and data; any
+ * result other than TDB_SUCCESS stops the walk and is returned.
+ *
+ * Returns TDB_SUCCESS or the first error met.  The table mapping is
+ * released on both paths.
+ */
+static enum TDB_ERROR check_hash_tree(struct tdb_context *tdb,
+ tdb_off_t off, unsigned int group_bits,
+ uint64_t hprefix,
+ unsigned hprefix_bits,
+ tdb_off_t used[],
+ size_t num_used,
+ size_t *num_found,
+ enum TDB_ERROR (*check)(TDB_DATA,
+ TDB_DATA, void *),
+ void *data)
+{
+ unsigned int g, b;
+ const tdb_off_t *hash;
+ struct tdb_used_record rec;
+ enum TDB_ERROR ecode;
+
+ hash = tdb_access_read(tdb, off,
+ sizeof(tdb_off_t)
+ << (group_bits + TDB_HASH_GROUP_BITS),
+ true);
+ if (TDB_PTR_IS_ERR(hash)) {
+ return TDB_PTR_ERR(hash);
+ }
+
+ for (g = 0; g < (1 << group_bits); g++) {
+ const tdb_off_t *group = hash + (g << TDB_HASH_GROUP_BITS);
+ for (b = 0; b < (1 << TDB_HASH_GROUP_BITS); b++) {
+ unsigned int bucket, i, used_bits;
+ uint64_t h;
+ tdb_off_t *p;
+ if (group[b] == 0)
+ continue;
+
+ /* From here on, off is the record this bucket points at. */
+ off = group[b] & TDB_OFF_MASK;
+ p = asearch(&off, used, num_used, off_cmp);
+ if (!p) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: Invalid offset"
+ " %llu in hash",
+ (long long)off);
+ goto fail;
+ }
+ /* Mark it invalid. */
+ *p ^= 1;
+ (*num_found)++;
+
+ if (hprefix_bits == 64) {
+ /* Chained entries are unordered. */
+ if (is_subhash(group[b])) {
+ ecode = TDB_ERR_CORRUPT;
+ tdb_logerr(tdb, ecode,
+ TDB_LOG_ERROR,
+ "tdb_check: Invalid chain"
+ " entry subhash");
+ goto fail;
+ }
+ h = hash_record(tdb, off);
+ if (h != hprefix) {
+ ecode = TDB_ERR_CORRUPT;
+ tdb_logerr(tdb, ecode,
+ TDB_LOG_ERROR,
+ "check: bad hash chain"
+ " placement"
+ " 0x%llx vs 0x%llx",
+ (long long)h,
+ (long long)hprefix);
+ goto fail;
+ }
+ ecode = tdb_read_convert(tdb, off, &rec,
+ sizeof(rec));
+ if (ecode != TDB_SUCCESS) {
+ goto fail;
+ }
+ /* Skip the placement checks: go straight to the callback. */
+ goto check;
+ }
+
+ if (is_subhash(group[b])) {
+ uint64_t subprefix;
+ subprefix = (hprefix
+ << (group_bits + TDB_HASH_GROUP_BITS))
+ + g * (1 << TDB_HASH_GROUP_BITS) + b;
+
+ ecode = check_hash_record(tdb,
+ group[b] & TDB_OFF_MASK,
+ subprefix,
+ hprefix_bits
+ + group_bits
+ + TDB_HASH_GROUP_BITS,
+ used, num_used, num_found,
+ check, data);
+ if (ecode != TDB_SUCCESS) {
+ goto fail;
+ }
+ continue;
+ }
+ /* A normal entry */
+
+ /* Does it belong here at all? */
+ h = hash_record(tdb, off);
+ /* get_bits() advances used_bits, so the three calls below
+ * read successive fields of h: prefix, group, bucket. */
+ used_bits = 0;
+ if (get_bits(h, hprefix_bits, &used_bits) != hprefix
+ && hprefix_bits) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "check: bad hash placement"
+ " 0x%llx vs 0x%llx",
+ (long long)h,
+ (long long)hprefix);
+ goto fail;
+ }
+
+ /* Does it belong in this group? */
+ if (get_bits(h, group_bits, &used_bits) != g) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "check: bad group %llu"
+ " vs %u",
+ (long long)h, g);
+ goto fail;
+ }
+
+ /* Are bucket bits correct? */
+ bucket = group[b] & TDB_OFF_HASH_GROUP_MASK;
+ if (get_bits(h, TDB_HASH_GROUP_BITS, &used_bits)
+ != bucket) {
+ /* Rewind so the message re-extracts the same bits. */
+ used_bits -= TDB_HASH_GROUP_BITS;
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "check: bad bucket %u vs %u",
+ (unsigned)get_bits(h,
+ TDB_HASH_GROUP_BITS,
+ &used_bits),
+ bucket);
+ goto fail;
+ }
+
+ /* There must not be any zero entries between
+ * the bucket it belongs in and this one! */
+ for (i = bucket;
+ i != b;
+ i = (i + 1) % (1 << TDB_HASH_GROUP_BITS)) {
+ if (group[i] == 0) {
+ ecode = TDB_ERR_CORRUPT;
+ tdb_logerr(tdb, ecode,
+ TDB_LOG_ERROR,
+ "check: bad group placement"
+ " %u vs %u",
+ b, bucket);
+ goto fail;
+ }
+ }
+
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+ if (ecode != TDB_SUCCESS) {
+ goto fail;
+ }
+
+ /* Bottom bits must match header. */
+ if ((h & ((1 << 11)-1)) != rec_hash(&rec)) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: Bad hash magic"
+ " at offset %llu"
+ " (0x%llx vs 0x%llx)",
+ (long long)off,
+ (long long)h,
+ (long long)rec_hash(&rec));
+ goto fail;
+ }
+
+ check:
+ if (check) {
+ TDB_DATA k, d;
+ const unsigned char *kptr;
+
+ /* Key and data are read as one region following the header. */
+ kptr = tdb_access_read(tdb,
+ off + sizeof(rec),
+ rec_key_length(&rec)
+ + rec_data_length(&rec),
+ false);
+ if (TDB_PTR_IS_ERR(kptr)) {
+ ecode = TDB_PTR_ERR(kptr);
+ goto fail;
+ }
+
+ k = tdb_mkdata(kptr, rec_key_length(&rec));
+ d = tdb_mkdata(kptr + k.dsize,
+ rec_data_length(&rec));
+ ecode = check(k, d, data);
+ tdb_access_release(tdb, kptr);
+ if (ecode != TDB_SUCCESS) {
+ goto fail;
+ }
+ }
+ }
+ }
+ tdb_access_release(tdb, hash);
+ return TDB_SUCCESS;
+
+fail:
+ tdb_access_release(tdb, hash);
+ return ecode;
+}
+
+/*
+ * Check the top-level hash table embedded in the header and everything
+ * reachable from it, then confirm that every used record was reached.
+ *
+ * used/num_used: offsets of the used records found by the linear scan.
+ * num_ftables:   number of free tables; they appear in used[] but are
+ *                never referenced from the hash, so they are counted as
+ *                found up front.
+ * check/data:    optional per-record callback, forwarded unchanged to
+ *                check_hash_tree().  It is declared with the same
+ *                enum TDB_ERROR return type that check_hash_tree() and
+ *                tdb_check_() use, so the pointer types are compatible.
+ */
+static enum TDB_ERROR check_hash(struct tdb_context *tdb,
+				 tdb_off_t used[],
+				 size_t num_used, size_t num_ftables,
+				 enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA,
+							 void *),
+				 void *data)
+{
+	/* Free tables also show up as used. */
+	size_t num_found = num_ftables;
+	enum TDB_ERROR ecode;
+
+	ecode = check_hash_tree(tdb, offsetof(struct tdb_header, hashtable),
+				TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS,
+				0, 0, used, num_used, &num_found,
+				check, data);
+	if (ecode == TDB_SUCCESS) {
+		if (num_found != num_used) {
+			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					   "tdb_check: Not all entries"
+					   " are in hash");
+		}
+	}
+	return ecode;
+}
+
+/*
+ * Validate a single record found on a free list.
+ *
+ * off:    offset of the record, used in messages and the bounds check.
+ * frec:   the record, already read and converted by the caller.
+ * prev:   offset the record's back pointer is expected to hold; zero
+ *         skips that check.
+ * ftable: index of the free table being walked.
+ * bucket: index of the bucket being walked.
+ *
+ * Returns TDB_SUCCESS, the error from the out-of-bounds probe, or
+ * TDB_ERR_CORRUPT (after logging) for a bad magic, free table, bucket
+ * or back pointer.
+ */
+static enum TDB_ERROR check_free(struct tdb_context *tdb,
+				 tdb_off_t off,
+				 const struct tdb_free_record *frec,
+				 tdb_off_t prev, unsigned int ftable,
+				 unsigned int bucket)
+{
+	enum TDB_ERROR ecode;
+
+	if (frec_magic(frec) != TDB_FREE_MAGIC) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu bad magic 0x%llx",
+				  (long long)off,
+				  (long long)frec->magic_and_prev);
+	}
+	if (frec_ftable(frec) != ftable) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu bad freetable %u",
+				  (long long)off, frec_ftable(frec));
+	}
+
+	/* The whole record (header plus free length) must be in bounds. */
+	ecode = tdb->methods->oob(tdb, off
+				  + frec_len(frec)
+				  + sizeof(struct tdb_used_record),
+				  false);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	if (size_to_bucket(frec_len(frec)) != bucket) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu in wrong bucket"
+				  " (%u vs %u)",
+				  (long long)off,
+				  bucket, size_to_bucket(frec_len(frec)));
+	}
+	if (prev && prev != frec_prev(frec)) {
+		/* Report the back pointer actually stored, not the length. */
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_check: offset %llu bad prev"
+				  " (%llu vs %llu)",
+				  (long long)off,
+				  (long long)prev, (long long)frec_prev(frec));
+	}
+	return TDB_SUCCESS;
+}
+
+/*
+ * Check one free table and every free list hanging off it.
+ *
+ * ftable_off: offset of the free table record.
+ * ftable_num: index of this free table; each record on its lists must
+ *             carry the same index (see check_free()).
+ * fr/num_free: offsets of the free records found by the linear scan.
+ * num_found:  incremented once per free record reached from a list.
+ *
+ * For each bucket the list is followed through the records' next
+ * fields.  Each record is validated with check_free() and must be
+ * present in fr[]; the matching fr[] slot is then altered (low bit
+ * flipped).  When the walk ends, the first record is checked again with
+ * prev set to the last record visited.
+ */
+static enum TDB_ERROR check_free_table(struct tdb_context *tdb,
+ tdb_off_t ftable_off,
+ unsigned ftable_num,
+ tdb_off_t fr[],
+ size_t num_free,
+ size_t *num_found)
+{
+ struct tdb_freetable ft;
+ tdb_off_t h;
+ unsigned int i;
+ enum TDB_ERROR ecode;
+
+ ecode = tdb_read_convert(tdb, ftable_off, &ft, sizeof(ft));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ if (rec_magic(&ft.hdr) != TDB_FTABLE_MAGIC
+ || rec_key_length(&ft.hdr) != 0
+ || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)
+ || rec_hash(&ft.hdr) != 0) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+ "tdb_check: Invalid header on free table");
+ }
+
+ for (i = 0; i < TDB_FREE_BUCKETS; i++) {
+ tdb_off_t off, prev = 0, *p, first = 0;
+ struct tdb_free_record f;
+
+ h = bucket_off(ftable_off, i);
+ for (off = tdb_read_off(tdb, h); off; off = f.next) {
+ if (TDB_OFF_IS_ERR(off)) {
+ return off;
+ }
+ /* Only the value read from the bucket head is masked. */
+ if (!first) {
+ off &= TDB_OFF_MASK;
+ first = off;
+ }
+ ecode = tdb_read_convert(tdb, off, &f, sizeof(f));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ /* prev is 0 for the first record, which skips its back-pointer check. */
+ ecode = check_free(tdb, off, &f, prev, ftable_num, i);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* FIXME: Check hash bits */
+ p = asearch(&off, fr, num_free, off_cmp);
+ if (!p) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: Invalid offset"
+ " %llu in free table",
+ (long long)off);
+ }
+ /* Mark it invalid. */
+ *p ^= 1;
+ (*num_found)++;
+ prev = off;
+ }
+
+ if (first) {
+ /* Now we can check first back pointer. */
+ ecode = tdb_read_convert(tdb, first, &f, sizeof(f));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ ecode = check_free(tdb, first, &f, prev, ftable_num, i);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ }
+ }
+ return TDB_SUCCESS;
+}
+
+/*
+ * Measure a run of dead space: count the consecutive bytes starting at
+ * off that are either 0 or 0x43, stopping at the first other byte or
+ * at the end of the file.
+ *
+ * Returns the run length, or an error code from the read in the
+ * tdb_off_t return value (callers test it with TDB_OFF_IS_ERR()).
+ *
+ * Slow, but should be very rare.
+ */
+tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off)
+{
+	size_t len;
+	enum TDB_ERROR ecode;
+
+	for (len = 0; off + len < tdb->file->map_size; len++) {
+		char c;
+
+		/* Read the byte at the current position, not always at off. */
+		ecode = tdb->methods->tread(tdb, off + len, &c, 1);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+		if (c != 0 && c != 0x43)
+			break;
+	}
+	return len;
+}
+
+/*
+ * Linear scan of the file, from just after the header to map_size,
+ * classifying each record by its magic and stepping by its length.
+ *
+ *  - Invalid-recovery magic or 0x43 fill: at the expected recovery
+ *    offset this is the recovery area; anywhere else it is measured
+ *    with dead_space() and reported as a warning.
+ *  - Recovery magic: only allowed at the expected recovery offset, with
+ *    a length no larger than max_len and an old EOF within the file.
+ *  - Free magic: the offset is appended to *fr unless the record's free
+ *    table is TDB_FTABLE_NONE.
+ *  - Used / chain / hash-table / free-table magic: the offset is
+ *    appended to *used, and the record's length and padding are checked.
+ *  - Anything else is corruption.
+ *
+ * *used and *fr are grown with realloc() via append(); this function
+ * never frees them, on success or on error.
+ * Fails if recovery is non-zero but no recovery area was met.
+ */
+static enum TDB_ERROR check_linear(struct tdb_context *tdb,
+ tdb_off_t **used, size_t *num_used,
+ tdb_off_t **fr, size_t *num_free,
+ uint64_t features, tdb_off_t recovery)
+{
+ tdb_off_t off;
+ tdb_len_t len;
+ enum TDB_ERROR ecode;
+ bool found_recovery = false;
+
+ for (off = sizeof(struct tdb_header);
+ off < tdb->file->map_size;
+ off += len) {
+ union {
+ struct tdb_used_record u;
+ struct tdb_free_record f;
+ struct tdb_recovery_record r;
+ } rec;
+ /* r is larger: only get that if we need to. */
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.f));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* If we crash after ftruncate, we can get zeroes or fill. */
+ if (rec.r.magic == TDB_RECOVERY_INVALID_MAGIC
+ || rec.r.magic == 0x4343434343434343ULL) {
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ if (recovery == off) {
+ found_recovery = true;
+ len = sizeof(rec.r) + rec.r.max_len;
+ } else {
+ len = dead_space(tdb, off);
+ if (TDB_OFF_IS_ERR(len)) {
+ return len;
+ }
+ if (len < sizeof(rec.r)) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: invalid"
+ " dead space at %zu",
+ (size_t)off);
+ }
+
+ /* Logged as a warning only: the scan carries on. */
+ tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+ "Dead space at %zu-%zu (of %zu)",
+ (size_t)off, (size_t)(off + len),
+ (size_t)tdb->file->map_size);
+ }
+ } else if (rec.r.magic == TDB_RECOVERY_MAGIC) {
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec.r));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ if (recovery != off) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: unexpected"
+ " recovery record at offset"
+ " %zu",
+ (size_t)off);
+ }
+ if (rec.r.len > rec.r.max_len) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: invalid recovery"
+ " length %zu",
+ (size_t)rec.r.len);
+ }
+ if (rec.r.eof > tdb->file->map_size) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: invalid old EOF"
+ " %zu", (size_t)rec.r.eof);
+ }
+ found_recovery = true;
+ len = sizeof(rec.r) + rec.r.max_len;
+ } else if (frec_magic(&rec.f) == TDB_FREE_MAGIC) {
+ len = sizeof(rec.u) + frec_len(&rec.f);
+ if (off + len > tdb->file->map_size) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: free overlength"
+ " %llu at offset %llu",
+ (long long)len,
+ (long long)off);
+ }
+ /* This record should be in free lists. */
+ if (frec_ftable(&rec.f) != TDB_FTABLE_NONE
+ && !append(fr, num_free, off)) {
+ return tdb_logerr(tdb, TDB_ERR_OOM,
+ TDB_LOG_ERROR,
+ "tdb_check: tracking %zu'th"
+ " free record.", *num_free);
+ }
+ } else if (rec_magic(&rec.u) == TDB_USED_MAGIC
+ || rec_magic(&rec.u) == TDB_CHAIN_MAGIC
+ || rec_magic(&rec.u) == TDB_HTABLE_MAGIC
+ || rec_magic(&rec.u) == TDB_FTABLE_MAGIC) {
+ uint64_t klen, dlen, extra;
+
+ /* This record is used! */
+ if (!append(used, num_used, off)) {
+ return tdb_logerr(tdb, TDB_ERR_OOM,
+ TDB_LOG_ERROR,
+ "tdb_check: tracking %zu'th"
+ " used record.", *num_used);
+ }
+
+ klen = rec_key_length(&rec.u);
+ dlen = rec_data_length(&rec.u);
+ extra = rec_extra_padding(&rec.u);
+
+ len = sizeof(rec.u) + klen + dlen + extra;
+ if (off + len > tdb->file->map_size) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: used overlength"
+ " %llu at offset %llu",
+ (long long)len,
+ (long long)off);
+ }
+
+ /* Every record must be at least as large as a free record header. */
+ if (len < sizeof(rec.f)) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: too short record"
+ " %llu at %llu",
+ (long long)len,
+ (long long)off);
+ }
+
+ /* Check that records have correct 0 at end (but may
+ * not in future). */
+ if (extra && !features) {
+ const char *p;
+ char c;
+ /* Only the first padding byte is inspected. */
+ p = tdb_access_read(tdb, off + sizeof(rec.u)
+ + klen + dlen, 1, false);
+ if (TDB_PTR_IS_ERR(p))
+ return TDB_PTR_ERR(p);
+ c = *p;
+ tdb_access_release(tdb, p);
+
+ if (c != '\0') {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check:"
+ " non-zero extra"
+ " at %llu",
+ (long long)off);
+ }
+ }
+ } else {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "tdb_check: Bad magic 0x%llx"
+ " at offset %zu",
+ (long long)rec_magic(&rec.u),
+ (size_t)off);
+ }
+ }
+
+ /* We must have found recovery area if there was one. */
+ if (recovery != 0 && !found_recovery) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+ "tdb_check: expected a recovery area at %zu",
+ (size_t)recovery);
+ }
+
+ return TDB_SUCCESS;
+}
+
+/*
+ * Check the whole database for consistency.
+ *
+ * Takes a read lock on all records and on expansion, then validates
+ * the header, scans every record linearly, walks every free table and
+ * finally walks the hash.  check, if non-NULL, is called with each
+ * record's key and data; data is passed through to it untouched.
+ *
+ * The result is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
+			  enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA, void *),
+			  void *data)
+{
+	tdb_off_t *free_offs = NULL, *used_offs = NULL;
+	tdb_off_t ftable, recovery;
+	size_t n_free = 0, n_used = 0, n_free_found = 0, n_ftables = 0;
+	uint64_t features;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+	if (ecode != TDB_SUCCESS)
+		return tdb->last_error = ecode;
+
+	ecode = tdb_lock_expand(tdb, F_RDLCK);
+	if (ecode != TDB_SUCCESS) {
+		tdb_allrecord_unlock(tdb, F_RDLCK);
+		return tdb->last_error = ecode;
+	}
+
+	ecode = check_header(tdb, &recovery, &features);
+	if (ecode != TDB_SUCCESS)
+		goto out;
+
+	/* First we do a linear scan, checking all records. */
+	ecode = check_linear(tdb, &used_offs, &n_used, &free_offs, &n_free,
+			     features, recovery);
+	if (ecode != TDB_SUCCESS)
+		goto out;
+
+	/* Then every free table, numbered in the order they are linked. */
+	ftable = first_ftable(tdb);
+	while (ftable) {
+		if (TDB_OFF_IS_ERR(ftable)) {
+			ecode = ftable;
+			goto out;
+		}
+		ecode = check_free_table(tdb, ftable, n_ftables,
+					 free_offs, n_free, &n_free_found);
+		if (ecode != TDB_SUCCESS)
+			goto out;
+		n_ftables++;
+		ftable = next_ftable(tdb, ftable);
+	}
+
+	/* FIXME: Check key uniqueness? */
+	ecode = check_hash(tdb, used_offs, n_used, n_ftables, check, data);
+	if (ecode != TDB_SUCCESS)
+		goto out;
+
+	/* Every free record seen by the scan must be on some free list. */
+	if (n_free_found != n_free)
+		ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				   "tdb_check: Not all entries are in"
+				   " free table");
+
+out:
+	tdb_allrecord_unlock(tdb, F_RDLCK);
+	tdb_unlock_expand(tdb, F_RDLCK);
+	free(free_offs);
+	free(used_offs);
+	return tdb->last_error = ecode;
+}
diff --git a/lib/tdb2/doc/TDB1_porting.txt b/lib/tdb2/doc/TDB1_porting.txt
new file mode 100644
index 00000000000..90ba2497382
--- /dev/null
+++ b/lib/tdb2/doc/TDB1_porting.txt
@@ -0,0 +1,44 @@
+Interface differences between TDB1 and TDB2.
+
+- tdb2 uses 'struct tdb_data', tdb1 uses 'struct TDB_DATA'. Use the
+ TDB_DATA typedef if you want portability between the two.
+
+- tdb2 functions return 0 on success, and a negative error on failure,
+ whereas tdb1 functions returned 0 on success, and -1 on failure.
+ tdb1 then used tdb_error() to determine the error; this is also
+ supported in tdb2 to ease backwards compatibility, though the other
+ form is preferred.
+
+- tdb2's tdb_fetch() returns an error, tdb1's returned the data directly
+ (or tdb_null, and you were supposed to check tdb_error() to find out why).
+
+- tdb2's tdb_nextkey() frees the old key's dptr; in tdb1 you needed to do
+ this manually.
+
+- tdb1's tdb_open/tdb_open_ex took an explicit hash size. tdb2's hash table
+ resizes as required.
+
+- tdb2 uses a linked list of attribute structures to implement logging and
+ alternate hashes. tdb1 used tdb_open_ex, which was not extensible.
+
+- tdb2 does locking on read-only databases (ie. O_RDONLY passed to tdb_open).
+ tdb1 did not: use the TDB_NOLOCK flag if you want to suppress locking.
+
+- tdb2's log function is simpler than tdb1's log function. The string is
+ already formatted, and it takes an enum tdb_log_level (not a tdb_debug_level),
+ which has only three values: TDB_LOG_ERROR, TDB_LOG_USE_ERROR and
+ TDB_LOG_WARNING.
+
+- tdb2 provides tdb_deq() for comparing two struct tdb_data.
+
+- tdb2's tdb_name() returns a copy of the name even for TDB_INTERNAL dbs.
+
+- tdb2 does not need tdb_reopen() or tdb_reopen_all(). If you call
+ fork() during certain operations, the child should close the
+ tdb, or complete the operations before continuing to use the tdb:
+
+ tdb_transaction_start(): child must tdb_transaction_cancel()
+ tdb_lockall(): child must call tdb_unlockall()
+ tdb_lockall_read(): child must call tdb_unlockall_read()
+ tdb_chainlock(): child must call tdb_chainunlock()
+ tdb_parse() callback: child must return from tdb_parse()
diff --git a/lib/tdb2/doc/design-1.3.txt b/lib/tdb2/doc/design-1.3.txt
new file mode 100644
index 00000000000..f81ecf78855
--- /dev/null
+++ b/lib/tdb2/doc/design-1.3.txt
@@ -0,0 +1,1049 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+27-April-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End | API Functions | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+| 1999 | 13 | 1195 |
++-----------+----------------+--------------------------------+
+| 2000 | 24 | 1725 |
++-----------+----------------+--------------------------------+
+| 2001 | 32 | 2228 |
++-----------+----------------+--------------------------------+
+| 2002 | 35 | 2481 |
++-----------+----------------+--------------------------------+
+| 2003 | 35 | 2552 |
++-----------+----------------+--------------------------------+
+| 2004 | 40 | 2584 |
++-----------+----------------+--------------------------------+
+| 2005 | 38 | 2647 |
++-----------+----------------+--------------------------------+
+| 2006 | 52 | 3754 |
++-----------+----------------+--------------------------------+
+| 2007 | 66 | 4398 |
++-----------+----------------+--------------------------------+
+| 2008 | 71 | 4768 |
++-----------+----------------+--------------------------------+
+| 2009 | 73 | 5715 |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+ TDB_ATTRIBUTE_LOG = 0,
+
+ TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+ enum tdb_attribute attr;
+
+ union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+ tdb_log_func log_fn;
+
+ void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+ tdb_hash_func hash_fn;
+
+ void *hash_private;
+
+};
+
+union tdb_attribute {
+
+ struct tdb_attribute_base base;
+
+ struct tdb_attribute_log log;
+
+ struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+ transaction commit to fail, and will not undo any operations
+ since the inner transaction began. This problem is soluble with
+ some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+ transaction. This is desirable in the way which Samba's
+ database initialization code uses transactions, but could be a
+ surprise to any users expecting a successful transaction commit
+ to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+ In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, though there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn; we are better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+ Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+ variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+ already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+ lock. Initially, this is always false. If it does, return 0.
+ Otherwise, try to obtain it in non-blocking mode. If that
+ fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+ so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+ information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+ TDB_CONTEXT There is no reason to use this over 'struct
+ tdb_context'; the definition isn't visible to the API user
+ anyway.
+
+ TDB_DATA There is no reason to use this over struct TDB_DATA;
+ the struct needs to be understood by the API user.
+
+ struct TDB_DATA This would normally be called 'struct
+ tdb_data'.
+
+ enum TDB_ERROR Similarly, this would normally be enum
+ tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+ Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
+and tdb_check all take void * and must internally convert it to
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+ tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+ Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation. There are
+three details which become important:
+
+1. On encountering a full bucket, we use the next bucket.
+
+2. Extra hash bits are stored with the offset, to reduce
+ comparisons.
+
+3. A marker entry is used on deleting an entry.
+
+The doubling of the table must be done under a transaction; we
+will not reduce it on deletion, so it will be an unusual case. It
+will either be placed at the head (other entries will be moved
+out the way so we can expand). We could have a pointer in the
+header to the current hashtable location, but that pointer would
+have to be read frequently to check for hashtable moves.
+
+The locking for this is slightly more complex than the chained
+case; we currently have one lock per bucket, and that means we
+would need to expand the lock if we overflow to the next bucket.
+The frequency of such collisions will affect our locking
+heuristics: we can always lock more buckets than we need.
+
+One possible optimization is to only re-check the hash size on an
+insert or a lookup miss.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+ the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+ return it. The slack will be turned into a new free record if
+ it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+ offset.
+
+3. If max_dead is non-zero:
+
+ (a) Walk the hash chain again and count the dead records.
+
+ (b) If it's more than max_dead, bulk free all the dead ones
+ (similar to steps 4 and below, but the lock is only obtained
+ once).
+
+ (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+ free; if so, enlarge the current block and remove that block
+ from the free list. This was disabled, as removal from the free
+ list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+ reason, each block has a 32-bit tailer which indicates its
+ length. If it is free, expand it to cover our new block and
+ return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+) but it's not clear this would reduce contention in the common
+case where all processes are allocating/freeing the same size.
+Thus we almost certainly need to divide in other ways: the most
+obvious is to divide the file into zones, and using a free list
+(or set of free lists) for each. This approximates address
+ordering.
+
+Note that this means we need to split the free lists when we
+expand the file; this is probably acceptable when we double the
+hash table size, since that is such an expensive operation
+already. In the case of increasing the file size, there is an
+optimization we can use: if we use M in the formula above as the
+file size rounded up to the next power of 2, we only need
+reshuffle free lists when the file size crosses a power of 2
+boundary, and reshuffling the free lists is trivial: we simply
+merge every consecutive pair of free lists.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct zone.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone (we didn't have a lock, sizes could have
+ changed): relock if necessary.
+
+4. Place the freed entry in the list for that zone.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a zone: either the zone we last freed into, or one based on a
+ “random” number.
+
+2. Lock the corresponding list.
+
+3. Re-check the zone: relock if necessary.
+
+4. If the top entry is large enough, remove it from the list and
+ return it.
+
+5. Otherwise, coalesce entries in the list.
+
+ (a)
+
+ (b)
+
+ (c)
+
+ (d)
+
+6. If there was no entry large enough, unlock the list and try
+ the next zone.
+
+7.
+
+8.
+
+9. If no zone satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+
+
+I anticipate that the number of entries in each free zone would
+be small, but it might be worth using one free entry to hold
+pointers to the others for cache efficiency.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+ tdb_off_t next; /* offset of the next record in the list
+*/
+
+ tdb_len_t rec_len; /* total byte length of record */
+
+ tdb_len_t key_len; /* byte length of key */
+
+ tdb_len_t data_len; /* byte length of data */
+
+ uint32_t full_hash; /* the full 32 bit hash of the key */
+
+ uint32_t magic; /* try to catch errors */
+
+ /* the following union is implied:
+
+ union {
+
+ char record[rec_len];
+
+ struct {
+
+ char key[key_len];
+
+ char data[data_len];
+
+ }
+
+ uint32_t totalsize; (tailer)
+
+ }
+
+ */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+ hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+ and data_len (it accounts for wasted or overallocated length in
+ the record). Since the record length is always a multiple of 8,
+ we can conveniently fit it in 32 bits (representing up to 35
+ bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+ restrict 'data_len' to 32 bits, but instead we can combine the
+ two into one 64-bit field and use a 5 bit value which
+ indicates at what bit to divide the two. Keys are unlikely to
+ scale as fast as data, so I'm assuming a maximum key size of 32
+ bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+ this is diminishing returns after a handful of bits (at 10
+ bits, it reduces 99.9% of false memcmp). As an aside, as the
+ lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+
+5. 'magic' does not need to be enlarged: it currently reflects
+ one of 5 values (used, free, dead, recovery, and
+ unused_recovery). It is useful for quick sanity checking
+ however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+ the right can find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of
+ the following block (and the tailer only exists in free
+ blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+ however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+ uint32_t magic : 16,
+
+ prev_is_free: 1,
+
+ key_data_divide: 5,
+
+ top_hash: 10;
+
+ uint32_t extra_octets;
+
+ uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+ uint32_t free_magic;
+
+ uint64_t total_length;
+
+ ...
+
+ uint64_t tailer;
+
+};
+
+
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively inexpensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+
+
+
+
+
+
+
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+ checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+ header (identified using the transaction counter) with the
+ offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.9 TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database”.
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free; we must
+instead use presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all the transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+We could solve a small part of the problem by providing read-only
+transactions. These would allow one write transaction to begin,
+but it could not commit until all r/o transactions are done. This
+would require a new RO_TRANSACTION_LOCK, which would be upgraded
+on commit.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
diff --git a/lib/tdb2/doc/design.lyx b/lib/tdb2/doc/design.lyx
new file mode 100644
index 00000000000..0a1d6a14bca
--- /dev/null
+++ b/lib/tdb2/doc/design.lyx
@@ -0,0 +1,2689 @@
+#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author ""
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+17-March-2011
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+ TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+ TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+ enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+ union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+ void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+ void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete; the nesting flag has been removed.
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Unknown flags cause tdb_open() to fail as well, so they can be detected
+ at runtime.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn, but fortunately most functions return 0 on success and -1
+ on error: we can change these to return 0 on success and a negative error
+ code on error, and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete; API has been changed but thread safety has not been implemented.
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If so, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gives atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandable
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect old-version files, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits), they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and use a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list.
+ If there was no entry large enough, unlock the list and try the next largest
+ list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no zone satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t magic; /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+ /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+ union {
+\end_layout
+
+\begin_layout LyX-Code
+ char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+ struct {
+\end_layout
+
+\begin_layout LyX-Code
+ char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+ char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+ }
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+ }
+\end_layout
+
+\begin_layout LyX-Code
+ */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+ key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+ top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+ prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+ total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free; that must instead be determined by presence in metadata elsewhere),
+ and a pointer to the hash table, we could create an entirely new commit
+ without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause
+ all transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (ie.
+ syncless) which meant that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers were available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
diff --git a/lib/tdb2/doc/design.lyx,v b/lib/tdb2/doc/design.lyx,v
new file mode 100644
index 00000000000..13e6387f7fa
--- /dev/null
+++ b/lib/tdb2/doc/design.lyx,v
@@ -0,0 +1,4679 @@
+head 1.13;
+access;
+symbols;
+locks; strict;
+comment @# @;
+
+
+1.13
+date 2011.03.01.11.46.54; author rusty; state Exp;
+branches;
+next 1.12;
+
+1.12
+date 2010.12.01.12.20.49; author rusty; state Exp;
+branches;
+next 1.11;
+
+1.11
+date 2010.12.01.11.55.20; author rusty; state Exp;
+branches;
+next 1.10;
+
+1.10
+date 2010.09.14.00.33.57; author rusty; state Exp;
+branches;
+next 1.9;
+
+1.9
+date 2010.09.09.07.25.12; author rusty; state Exp;
+branches;
+next 1.8;
+
+1.8
+date 2010.09.02.02.29.05; author rusty; state Exp;
+branches;
+next 1.7;
+
+1.7
+date 2010.09.01.10.58.12; author rusty; state Exp;
+branches;
+next 1.6;
+
+1.6
+date 2010.08.02.00.21.43; author rusty; state Exp;
+branches;
+next 1.5;
+
+1.5
+date 2010.08.02.00.21.16; author rusty; state Exp;
+branches;
+next 1.4;
+
+1.4
+date 2010.05.10.13.09.11; author rusty; state Exp;
+branches;
+next 1.3;
+
+1.3
+date 2010.05.10.11.58.37; author rusty; state Exp;
+branches;
+next 1.2;
+
+1.2
+date 2010.05.10.05.35.13; author rusty; state Exp;
+branches;
+next 1.1;
+
+1.1
+date 2010.05.04.02.29.16; author rusty; state Exp;
+branches;
+next ;
+
+
+desc
+@First draft
+@
+
+
+1.13
+log
+@Thread-safe API
+@
+text
+@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author "Rusty Russell,,,"
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+TDB2: Redesigning The Trivial DataBase
+\end_layout
+
+\begin_layout Author
+Rusty Russell, IBM Corporation
+\end_layout
+
+\begin_layout Date
+1-December-2010
+\end_layout
+
+\begin_layout Abstract
+The Trivial DataBase on-disk format is 32 bits; with usage cases heading
+ towards the 4G limit, that must change.
+ This required breakage provides an opportunity to revisit TDB's other design
+ decisions and reassess them.
+\end_layout
+
+\begin_layout Section
+Introduction
+\end_layout
+
+\begin_layout Standard
+The Trivial DataBase was originally written by Andrew Tridgell as a simple
+ key/data pair storage system with the same API as dbm, but allowing multiple
+ readers and writers while being small enough (< 1000 lines of C) to include
+ in SAMBA.
+ The simple design created in 1999 has proven surprisingly robust and performant
+, used in Samba versions 3 and 4 as well as numerous other projects.
+ Its useful life was greatly increased by the (backwards-compatible!) addition
+ of transaction support in 2005.
+\end_layout
+
+\begin_layout Standard
+The wider variety and greater demands of TDB-using code has led to some
+ organic growth of the API, as well as some compromises on the implementation.
+ None of these, by themselves, are seen as show-stoppers, but the cumulative
+ effect is a loss of elegance over the initial, simple TDB implementation.
+ Here is a table of the approximate number of lines of implementation code
+ and number of API functions at the end of each year:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="12" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Year End
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+API Functions
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Lines of C Code Implementation
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1999
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+13
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1195
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2000
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+24
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1725
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2001
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2228
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2002
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2481
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2003
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+35
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2552
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2004
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+40
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2584
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2005
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+38
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2647
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2006
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+52
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3754
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2007
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+66
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4398
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2008
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+71
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4768
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2009
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+73
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5715
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This review is an attempt to catalog and address all the known issues with
+ TDB and create solutions which address the problems without significantly
+ increasing complexity; all involved are far too aware of the dangers of
+ second system syndrome in rewriting a successful project like this.
+\end_layout
+
+\begin_layout Section
+API Issues
+\end_layout
+
+\begin_layout Subsection
+tdb_open_ex Is Not Expandable
+\end_layout
+
+\begin_layout Standard
+The tdb_open() call was expanded to tdb_open_ex(), which added an optional
+ hashing function and an optional logging function argument.
+ Additional arguments to open would require the introduction of a tdb_open_ex2
+ call etc.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+tdb_open() will take a linked-list of attributes:
+\end_layout
+
+\begin_layout LyX-Code
+enum tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+ TDB_ATTRIBUTE_LOG = 0,
+\end_layout
+
+\begin_layout LyX-Code
+ TDB_ATTRIBUTE_HASH = 1
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_base {
+\end_layout
+
+\begin_layout LyX-Code
+ enum tdb_attribute attr;
+\end_layout
+
+\begin_layout LyX-Code
+ union tdb_attribute *next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_log {
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_log_func log_fn;
+\end_layout
+
+\begin_layout LyX-Code
+ void *log_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_attribute_hash {
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_hash_func hash_fn;
+\end_layout
+
+\begin_layout LyX-Code
+ void *hash_private;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout LyX-Code
+union tdb_attribute {
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_base base;
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_log log;
+\end_layout
+
+\begin_layout LyX-Code
+ struct tdb_attribute_hash hash;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+This allows future attributes to be added, even if this expands the size
+ of the union.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_traverse Makes Impossible Guarantees
+\end_layout
+
+\begin_layout Standard
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
+ was thought that it was important to guarantee that all records which exist
+ at the start and end of the traversal would be included, and no record
+ would be included twice.
+\end_layout
+
+\begin_layout Standard
+This adds complexity (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Reliable-Traversal-Adds"
+
+\end_inset
+
+) and does not work anyway for records which are altered (in particular,
+ those which are expanded may be effectively deleted and re-added behind
+ the traversal).
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "traverse-Proposed-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Abandon the guarantee.
+ You will see every record if no changes occur during your traversal, otherwise
+ you will see some subset.
+ You can prevent changes by using a transaction or the locking API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
+\begin_layout Subsection
+Nesting of Transactions Is Fraught
+\end_layout
+
+\begin_layout Standard
+TDB has alternated between allowing nested transactions and not allowing
+ them.
+ Various paths in the Samba codebase assume that transactions will nest,
+ and in a sense they can: the operation is only committed to disk when the
+ outer transaction is committed.
+ There are two problems, however:
+\end_layout
+
+\begin_layout Enumerate
+Canceling the inner transaction will cause the outer transaction commit
+ to fail, and will not undo any operations since the inner transaction began.
+ This problem is soluble with some additional internal code.
+\end_layout
+
+\begin_layout Enumerate
+An inner transaction commit can be cancelled by the outer transaction.
+ This is desirable in the way which Samba's database initialization code
+ uses transactions, but could be a surprise to any users expecting a successful
+ transaction commit to expose changes to others.
+\end_layout
+
+\begin_layout Standard
+The current solution is to specify the behavior at tdb_open(), with the
+ default currently that nested transactions are allowed.
+ This flag can also be changed at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Given the usage patterns, it seems that the
+\begin_inset Quotes eld
+\end_inset
+
+least-surprise
+\begin_inset Quotes erd
+\end_inset
+
+ behavior of disallowing nested transactions should become the default.
+ Additionally, it seems the outer transaction is the only code which knows
+ whether inner transactions should be allowed, so a flag to indicate this
+ could be added to tdb_transaction_start.
+ However, this behavior can be simulated with a wrapper which uses tdb_add_flags
+() and tdb_remove_flags(), so the API should not be expanded for this relatively
+-obscure case.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979572
+Incomplete; nesting flag is still defined as per tdb1.
+\change_inserted 0 1298979584
+Complete; the nesting flag has been removed.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+Incorrect Hash Function is Not Detected
+\end_layout
+
+\begin_layout Standard
+tdb_open_ex() allows the calling code to specify a different hash function
+ to use, but does not check that all other processes accessing this tdb
+ are using the same hash function.
+ The result is that records are missing from tdb_fetch().
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain an example hash result (eg.
+ the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
+ hash function produces the same answer, or fail the tdb_open call.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+\end_layout
+
+\begin_layout Standard
+In response to scalability issues with the free list (
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Freelist-Is"
+
+\end_inset
+
+) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
+ and the TDB_VOLATILE flag to tdb_open.
+ The latter actually calls the former with an argument of
+\begin_inset Quotes eld
+\end_inset
+
+5
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+This code allows deleted records to accumulate without putting them in the
+ free list.
+ On delete we iterate through each chain and free them in a batch if there
+ are more than max_dead entries.
+ These are never otherwise recycled except as a side-effect of a tdb_repack.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With the scalability problems of the freelist solved, this API can be removed.
+ The TDB_VOLATILE flag may still be useful as a hint that store and delete
+ of records will be at least as common as fetch in order to allow some internal
+ tuning, but initially will become a no-op.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+ TDB_VOLATILE still defined, but implementation should fail on unknown flags
+ to be future-proof.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Files-Cannot"
+
+\end_inset
+
+TDB Files Cannot Be Opened Multiple Times In The Same Process
+\end_layout
+
+\begin_layout Standard
+No process can open the same TDB twice; we check and disallow it.
+ This is an unfortunate side-effect of fcntl locks, which operate on a per-file
+ rather than per-file-descriptor basis, and do not nest.
+ Thus, closing any file descriptor on a file clears all the locks obtained
+ by this process, even if they were placed using a different file descriptor!
+\end_layout
+
+\begin_layout Standard
+Note that even if this were solved, deadlock could occur if operations were
+ nested: this is a more manageable programming error in most cases.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We could lobby POSIX to fix the perverse rules, or at least lobby Linux
+ to violate them so that the most common implementation does not have this
+ restriction.
+ This would be a generally good idea for other fcntl lock users.
+\end_layout
+
+\begin_layout Standard
+Samba uses a wrapper which hands out the same tdb_context to multiple callers
+ if this happens, and does simple reference counting.
+ We should do this inside the tdb library, which already emulates lock nesting
+ internally; it would need to recognize when deadlock occurs within a single
+ process.
+ This would create a new failure mode for tdb operations (while we currently
+ handle locking failures, they are impossible in normal use and a process
+ encountering them can do little but give up).
+\end_layout
+
+\begin_layout Standard
+I do not see benefit in an additional tdb_open flag to indicate whether
+ re-opening is allowed, although there may be some benefit to adding a
+ call to detect when a tdb_context is shared, to allow others to create such
+ an API.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB API Is Not POSIX Thread-safe
+\end_layout
+
+\begin_layout Standard
+The TDB API uses an error code which can be queried after an operation to
+ determine what went wrong.
+ This programming model does not work with threads, unless specific additional
+ guarantees are given by the implementation.
+ In addition, even otherwise-independent threads cannot open the same TDB
+ (as in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB-Files-Cannot"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Rearchitecting the API to include a tdb_errcode pointer would be a great
+ deal of churn
+\change_inserted 0 1298979557
+, but fortunately most functions return 0 on success and -1 on error: we
+ can change these to return 0 on success and a negative error code on error,
+ and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
+; we are better to guarantee that the tdb_errcode is per-thread so the current
+ programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
+This requires dynamic per-thread allocations, which is awkward with POSIX
+ threads (pthread_key_create space is limited and we cannot simply allocate
+ a key for every TDB).
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+Internal locking is required to make sure that fcntl locks do not overlap
+ between threads, and also that the global list of tdbs is maintained.
+\end_layout
+
+\begin_layout Standard
+The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
+ version of the library, and otherwise no overhead will exist.
+ Alternatively, a hooking mechanism similar to that proposed for
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete
+\change_inserted 0 1298979681
+; API has been changed but thread safety has not been implemented.
+\change_deleted 0 1298979669
+.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+*_nonblock Functions And *_mark Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+CTDB
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Clustered TDB, see http://ctdb.samba.org
+\end_layout
+
+\end_inset
+
+ wishes to operate on TDB in a non-blocking manner.
+ This is currently done as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock).
+ If this fails:
+\end_layout
+
+\begin_layout Enumerate
+Fork a child process, and wait for it to call the normal variant (eg.
+ tdb_lockall).
+\end_layout
+
+\begin_layout Enumerate
+If the child succeeds, call the _mark variant to indicate we already have
+ the locks (eg.
+ tdb_lockall_mark).
+\end_layout
+
+\begin_layout Enumerate
+Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+\end_layout
+
+\begin_layout Enumerate
+Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+\end_layout
+
+\begin_layout Standard
+There are several issues with this approach.
+ Firstly, adding two new variants of each function clutters the API for
+ an obscure use, and so not all functions have three variants.
+ Secondly, it assumes that all paths of the functions ask for the same locks,
+ otherwise the parent process will have to get a lock which the child doesn't
+ have under some circumstances.
+ I don't believe this is currently the case, but it constrains the implementatio
+n.
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Proposed-Solution-locking-hook"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Implement a hook for locking methods, so that the caller can control the
+ calls to create and remove fcntl locks.
+ In this scenario, ctdbd would operate as follows:
+\end_layout
+
+\begin_layout Enumerate
+Call the normal API function, eg tdb_lockall().
+\end_layout
+
+\begin_layout Enumerate
+When the lock callback comes in, check if the child has the lock.
+ Initially, this is always false.
+ If the child has the lock, return 0.
+ Otherwise, try to obtain it in non-blocking mode.
+ If that fails, return EWOULDBLOCK.
+\end_layout
+
+\begin_layout Enumerate
+Release locks in the unlock callback as normal.
+\end_layout
+
+\begin_layout Enumerate
+If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
+ child to repeat the operation.
+\end_layout
+
+\begin_layout Enumerate
+The child records what locks it obtains, and returns that information to
+ the parent.
+\end_layout
+
+\begin_layout Enumerate
+When the child has succeeded, goto 1.
+\end_layout
+
+\begin_layout Standard
+This is flexible enough to handle any potential locking scenario, even when
+ lock requirements change.
+ It can be optimized so that the parent does not release locks, just tells
+ the child which locks it doesn't need to obtain.
+\end_layout
+
+\begin_layout Standard
+It also keeps the complexity out of the API, and in ctdbd where it is needed.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+tdb_chainlock Functions Expose Implementation
+\end_layout
+
+\begin_layout Standard
+tdb_chainlock locks some number of records, including the record indicated
+ by the given key.
+ This gave atomicity guarantees; no-one can start a transaction, alter,
+ read or delete that key while the lock is held.
+\end_layout
+
+\begin_layout Standard
+It also makes the same guarantee for any other key in the chain, which is
+ an internal implementation detail and potentially a cause for deadlock.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ It would be nice to have an explicit single entry lock which affected no
+ other keys.
+ Unfortunately, this won't work for an entry which doesn't exist.
+ Thus while chainlock may be implemented more efficiently for the existing
+ case, it will still have overlap issues with the non-existing case.
+ So it is best to keep the current (lack of) guarantee about which records
+ will be affected to avoid constraining our implementation.
+\end_layout
+
+\begin_layout Subsection
+Signal Handling is Not Race-Free
+\end_layout
+
+\begin_layout Standard
+The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
+ that the tdb locking code should return with a failure, rather than trying
+ again when a signal is received (and errno == EAGAIN).
+ This is usually used to implement timeouts.
+\end_layout
+
+\begin_layout Standard
+Unfortunately, this does not work in the case where the signal is received
+ before the tdb code enters the fcntl() call to place the lock: the code
+ will sleep within the fcntl() code, unaware that the signal wants it to
+ exit.
+ In the case of long timeouts, this does not happen in practice.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The locking hooks proposed in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ would allow the user to decide on whether to fail the lock acquisition
+ on a signal.
+ This allows the caller to choose their own compromise: they could narrow
+ the race by checking immediately before the fcntl call.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+It may be possible to make this race-free in some implementations by having
+ the signal handler alter the struct flock to make it invalid.
+ This will cause the fcntl() lock call to fail with EINVAL if the signal
+ occurs before the kernel is entered, otherwise EAGAIN.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+The API Uses Gratuitous Typedefs, Capitals
+\end_layout
+
+\begin_layout Standard
+typedefs are useful for providing source compatibility when types can differ
+ across implementations, or arguably in the case of function pointer definitions
+ which are hard for humans to parse.
+ Otherwise it is simply obfuscation and pollutes the namespace.
+\end_layout
+
+\begin_layout Standard
+Capitalization is usually reserved for compile-time constants and macros.
+\end_layout
+
+\begin_layout Description
+TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
+ definition isn't visible to the API user anyway.
+\end_layout
+
+\begin_layout Description
+TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
+ needs to be understood by the API user.
+\end_layout
+
+\begin_layout Description
+struct
+\begin_inset space ~
+\end_inset
+
+TDB_DATA This would normally be called 'struct tdb_data'.
+\end_layout
+
+\begin_layout Description
+enum
+\begin_inset space ~
+\end_inset
+
+TDB_ERROR Similarly, this would normally be enum tdb_error.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+ Introducing lower case variants would please pedants like myself, but if
+ it were done the existing ones should be kept.
+ There is little point forcing a purely cosmetic change upon tdb users.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+tdb_log_func Doesn't Take The Private Pointer
+\end_layout
+
+\begin_layout Standard
+For API compatibility reasons, the logging function needs to call tdb_get_loggin
+g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+It should simply take an extra argument, since we are prepared to break
+ the API/ABI.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Various Callback Functions Are Not Typesafe
+\end_layout
+
+\begin_layout Standard
+The callback functions in tdb_set_logging_function (after
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "tdb_log_func-Doesnt-Take"
+
+\end_inset
+
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
+ all take void * and must internally convert it to the argument type they
+ were expecting.
+\end_layout
+
+\begin_layout Standard
+If this type changes, the compiler will not produce warnings on the callers,
+ since it only sees void *.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+With careful use of macros, we can create callback functions which give
+ a warning when used on gcc and the types of the callback and its private
+ argument differ.
+ Unsupported compilers will not give a warning, which is no worse than now.
+ In addition, the callbacks become clearer, as they need not use void *
+ for their parameter.
+\end_layout
+
+\begin_layout Standard
+See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
+\end_layout
+
+\begin_layout Standard
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
+ be cleared if the caller discovers it is the only process with the TDB
+ open.
+ However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
+ be detected, so will have the TDB erased underneath them (usually resulting
+ in a crash).
+\end_layout
+
+\begin_layout Standard
+There is a similar issue on fork(); if the parent exits (or otherwise closes
+ the tdb) before the child calls tdb_reopen_all() to establish the lock
+ used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
+ at that moment will believe it alone has opened the TDB and will erase
+ it.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove TDB_CLEAR_IF_FIRST.
+ Other workarounds are possible, but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979699
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
+\change_inserted 0 1298979700
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The header should contain a
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+Record Headers Are Not Expandable
+\end_layout
+
+\begin_layout Standard
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc hierarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Section
+Performance And Scalability Issues
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
+
+\end_inset
+
+TDB_CLEAR_IF_FIRST Imposes Performance Penalty
+\end_layout
+
+\begin_layout Standard
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
+ 4 (aka.
+ the ACTIVE_LOCK).
+ While these locks never conflict in normal tdb usage, they do add substantial
+ overhead for most fcntl lock implementations when the kernel scans to detect
+ if a lock conflict exists.
+ This is often a single linked list, making the time to acquire and release
+ a fcntl lock O(N) where N is the number of processes with the TDB open,
+ not the number actually doing work.
+\end_layout
+
+\begin_layout Standard
+In a Samba server it is common to have huge numbers of clients sitting idle,
+ and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+There is a flag to tdb_reopen_all() which is used for this optimization:
+ if the parent process will outlive the child, the child does not need the
+ ACTIVE_LOCK.
+ This is a workaround for this very performance issue.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove the flag.
+ It was a neat idea, but even trivial servers tend to know when they are
+ initializing for the first time and can simply unlink the old tdb at that
+ point.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979837
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+\change_inserted 0 1298979837
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
+TDB Files Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This seems to be becoming an issue (so much for
+\begin_inset Quotes eld
+\end_inset
+
+trivial
+\begin_inset Quotes erd
+\end_inset
+
+!), particularly for ldb.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+A new, incompatible TDB format which uses 64 bit offsets internally rather
+ than 32 bit as now.
+ For simplicity of endian conversion (which TDB does on the fly if required),
+ all values will be 64 bit on disk.
+ In practice, some upper bits may be used for other purposes, but at least
+ 56 bits will be available for file offsets.
+\end_layout
+
+\begin_layout Standard
+tdb_open() will automatically detect old-version files, and even create them
+ if TDB_VERSION6 is specified to tdb_open.
+\end_layout
+
+\begin_layout Standard
+32 bit processes will still be able to access TDBs larger than 4G (assuming
+ that their off_t allows them to seek to 64 bits); they will gracefully
+ fall back as they fail to mmap.
+ This can happen already with large TDBs.
+\end_layout
+
+\begin_layout Standard
+Old versions of tdb will fail to open the new TDB files (since 28 August
+ 2009, commit 398d0c29290: prior to that any unrecognized file format would
+ be erased and initialized as a fresh tdb!)
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+TDB Records Have a 4G Limit
+\end_layout
+
+\begin_layout Standard
+This has not been a reported problem, and the API uses size_t which can
+ be 64 bit on 64 bit platforms.
+ However, other limits may have made such an issue moot.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Record sizes will be 64 bit, with an error returned on 32 bit platforms
+ which try to access such records (the current implementation would return
+ TDB_ERR_OOM in a similar case).
+ It seems unlikely that 32 bit keys will be a limitation, so the implementation
+ may not support this (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Hash Size Is Determined At TDB Creation Time
+\end_layout
+
+\begin_layout Standard
+TDB contains a number of hash chains in the header; the number is specified
+ at creation time, and defaults to 131.
+ This is such a bottleneck on large databases (as each hash chain gets quite
+ long), that LDB uses 10,000 for this hash.
+ In general it is impossible to know what the 'right' answer is at database
+ creation time.
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+After comprehensive performance testing on various scalable hash variants
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
+ because I was previously convinced that an expanding tree of hashes would
+ be very close to optimal.
+\end_layout
+
+\end_inset
+
+, it became clear that it is hard to beat a straight linear hash table which
+ doubles in size when it reaches saturation.
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
+
+\end_inset
+
+TDB Freelist Is Highly Contended
+\end_layout
+
+\begin_layout Standard
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
+
+\begin_layout Enumerate
+Get the free list lock for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Multiply length by 1.25, so we always over-allocate by 25%.
+\end_layout
+
+\begin_layout Enumerate
+Set the slack multiplier to 1.
+\end_layout
+
+\begin_layout Enumerate
+Examine the current freelist entry: if it is > length but < the current
+ best case, remember it as the best case.
+\end_layout
+
+\begin_layout Enumerate
+Multiply the slack multiplier by 1.05.
+\end_layout
+
+\begin_layout Enumerate
+If our best fit so far is less than length * slack multiplier, return it.
+ The slack will be turned into a new free record if it's large enough.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, go onto the next freelist entry.
+\end_layout
+
+\begin_layout Standard
+Deleting a record occurs as follows:
+\end_layout
+
+\begin_layout Enumerate
+Lock the hash chain for this whole operation.
+\end_layout
+
+\begin_layout Enumerate
+Walk the chain to find the record, keeping the prev pointer offset.
+\end_layout
+
+\begin_layout Enumerate
+If max_dead is non-zero:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Walk the hash chain again and count the dead records.
+\end_layout
+
+\begin_layout Enumerate
+If it's more than max_dead, bulk free all the dead ones (similar to steps
+ 4 and below, but the lock is only obtained once).
+\end_layout
+
+\begin_layout Enumerate
+Simply mark this record as dead and return.
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Get the free list lock for the remainder of this operation.
+\end_layout
+
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "right-merging"
+
+\end_inset
+
+Examine the following block to see if it is free; if so, enlarge the current
+ block and remove that block from the free list.
+ This was disabled, as removal from the free list was O(entries-in-free-list).
+\end_layout
+
+\begin_layout Enumerate
+Examine the preceding block to see if it is free: for this reason, each
+ block has a 32-bit tailer which indicates its length.
+ If it is free, expand it to cover our new block and return.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, prepend ourselves to the free list.
+\end_layout
+
+\begin_layout Standard
+Disabling right-merging (step
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "right-merging"
+
+\end_inset
+
+) causes fragmentation; the other heuristics proved insufficient to address
+ this, so the final answer to this was that when we expand the TDB file
+ inside a transaction commit, we repack the entire tdb.
+\end_layout
+
+\begin_layout Standard
+The single list lock limits our allocation rate; due to the other issues
+ this is not currently seen as a bottleneck.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+\end_layout
+
+\begin_layout Standard
+The free list must be split to reduce contention.
+ Assuming perfect free merging, we can at most have 1 free list entry for
+ each entry.
+ This implies that the number of free lists is related to the size of the
+ hash table, but as it is rare to walk a large number of free list entries
+ we can use far fewer, say 1/32 of the number of hash buckets.
+\end_layout
+
+\begin_layout Standard
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\end_layout
+
+\begin_layout Standard
+There are various benefits in using per-size free lists (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+) but it's not clear this would reduce contention in the common case where
+ all processes are allocating/freeing the same size.
+ Thus we almost certainly need to divide in other ways: the most obvious
+ is to divide the file into zones, and using a free list (or table of free
+ lists) for each.
+ This approximates address ordering.
+\end_layout
+
+\begin_layout Standard
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
+\end_layout
+
+\begin_layout Standard
+The basic algorithm is as follows.
+ Freeing is simple:
+\end_layout
+
+\begin_layout Enumerate
+Identify the correct free list.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+Re-check the list (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+\end_layout
+
+\begin_layout Enumerate
+Place the freed entry in the list.
+\end_layout
+
+\begin_layout Standard
+Allocation is a little more complicated, as we perform delayed coalescing
+ at this point:
+\end_layout
+
+\begin_layout Enumerate
+Pick a free table; usually the previous one.
+\end_layout
+
+\begin_layout Enumerate
+Lock the corresponding list.
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is large enough, remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, coalesce entries in the list. If there was no entry large enough,
+ unlock the list and try the next largest list.
+\end_layout
+
+\begin_layout Enumerate
+If no list has an entry which meets our needs, try the next free table.
+\end_layout
+
+\begin_layout Enumerate
+If no free table satisfies, expand the file.
+\end_layout
+
+\begin_layout Standard
+This optimizes rapid insert/delete of free list entries by not coalescing
+ them all the time.
+ First-fit address ordering seems to be fairly good for keeping
+ fragmentation low (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+).
+ Note that address ordering does not need a tailer to coalesce, though if
+ we needed one we could have one cheaply: see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Records-Incur-A"
+
+\end_inset
+
+.
+
+\end_layout
+
+\begin_layout Standard
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Becomes-Fragmented"
+
+\end_inset
+
+TDB Becomes Fragmented
+\end_layout
+
+\begin_layout Standard
+Much of this is a result of allocation strategy
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1998 ftp://ftp.cs.ute
+xas.edu/pub/garbage/malloc/ismm98.ps
+\end_layout
+
+\end_inset
+
+ and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
+on) is deliberately set at 25%, and external fragmentation is only cured
+ by the decision to repack the entire db when a transaction commit needs
+ to enlarge the file.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The 25% overhead on allocation works in practice for ldb because indexes
+ tend to expand by one record at a time.
+ This internal fragmentation can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+\end_layout
+
+\begin_layout Standard
+There is a spectrum of possible solutions for external fragmentation:
+ one is to use a fragmentation-avoiding allocation strategy such as a best-fit
+ address-order allocator.
+ The other end of the spectrum would be to use a bump allocator (very fast
+ and simple) and simply repack the file when we reach the end.
+\end_layout
+
+\begin_layout Standard
+There are three problems with efficient fragmentation-avoiding allocators:
+ they are non-trivial, they tend to use a single free list for each size,
+ and there's no evidence that tdb allocation patterns will match those recorded
+ for general allocators (though it seems likely).
+\end_layout
+
+\begin_layout Standard
+Thus we don't spend too much effort on external fragmentation; we will be
+ no worse than the current code if we need to repack on occasion.
+ More effort is spent on reducing freelist contention, and reducing overhead.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Records-Incur-A"
+
+\end_inset
+
+Records Incur A 28-Byte Overhead
+\end_layout
+
+\begin_layout Standard
+Each TDB record has a header as follows:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_record {
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_off_t next; /* offset of the next record in the list */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_len_t rec_len; /* total byte length of record */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_len_t key_len; /* byte length of key */
+\end_layout
+
+\begin_layout LyX-Code
+ tdb_len_t data_len; /* byte length of data */
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t full_hash; /* the full 32 bit hash of the key */
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t magic; /* try to catch errors */
+\end_layout
+
+\begin_layout LyX-Code
+ /* the following union is implied:
+\end_layout
+
+\begin_layout LyX-Code
+ union {
+\end_layout
+
+\begin_layout LyX-Code
+ char record[rec_len];
+\end_layout
+
+\begin_layout LyX-Code
+ struct {
+\end_layout
+
+\begin_layout LyX-Code
+ char key[key_len];
+\end_layout
+
+\begin_layout LyX-Code
+ char data[data_len];
+\end_layout
+
+\begin_layout LyX-Code
+ }
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t totalsize; (tailer)
+\end_layout
+
+\begin_layout LyX-Code
+ }
+\end_layout
+
+\begin_layout LyX-Code
+ */
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+Naively, this would double to a 56-byte overhead on a 64 bit implementation.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+We can use various techniques to reduce this for an allocated block:
+\end_layout
+
+\begin_layout Enumerate
+The 'next' pointer is not required, as we are using a flat hash table.
+\end_layout
+
+\begin_layout Enumerate
+'rec_len' can instead be expressed as an addition to key_len and data_len
+ (it accounts for wasted or overallocated length in the record).
+ Since the record length is always a multiple of 8, we can conveniently
+ fit it in 32 bits (representing up to 35 bits).
+\end_layout
+
+\begin_layout Enumerate
+'key_len' and 'data_len' can be reduced.
+ I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
+ the two into one 64-bit field and use a 5 bit value which indicates at
+ what bit to divide the two.
+ Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
+ size of 32 bits.
+\end_layout
+
+\begin_layout Enumerate
+'full_hash' is used to avoid a memcmp on the
+\begin_inset Quotes eld
+\end_inset
+
+miss
+\begin_inset Quotes erd
+\end_inset
+
+ case, but this is diminishing returns after a handful of bits (at 10 bits,
+ it reduces 99.9% of false memcmp).
+ As an aside, as the lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here.
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+'magic' does not need to be enlarged: it currently reflects one of 5 values
+ (used, free, dead, recovery, and unused_recovery).
+ It is useful for quick sanity checking however, and should not be eliminated.
+\end_layout
+
+\begin_layout Enumerate
+'tailer' is only used to coalesce free blocks (so a block to the right can
+ find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of the following
+ block (and the tailer only exists in free blocks).
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+This technique from Thomas Standish.
+ Data Structure Techniques.
+ Addison-Wesley, Reading, Massachusetts, 1980.
+\end_layout
+
+\end_inset
+
+ The current proposed coalescing algorithm doesn't need this, however.
+\end_layout
+
+\begin_layout Standard
+This produces a 16 byte used header like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_used_record {
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t used_magic : 16,
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+ key_data_divide: 5,
+\end_layout
+
+\begin_layout LyX-Code
+ top_hash: 11;
+\end_layout
+
+\begin_layout LyX-Code
+ uint32_t extra_octets;
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t key_and_data_len;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+And a free record like this:
+\end_layout
+
+\begin_layout LyX-Code
+struct tdb_free_record {
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t free_magic: 8,
+\end_layout
+
+\begin_layout LyX-Code
+ prev : 56;
+\end_layout
+
+\begin_layout LyX-Code
+
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+ total_length : 56;
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t next;
+\end_layout
+
+\begin_layout LyX-Code
+};
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291206079
+
+\change_unchanged
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-bit words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Transaction Commit Requires 4 fdatasync
+\end_layout
+
+\begin_layout Standard
+The current transaction algorithm is:
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+write_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+overwrite_with_new_data();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Enumerate
+remove_recovery_header();
+\end_layout
+
+\begin_layout Enumerate
+sync();
+\end_layout
+
+\begin_layout Standard
+On current ext3, each sync flushes all data to disk, so the next 3 syncs
+ are relatively inexpensive.
+ But this could become a performance bottleneck on other filesystems such
+ as ext4.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Neil Brown points out that this is overzealous, and only one sync is needed:
+\end_layout
+
+\begin_layout Enumerate
+Bundle the recovery data, a transaction counter and a strong checksum of
+ the new data.
+\end_layout
+
+\begin_layout Enumerate
+Strong checksum that whole bundle.
+\end_layout
+
+\begin_layout Enumerate
+Store the bundle in the database.
+\end_layout
+
+\begin_layout Enumerate
+Overwrite the oldest of the two recovery pointers in the header (identified
+ using the transaction counter) with the offset of this bundle.
+\end_layout
+
+\begin_layout Enumerate
+sync.
+\end_layout
+
+\begin_layout Enumerate
+Write the new data to the file.
+\end_layout
+
+\begin_layout Standard
+Checking for recovery means identifying the latest bundle with a valid checksum
+ and using the new data checksum to ensure that it has been applied.
+ This is more expensive than the current check, but need only be done at
+ open.
+ For running databases, a separate header field can be used to indicate
+ a transaction in progress; we need only check for recovery if this is set.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:TDB-Does-Not"
+
+\end_inset
+
+TDB Does Not Have Snapshot Support
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution: None.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Standard
+But as a thought experiment, if we implemented transactions to only overwrite
+ free entries (this is tricky: there must not be a header in each entry
+ which indicates whether it is free; instead we must rely on its presence in
+ metadata elsewhere), and had a pointer to the hash table, we could create an
+ entirely new commit without destroying existing data.
+ Then it would be easy to implement snapshots in a similar way.
+\end_layout
+
+\begin_layout Standard
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\end_layout
+
+\begin_layout Standard
+We could then implement snapshots using a similar method, using multiple
+ different hash tables/free tables.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Transactions Cannot Operate in Parallel
+\end_layout
+
+\begin_layout Standard
+This would be useless for ldb, as it hits the index records with just about
+ every update.
+ It would add significant complexity in resolving clashes, and cause
+ all transaction callers to write their code to loop in the case where the
+ transactions spuriously failed.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
+ These would allow one write transaction to begin, but it could not commit
+ until all r/o transactions are done.
+ This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
+ commit.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
+\end_layout
+
+\begin_layout Standard
+The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
+ if we expand it to 64 bits), and works best when the hash bucket size is
+ a prime number (which also means a slow modulus).
+ In addition, it is highly predictable which could potentially lead to a
+ Denial of Service attack in some TDB uses.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The Jenkins lookup3 hash
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+http://burtleburtle.net/bob/c/lookup3.c
+\end_layout
+
+\end_inset
+
+ is a fast and superbly-mixing hash.
+ It's used by the Linux kernel and almost everything else.
+ This has the particular properties that it takes an initial seed, and produces
+ two 32 bit hash numbers, which we can combine into a 64-bit hash.
+\end_layout
+
+\begin_layout Standard
+The seed should be created at tdb-creation time from some random source,
+ and placed in the header.
+ This is far from foolproof, but adds a little bit of protection against
+ hash bombing.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "Reliable-Traversal-Adds"
+
+\end_inset
+
+Reliable Traversal Adds Complexity
+\end_layout
+
+\begin_layout Standard
+We lock a record during traversal iteration, and try to grab that lock in
+ the delete code.
+ If that grab on delete fails, we simply mark it deleted and continue onwards;
+ traversal checks for this condition and does the delete when it moves off
+ the record.
+\end_layout
+
+\begin_layout Standard
+If traversal terminates, the dead record may be left indefinitely.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+Remove reliability guarantees; see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "traverse-Proposed-Solution"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
+\begin_layout Subsection
+Fcntl Locking Adds Overhead
+\end_layout
+
+\begin_layout Standard
+Placing a fcntl lock means a system call, as does removing one.
+ This is actually one reason why transactions can be faster (everything
+ is locked once at transaction start).
+ In the uncontended case, this overhead can theoretically be eliminated.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+We tried this before with spinlock support, in the early days of TDB, and
+ it didn't make much difference except in manufactured benchmarks.
+\end_layout
+
+\begin_layout Standard
+We could use spinlocks (with futex kernel support under Linux), but it means
+ that we lose automatic cleanup when a process dies with a lock.
+ There is a method of auto-cleanup under Linux, but it's not supported by
+ other operating systems.
+ We could reintroduce a clear-if-first-style lock and sweep for dead futexes
+ on open, but that wouldn't help the normal case of one concurrent opener
+ dying.
+ Increasingly elaborate repair schemes could be considered, but they require
+ an ABI change (everyone must use them) anyway, so there's no need to do
+ this at the same time as everything else.
+\end_layout
+
+\begin_layout Subsection
+Some Transactions Don't Require Durability
+\end_layout
+
+\begin_layout Standard
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
+ usage, and occasionally empties the results into a transactional TDB.
+ This kind of usage prioritizes performance over durability: as long as
+ we are consistent, data can be lost.
+\end_layout
+
+\begin_layout Standard
+This would be more neatly implemented inside tdb: a
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ transaction commit (i.e.
+ syncless) which means that data may be reverted on a crash.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
+\end_layout
+
+\begin_layout Standard
+Unfortunately any transaction scheme which overwrites old data requires
+ a sync before that overwrite to avoid the possibility of corruption.
+\end_layout
+
+\begin_layout Standard
+It seems possible to use a scheme similar to that described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:TDB-Does-Not"
+
+\end_inset
+
+, where transactions are committed without overwriting existing data, and
+ an array of top-level pointers is available in the header.
+ If the transaction is
+\begin_inset Quotes eld
+\end_inset
+
+soft
+\begin_inset Quotes erd
+\end_inset
+
+ then we would not need a sync at all: existing processes would pick up
+ the new hash table and free list and work with that.
+\end_layout
+
+\begin_layout Standard
+At some later point, a sync would allow recovery of the old data into the
+ free lists (perhaps when the array of top-level pointers filled).
+ On crash, tdb_open() would examine the array of top levels, and apply the
+ transactions until it encountered an invalid checksum.
+\end_layout
+
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\end_body
+\end_document
+@
+
+
+1.12
+log
+@Add status, some fixes, linked freelists.
+@
+text
+@d53 1
+a53 7
+
+\change_deleted 0 1291204535
+14-September
+\change_inserted 0 1291204533
+1-December
+\change_unchanged
+-2010
+a580 2
+\change_inserted 0 1291204563
+
+a583 2
+
+\change_inserted 0 1291204572
+a587 2
+
+\change_inserted 0 1291204573
+a588 2
+\change_unchanged
+
+a629 2
+\change_inserted 0 1291204588
+
+a632 2
+
+\change_inserted 0 1291204588
+a636 2
+
+\change_inserted 0 1291204631
+a639 2
+\change_unchanged
+
+a693 2
+\change_inserted 0 1291204639
+
+a696 2
+
+\change_inserted 0 1291204640
+d702 1
+a702 1
+\change_inserted 0 1291204665
+d704 2
+a728 2
+\change_inserted 0 1291204671
+
+a731 2
+
+\change_inserted 0 1291204671
+a735 2
+
+\change_inserted 0 1291204673
+a736 2
+\change_unchanged
+
+a780 2
+\change_inserted 0 1291204731
+
+a783 2
+
+\change_inserted 0 1291204732
+a787 2
+
+\change_inserted 0 1291204779
+a790 2
+\change_unchanged
+
+a842 2
+\change_inserted 0 1291204830
+
+a845 2
+
+\change_inserted 0 1291204831
+a849 2
+
+\change_inserted 0 1291204834
+a850 2
+\change_unchanged
+
+d879 9
+a887 2
+ deal of churn; we are better to guarantee that the tdb_errcode is per-thread
+ so the current programming model can be maintained.
+d891 9
+d903 2
+a922 2
+\change_inserted 0 1291204847
+
+a925 2
+
+\change_inserted 0 1291204847
+d930 5
+a934 3
+
+\change_inserted 0 1291204852
+Incomplete.
+a1051 2
+\change_inserted 0 1291204881
+
+a1054 2
+
+\change_inserted 0 1291204881
+a1058 2
+
+\change_inserted 0 1291204885
+a1059 2
+\change_unchanged
+
+a1140 2
+\change_inserted 0 1291204898
+
+a1143 2
+
+\change_inserted 0 1291204898
+a1147 2
+
+\change_inserted 0 1291204901
+a1148 2
+\change_unchanged
+
+a1224 2
+\change_inserted 0 1291204908
+
+a1227 2
+
+\change_inserted 0 1291204908
+a1231 2
+
+\change_inserted 0 1291204908
+a1232 2
+\change_unchanged
+
+a1271 2
+\change_inserted 0 1291204917
+
+a1274 2
+
+\change_inserted 0 1291204917
+a1278 2
+
+\change_inserted 0 1291204920
+a1279 2
+\change_unchanged
+
+a1316 2
+\change_inserted 0 1291204927
+
+a1319 2
+
+\change_inserted 0 1291204928
+d1325 1
+a1325 1
+\change_inserted 0 1291204942
+d1327 2
+a1381 2
+\change_inserted 0 1291205003
+
+a1384 2
+
+\change_inserted 0 1291205004
+a1388 2
+
+\change_inserted 0 1291205007
+a1411 2
+\change_inserted 0 1291205019
+
+a1414 2
+
+\change_inserted 0 1291205019
+a1418 2
+
+\change_inserted 0 1291205023
+a1419 2
+\change_unchanged
+
+a1465 2
+\change_inserted 0 1291205029
+
+a1468 2
+
+\change_inserted 0 1291205029
+a1472 2
+
+\change_inserted 0 1291206020
+a1473 2
+\change_unchanged
+
+a1528 2
+\change_inserted 0 1291205043
+
+a1531 2
+
+\change_inserted 0 1291205043
+d1537 1
+a1537 1
+\change_inserted 0 1291205057
+d1539 2
+a1589 2
+\change_inserted 0 1291205062
+
+a1592 2
+
+\change_inserted 0 1291205062
+a1596 2
+
+\change_inserted 0 1291205062
+a1597 2
+\change_unchanged
+
+a1626 2
+\change_inserted 0 1291205072
+
+a1629 2
+
+\change_inserted 0 1291205073
+a1633 2
+
+\change_inserted 0 1291205073
+a1634 2
+\change_unchanged
+
+a1674 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a1699 2
+\change_inserted 0 1291205079
+
+a1702 2
+
+\change_inserted 0 1291205080
+a1706 2
+
+\change_inserted 0 1291205080
+a1707 2
+\change_unchanged
+
+a1833 2
+\change_inserted 0 1291205090
+
+d1869 2
+a1870 7
+ is to divide the file into zones, and using a free list (or
+\change_inserted 0 1291205498
+table
+\change_deleted 0 1291205497
+set
+\change_unchanged
+ of free lists) for each.
+a1871 2
+\change_inserted 0 1291205203
+
+a1874 2
+
+\change_inserted 0 1291205358
+a1890 21
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205198
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\change_unchanged
+
+d1899 1
+a1899 7
+Identify the correct
+\change_inserted 0 1291205366
+free list
+\change_deleted 0 1291205364
+zone
+\change_unchanged
+.
+d1907 2
+a1908 7
+Re-check the
+\change_inserted 0 1291205372
+list
+\change_deleted 0 1291205371
+zone
+\change_unchanged
+ (we didn't have a lock, sizes could have changed): relock if necessary.
+d1912 1
+a1912 5
+Place the freed entry in the list
+\change_deleted 0 1291205382
+ for that zone
+\change_unchanged
+.
+d1921 1
+a1921 15
+Pick a
+\change_deleted 0 1291205403
+zone either the zone we last freed into, or based on a
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\change_inserted 0 1291205411
+free table; usually the previous one.
+\change_unchanged
+
+a1925 10
+\change_deleted 0 1291205432
+
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1291205428
+Re-check the zone: relock if necessary.
+\change_unchanged
+
+d1934 1
+a1934 7
+ unlock the list and try the next
+\change_inserted 0 1291205455
+largest list
+\change_deleted 0 1291205452
+zone.
+\change_inserted 0 1291205457
+
+a1937 2
+
+\change_inserted 0 1291205476
+a1938 2
+\change_unchanged
+
+a1966 2
+\change_inserted 0 1291205542
+
+a1969 2
+
+\change_inserted 0 1291205591
+a1971 70
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205539
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205534
+\begin_inset CommandInset label
+LatexCommand label
+name "freelist-in-zone"
+
+\end_inset
+
+If we want to avoid locking complexity (enlarging the free lists when we
+ enlarge the file) we could place the array of free lists at the beginning
+ of each zone.
+ This means existing array lists never move, but means that a record cannot
+ be larger than a zone.
+ That in turn implies that zones should be variable sized (say, power of
+ 2), which makes the question
+\begin_inset Quotes eld
+\end_inset
+
+what zone is this record in?
+\begin_inset Quotes erd
+\end_inset
+
+ much harder (and
+\begin_inset Quotes eld
+\end_inset
+
+pick a random zone
+\begin_inset Quotes erd
+\end_inset
+
+, but that's less common).
+ It could be done with as few as 4 bits from the record header.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Using
+\begin_inset Formula $2^{16+N*3}$
+\end_inset
+
+means 0 gives a minimal 65536-byte zone, 15 gives the maximal
+\begin_inset Formula $2^{61}$
+\end_inset
+
+ byte zone.
+ Zones range in factor of 8 steps.
+ Given the zone size for the zone the current record is in, we can determine
+ the start of the zone.
+\end_layout
+
+\end_inset
+
+
+\change_inserted 0 1291205139
+
+d2218 1
+a2218 5
+ uint32_t
+\change_inserted 0 1291205758
+used_
+\change_unchanged
+magic : 16,
+a2222 4
+\change_deleted 0 1291205693
+ prev_is_free: 1,
+\change_unchanged
+
+d2230 1
+a2230 7
+ top_hash: 1
+\change_inserted 0 1291205704
+1
+\change_deleted 0 1291205704
+0
+\change_unchanged
+;
+d2254 1
+a2254 9
+ uint
+\change_inserted 0 1291205725
+64
+\change_deleted 0 1291205723
+32
+\change_unchanged
+_t
+\change_inserted 0 1291205753
+free_magic: 8,
+a2257 2
+
+\change_inserted 0 1291205746
+a2262 24
+\change_deleted 0 1291205749
+free_magic;
+\change_unchanged
+
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t
+\change_inserted 0 1291205786
+free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+
+\change_inserted 0 1291205788
+
+\change_unchanged
+total_length
+\change_inserted 0 1291205792
+ : 56
+\change_deleted 0 1291205790
+;
+\change_unchanged
+
+d2266 1
+a2266 7
+ uint64_t
+\change_deleted 0 1291205801
+prev,
+\change_unchanged
+next;
+\change_deleted 0 1291205811
+
+d2270 1
+a2270 3
+
+\change_deleted 0 1291205811
+ ...
+d2274 1
+a2274 5
+
+\change_deleted 0 1291205808
+ uint64_t tailer
+\change_unchanged
+;
+d2283 5
+a2287 16
+\change_deleted 0 1291205827
+We might want to take some bits from the used record's top_hash (and the
+ free record which has 32 bits of padding to spare anyway) if we use variable
+ sized zones.
+ See
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "freelist-in-zone"
+
+\end_inset
+
+.
+
+\change_inserted 0 1291205885
+ Note that by limiting valid offsets to 56 bits, we can pack everything
+ we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
+a2290 2
+
+\change_inserted 0 1291205886
+a2294 2
+
+\change_inserted 0 1291205886
+a2295 2
+\change_unchanged
+
+a2385 2
+\change_inserted 0 1291205894
+
+a2388 2
+
+\change_inserted 0 1291205894
+a2392 2
+
+\change_inserted 0 1291205902
+a2393 2
+\change_unchanged
+
+a2415 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a2445 2
+\change_inserted 0 1291205910
+
+a2448 2
+
+\change_inserted 0 1291205910
+a2452 2
+
+\change_inserted 0 1291205914
+a2453 2
+\change_unchanged
+
+a2485 2
+\change_inserted 0 1291205919
+
+a2488 2
+
+\change_inserted 0 1291205919
+a2492 2
+
+\change_inserted 0 1291205922
+a2493 2
+\change_unchanged
+
+a2533 2
+\change_inserted 0 1291205929
+
+a2536 2
+
+\change_inserted 0 1291205929
+a2540 2
+
+\change_inserted 0 1291205929
+a2541 2
+\change_unchanged
+
+a2578 2
+\change_inserted 0 1291205932
+
+a2581 2
+
+\change_inserted 0 1291205933
+a2585 2
+
+\change_inserted 0 1291205933
+a2586 2
+\change_unchanged
+
+a2724 2
+\change_inserted 0 1291205944
+
+a2727 2
+
+\change_inserted 0 1291205945
+a2731 2
+
+\change_inserted 0 1291205948
+a2732 2
+\change_unchanged
+
+@
+
+
+1.11
+log
+@Merge changes
+@
+text
+@d53 7
+a59 1
+14-September-2010
+d587 16
+d644 18
+d716 16
+d753 16
+d813 18
+d883 16
+d953 16
+d1084 16
+d1181 16
+d1273 16
+d1328 16
+d1381 16
+d1447 19
+a1465 2
+ if older code (which doesn't understand the feature) writes to the database.Reco
+rd Headers Are Not Expandible
+d1484 16
+d1546 16
+d1617 16
+d1680 16
+d1725 16
+d1810 16
+d1951 8
+a1958 3
+Proposed SolutionThe first step is to remove all the current heuristics,
+ as they obviously interact, then examine them once the lock contention
+ is addressed.
+d1989 7
+a1995 2
+ is to divide the file into zones, and using a free list (or set of free
+ lists) for each.
+d1997 2
+d2002 25
+d2039 2
+d2049 7
+a2055 1
+Identify the correct zone.
+d2063 7
+a2069 2
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+d2073 5
+a2077 1
+Place the freed entry in the list for that zone.
+d2086 3
+a2088 1
+Pick a zone either the zone we last freed into, or based on a
+d2097 4
+d2105 2
+d2110 2
+d2113 2
+d2123 15
+a2137 1
+ unlock the list and try the next zone.
+d2166 11
+d2180 2
+d2185 2
+d2190 2
+d2223 1
+a2223 1
+status open
+d2243 2
+d2491 5
+a2495 1
+ uint32_t magic : 16,
+d2499 2
+d2502 2
+d2511 7
+a2517 1
+ top_hash: 10;
+d2541 29
+a2569 1
+ uint32_t free_magic;
+d2573 11
+a2583 1
+ uint64_t total_length;
+d2587 7
+a2593 1
+ uint64_t prev, next;
+d2597 2
+d2603 5
+a2607 1
+ uint64_t tailer;
+d2615 2
+d2628 18
+d2736 16
+d2808 16
+d2856 16
+d2912 16
+d2965 16
+d3119 16
+@
+
+
+1.10
+log
+@Tracing attribute, talloc support.
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+d53 1
+a53 7
+
+\change_deleted 0 1283307542
+26-July
+\change_inserted 0 1284423485
+14-September
+\change_unchanged
+-2010
+a472 2
+\change_inserted 0 1284422789
+
+a479 2
+\change_unchanged
+
+a838 2
+
+\change_inserted 0 1284016998
+a846 2
+\change_unchanged
+
+a1194 2
+\change_inserted 0 1284015637
+
+a1197 2
+
+\change_inserted 0 1284015716
+a1201 2
+
+\change_inserted 0 1284015906
+a1210 2
+
+\change_inserted 0 1284015637
+a1214 2
+
+\change_inserted 0 1284016114
+a1227 2
+
+\change_inserted 0 1284016149
+a1232 2
+
+\change_inserted 0 1284016639
+a1237 2
+
+\change_inserted 0 1284016821
+a1243 2
+
+\change_inserted 0 1284016803
+d1245 2
+a1246 9
+ if older code (which doesn't understand the feature) writes to the database.
+\change_deleted 0 1284016101
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284015634
+Record Headers Are Not Expandible
+a1249 2
+
+\change_inserted 0 1284015634
+a1254 2
+
+\change_inserted 0 1284015634
+a1258 2
+
+\change_inserted 0 1284422552
+a1267 2
+
+\change_inserted 0 1284422568
+a1271 2
+
+\change_inserted 0 1284422646
+a1276 2
+
+\change_inserted 0 1284422656
+a1280 2
+
+\change_inserted 0 1284423065
+a1305 2
+
+\change_inserted 0 1284423042
+a1310 2
+\change_unchanged
+
+a1457 2
+
+\change_inserted 0 1283336713
+a1463 2
+
+\change_unchanged
+d1482 2
+d1485 1
+a1485 51
+\change_deleted 0 1283307675
+There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ It will either be placed at the head (other entries will be moved out the
+ way so we can expand).
+ We could have a pointer in the header to the current hashtable location,
+ but that pointer would have to be read frequently to check for hashtable
+ moves.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will effect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+
+\change_inserted 0 1283307770
+a1492 2
+
+\change_inserted 0 1283336187
+a1500 2
+
+\change_inserted 0 1283336586
+a1510 2
+\change_unchanged
+
+d1636 3
+a1638 8
+Proposed Solution
+\change_deleted 0 1283336858
+
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+a1647 2
+\change_inserted 0 1283336910
+
+a1650 2
+
+\change_inserted 0 1283337052
+a1655 2
+\change_unchanged
+
+a1776 2
+\change_inserted 0 1283309850
+
+a1779 2
+
+\change_inserted 0 1283337216
+a1813 2
+
+\change_inserted 0 1284424151
+a1825 2
+\change_unchanged
+
+a1830 2
+\change_unchanged
+
+a2031 2
+
+\change_inserted 0 1283336739
+a2040 2
+\change_unchanged
+
+a2117 2
+\change_inserted 0 1283337133
+
+a2120 2
+
+\change_inserted 0 1283337139
+a2121 2
+\change_unchanged
+
+a2136 2
+
+\change_inserted 0 1283337235
+a2147 2
+\change_unchanged
+
+d2251 1
+a2251 7
+Proposed Solution
+\change_deleted 0 1284423472
+
+\end_layout
+
+\begin_layout Standard
+None.
+d2261 1
+a2261 1
+\change_inserted 0 1284423891
+d2263 1
+a2263 4
+\change_deleted 0 1284423891
+.
+
+\change_inserted 0 1284423901
+a2271 2
+\change_unchanged
+
+a2293 2
+\change_inserted 0 1284423495
+
+a2312 2
+
+\change_inserted 0 1284424201
+d2321 1
+a2321 3
+
+\change_unchanged
+We could solve a small part of the problem by providing read-only transactions.
+a2505 2
+\change_inserted 0 1284423555
+
+a2508 2
+
+\change_inserted 0 1284423617
+a2512 2
+
+\change_inserted 0 1284423719
+a2519 2
+
+\change_inserted 0 1284423864
+a2530 2
+
+\change_inserted 0 1284423850
+a2540 2
+\change_unchanged
+
+@
+
+
+1.9
+log
+@Extension mechanism.
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1284016854
+9-September
+d479 11
+d1303 1
+a1303 1
+\change_inserted 0 1284016847
+d1310 56
+d1945 1
+a1945 1
+\change_inserted 0 1283310945
+d1956 2
+d2402 2
+d2416 4
+d2421 12
+d2455 2
+d2476 12
+d2673 47
+@
+
+
+1.8
+log
+@Remove bogus footnote
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1283307544
+1-September
+d838 12
+d1198 103
+@
+
+
+1.7
+log
+@Moving hash table does not work.
+@
+text
+@a1436 12
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+
+\change_inserted 0 1283336450
+If we make the hash offsets zone-relative, then this only restricts the
+ zone size, not the overall database size.
+\end_layout
+
+\end_inset
+
+@
+
+
+1.6
+log
+@Commit changes
+@
+text
+@d38 1
+a38 1
+\author ""
+d53 7
+a59 1
+26-July-2010
+d1333 10
+d1361 3
+a1363 1
+ There are three details which become important:
+d1367 2
+d1373 2
+d1379 2
+d1385 2
+d1397 2
+d1407 2
+d1411 45
+d1582 2
+d1598 14
+d1733 62
+d1996 13
+d2086 10
+d2110 15
+a2124 1
+\begin_layout LyX-Code
+@
+
+
+1.5
+log
+@Soft transaction commit
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,,"
+a52 4
+
+\change_deleted 0 1280141199
+10-May-2010
+\change_inserted 0 1280141202
+a53 2
+\change_unchanged
+
+a2028 2
+
+\change_inserted 0 1280140902
+a2034 2
+
+\change_unchanged
+a2212 2
+\change_inserted 0 1280140661
+
+a2215 2
+
+\change_inserted 0 1280140703
+a2219 2
+
+\change_inserted 0 1280708312
+a2226 2
+
+\change_inserted 0 1280708400
+a2239 2
+
+\change_inserted 0 1280140836
+a2243 2
+
+\change_inserted 0 1280708255
+a2247 2
+
+\change_inserted 0 1280708374
+a2252 2
+
+\change_inserted 0 1280141181
+a2274 2
+
+\change_inserted 0 1280141345
+@
+
+
+1.4
+log
+@Merge changes
+@
+text
+@d38 1
+a38 1
+\author ""
+d53 2
+d56 4
+d2035 10
+d2223 84
+@
+
+
+1.3
+log
+@Transaction and freelist rethink.
+@
+text
+@d38 1
+a38 1
+\author "Rusty Russell,,,"
+d53 1
+a53 1
+27-April-2010
+d662 1
+a662 5
+ behavior of disallowing
+\change_inserted 0 1272940179
+nested
+\change_unchanged
+transactions should become the default.
+a1210 2
+\change_inserted 0 1272944650
+
+a1214 2
+
+\change_inserted 0 1272944763
+a1218 2
+\change_unchanged
+
+a1223 2
+\change_unchanged
+
+a1301 2
+
+\change_inserted 0 1273478114
+a1310 2
+\change_unchanged
+
+d1515 1
+a1515 11
+The free list
+\change_deleted 0 1273469807
+should
+\change_inserted 0 1273469810
+must
+\change_unchanged
+ be split
+\change_deleted 0 1273469815
+into multiple lists
+\change_unchanged
+to reduce contention.
+a1520 2
+\change_inserted 0 1273470006
+
+a1523 2
+
+\change_inserted 0 1273492055
+a1539 2
+
+\change_inserted 0 1273483888
+a1551 2
+\change_unchanged
+
+a1554 8
+
+\change_deleted 0 1272942055
+There are various ways to organize these lisys, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each free list covers some fixed fraction of the file).
+
+\change_inserted 0 1273484187
+d1556 1
+a1556 7
+
+\change_deleted 0 1273484194
+The algorithm for f
+\change_inserted 0 1273484194
+F
+\change_unchanged
+reeing is simple:
+d1560 1
+a1560 7
+Identify the correct
+\change_deleted 0 1273482856
+free list
+\change_inserted 0 1273482857
+zone
+\change_unchanged
+.
+d1564 1
+a1564 7
+Lock the
+\change_inserted 0 1273482895
+corresponding
+\change_unchanged
+list
+\change_inserted 0 1273482863
+.
+a1567 2
+
+\change_inserted 0 1273482909
+d1573 1
+a1573 13
+
+\change_deleted 0 1273482885
+, and p
+\change_inserted 0 1273482888
+P
+\change_unchanged
+lace the freed entry
+\change_deleted 0 1273492415
+at the head
+\change_inserted 0 1273492415
+in the list for that zone
+\change_unchanged
+.
+d1577 2
+a1578 7
+Allocation is a little more complicated, as we
+\change_deleted 0 1273483240
+merge entries as we walk the list:
+\change_inserted 0 1273484250
+perform delayed coalescing at this point:
+\change_unchanged
+
+d1582 1
+a1582 19
+Pick a
+\change_deleted 0 1273482955
+free list;
+\change_inserted 0 1273482957
+zone
+\change_unchanged
+ either the
+\change_deleted 0 1273482962
+list
+\change_inserted 0 1273482962
+zone
+\change_unchanged
+ we last freed
+\change_deleted 0 1273482966
+o
+\change_inserted 0 1273482966
+i
+\change_unchanged
+nto, or based on a
+d1594 1
+a1594 9
+Lock th
+\change_inserted 0 1273482980
+e corresponding
+\change_deleted 0 1273482973
+at
+\change_unchanged
+ list.
+\change_inserted 0 1273482982
+
+a1597 2
+
+\change_inserted 0 1273483084
+a1598 53
+\change_unchanged
+
+\end_layout
+
+\begin_layout Enumerate
+If the top entry is
+\change_deleted 0 1273492155
+well-sized,
+\change_inserted 0 1273492159
+-large enough,
+\change_unchanged
+remove it from the list and return it.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise,
+\change_inserted 0 1273492206
+coalesce entries in the list.
+\change_deleted 0 1273492200
+examine the entry to the right of it in the file.
+ If it is free:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If that entry is in a different list, lock that list too.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+If we had to place a new lock, re-check that the entry is free.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273492200
+Remove that entry from its free list and expand this entry to cover it.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1273485554
+Goto step 3.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+
+\change_inserted 0 1273485311
+If there was no entry large enough, unlock the list and try the next zone.
+d1602 1
+a1602 5
+
+\change_deleted 0 1273483646
+Repeat step 3 with each entry in the list.
+\change_unchanged
+
+d1606 2
+a1607 5
+
+\change_deleted 0 1273483668
+Unlock the list and repeat step 2 with the next list.
+\change_unchanged
+
+d1611 1
+a1611 7
+If no
+\change_deleted 0 1273483671
+list
+\change_inserted 0 1273483671
+zone
+\change_unchanged
+ satisfies, expand the file.
+d1615 2
+a1616 9
+This optimizes rapid insert/delete of free list entries
+\change_inserted 0 1273485794
+ by not coalescing them all the time.
+\change_deleted 0 1273483685
+, and allows us to get rid of the tailer altogether
+\change_unchanged
+.
+
+\change_inserted 0 1273492299
+a1638 39
+
+\change_deleted 0 1273476840
+The question of
+\begin_inset Quotes eld
+\end_inset
+
+well-sized
+\begin_inset Quotes erd
+\end_inset
+
+ free entries is more difficult: the 25% overhead works in practice for
+ ldb because indexes tend to expand by one record at a time.
+ This can be resolved by having an
+\begin_inset Quotes eld
+\end_inset
+
+expanded
+\begin_inset Quotes erd
+\end_inset
+
+ bit in the header to note entries that have previously expanded, and allocating
+ more space for them.
+ Whether the
+\begin_inset Quotes eld
+\end_inset
+
+increasing slack
+\begin_inset Quotes erd
+\end_inset
+
+ algorithm should be implemented or first-fit used is still unknown: we
+ will determine this once these other ideas are implemented.
+\change_inserted 0 1273483750
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273492450
+a1644 2
+
+\change_inserted 0 1273470441
+a1654 2
+
+\change_inserted 0 1273476556
+a1659 2
+
+\change_inserted 0 1273470423
+a1661 2
+\change_unchanged
+
+a1672 2
+
+\change_inserted 0 1273476847
+a1676 2
+
+\change_inserted 0 1273476886
+a1691 2
+
+\change_inserted 0 1273477233
+a1699 2
+
+\change_inserted 0 1273477534
+a1706 2
+
+\change_inserted 0 1273482700
+a1712 2
+
+\change_inserted 0 1273478079
+a1722 2
+
+\change_inserted 0 1273477839
+a1726 2
+
+\change_inserted 0 1273477925
+a1730 2
+
+\change_inserted 0 1273477925
+a1734 2
+
+\change_inserted 0 1273477925
+a1738 2
+
+\change_inserted 0 1273477925
+a1742 2
+
+\change_inserted 0 1273477925
+a1746 2
+
+\change_inserted 0 1273477925
+a1750 2
+
+\change_inserted 0 1273477925
+a1754 2
+
+\change_inserted 0 1273477925
+a1758 2
+
+\change_inserted 0 1273477925
+a1762 2
+
+\change_inserted 0 1273477925
+a1766 2
+
+\change_inserted 0 1273477925
+a1770 2
+
+\change_inserted 0 1273477925
+a1774 2
+
+\change_inserted 0 1273477925
+a1778 2
+
+\change_inserted 0 1273477925
+a1782 2
+
+\change_inserted 0 1273477925
+a1786 2
+
+\change_inserted 0 1273477925
+a1790 2
+
+\change_inserted 0 1273477925
+a1794 2
+
+\change_inserted 0 1273477925
+a1798 2
+
+\change_inserted 0 1273492522
+a1802 2
+
+\change_inserted 0 1273492530
+a1806 2
+
+\change_inserted 0 1273492546
+a1810 2
+
+\change_inserted 0 1273478239
+a1814 2
+
+\change_inserted 0 1273479960
+a1821 2
+
+\change_inserted 0 1273480265
+a1830 2
+
+\change_inserted 0 1273480354
+a1845 2
+
+\change_inserted 0 1273478968
+a1851 2
+
+\change_inserted 0 1273492604
+a1859 2
+
+\change_inserted 0 1273479572
+a1862 2
+\change_unchanged
+
+a1870 2
+
+\change_inserted 0 1273480282
+a1874 2
+
+\change_inserted 0 1273478931
+a1878 2
+
+\change_inserted 0 1273481549
+a1882 2
+
+\change_inserted 0 1273481557
+a1886 2
+
+\change_inserted 0 1273480307
+a1890 2
+
+\change_inserted 0 1273480335
+a1894 2
+
+\change_inserted 0 1273479897
+a1898 2
+
+\change_inserted 0 1273479653
+a1902 2
+
+\change_inserted 0 1273480371
+a1906 2
+
+\change_inserted 0 1273480464
+a1910 2
+
+\change_inserted 0 1273480399
+a1914 2
+
+\change_inserted 0 1273480425
+a1918 2
+
+\change_inserted 0 1273480453
+a1922 2
+
+\change_inserted 0 1273480455
+a1926 2
+
+\change_inserted 0 1273480450
+a1930 2
+
+\change_inserted 0 1273480452
+a1935 2
+\change_inserted 0 1273478830
+
+a1942 5
+
+\change_deleted 0 1273481604
+In theory, we could get away with 2: one after we write the new data, and
+ one to somehow atomically change over to it.
+\change_inserted 0 1273481632
+a1946 2
+
+\change_inserted 0 1273481724
+a1950 2
+
+\change_inserted 0 1273481713
+a1954 2
+
+\change_inserted 0 1273481717
+a1958 2
+
+\change_inserted 0 1273481730
+a1962 2
+
+\change_inserted 0 1273481736
+a1966 2
+
+\change_inserted 0 1273481744
+a1970 2
+
+\change_inserted 0 1273481748
+a1974 2
+
+\change_inserted 0 1273482185
+a1978 2
+
+\change_inserted 0 1273482259
+a1989 50
+
+\change_deleted 0 1273481848
+None.
+ Trying to rewrite the transaction code is a separate experiment, which
+ I encourage someone else to do.
+ At some point you say
+\begin_inset Quotes eld
+\end_inset
+
+use a real database
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481848
+But as a thought experiment:
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+Say there was a pointer in the header which said where the hash table and
+ free list tables were, and that no blocks were labeled with whether they
+ were free or not (it had to be derived from what list they were in).
+ We could create new hash table and free list in some free space, and populate
+ it as we want the post-committed state to look.
+ Then we sync, then we switch the offset in the header, then we sync again.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1273481788
+This would not allow arbitrary changes to the database, such as tdb_repack
+ does, and would require more space (since we have to preserve the current
+ and future entries at once).
+ If we used hash trees rather than one big hash table, we might only have
+ to rewrite some sections of the hash, too.
+\change_inserted 0 1273481854
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1273482102
+a1993 2
+
+\change_inserted 0 1273482061
+a1998 2
+
+\change_inserted 0 1273482063
+a2002 2
+
+\change_inserted 0 1273482072
+a2006 2
+
+\change_inserted 0 1273482139
+a2011 2
+
+\change_inserted 0 1273482364
+a2015 2
+
+\change_inserted 0 1273482163
+a2019 2
+
+\change_inserted 0 1273482493
+a2037 2
+
+\change_inserted 0 1273482536
+a2046 2
+\change_unchanged
+
+a2049 2
+
+\change_inserted 0 1273482641
+a2058 2
+
+\change_inserted 0 1273481827
+d2067 2
+a2068 11
+We could
+\change_inserted 0 1273481829
+then
+\change_unchanged
+implement snapshots using a similar method
+\change_deleted 0 1273481838
+ to the above, only
+\change_inserted 0 1273481840
+,
+\change_unchanged
+ using multiple different hash tables/free tables.
+@
+
+
+1.2
+log
+@After first feedback (Ronnie & Volker)
+@
+text
+@d1314 13
+d1531 11
+a1541 1
+The free list should be split into multiple lists to reduce contention.
+d1547 39
+d1596 7
+d1604 1
+a1604 1
+The algorithm for freeing is simple:
+d1608 7
+a1614 1
+Identify the correct free list.
+d1618 30
+a1647 1
+Lock the list, and place the freed entry at the head.
+d1651 7
+a1657 2
+Allocation is a little more complicated, as we merge entries as we walk
+ the list:
+d1661 19
+a1679 1
+Pick a free list; either the list we last freed onto, or based on a
+d1691 17
+a1707 1
+Lock that list.
+d1711 7
+a1717 1
+If the top entry is well-sized, remove it from the list and return it.
+d1721 5
+a1725 1
+Otherwise, examine the entry to the right of it in the file.
+d1731 2
+d1737 2
+d1743 2
+d1749 2
+d1756 8
+d1765 2
+d1770 2
+d1773 2
+d1778 7
+a1784 1
+If no list satisfies, expand the file.
+d1788 28
+a1815 2
+This optimizes rapid insert/delete of free list entries, and allows us to
+ get rid of the tailer altogether.
+d1819 2
+d1851 1
+a1851 1
+\change_inserted 0 1272941474
+d1857 303
+a2159 18
+\change_inserted 0 1272942759
+There are various ways to organize these lists, but because we want to be
+ able to quickly identify which free list an entry is in, and reduce the
+ number of locks required for merging, we will use zoning (eg.
+ each of the N free lists in a tdb file of size M covers a fixed fraction
+ M/N).
+ Note that this means we need to reshuffle the free lists when we expand
+ the file; this is probably acceptable when we double the hash table size,
+ since that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+d2164 107
+d2276 2
+d2280 59
+d2346 2
+d2363 2
+d2366 2
+d2371 2
+d2382 2
+d2389 57
+d2458 13
+d2474 32
+a2505 2
+We could implement snapshots using a similar method to the above, only using
+ multiple different hash tables/free tables.
+@
+
+
+1.1
+log
+@Initial revision
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.4 created this file. For more info see http://www.lyx.org/
+d36 3
+a38 3
+\tracking_changes false
+\output_changes false
+\author ""
+d662 5
+a666 1
+ behavior of disallowing transactions should become the default.
+d1215 21
+d1527 2
+d1533 3
+a1535 1
+ The algorithm for freeing is simple:
+d1642 26
+@
diff --git a/lib/tdb2/doc/design.pdf b/lib/tdb2/doc/design.pdf
new file mode 100644
index 00000000000..558dc1f8c2e
--- /dev/null
+++ b/lib/tdb2/doc/design.pdf
Binary files differ
diff --git a/lib/tdb2/doc/design.txt b/lib/tdb2/doc/design.txt
new file mode 100644
index 00000000000..bd2ffde4db6
--- /dev/null
+++ b/lib/tdb2/doc/design.txt
@@ -0,0 +1,1258 @@
+TDB2: Redesigning The Trivial DataBase
+
+Rusty Russell, IBM Corporation
+
+1-December-2010
+
+Abstract
+
+The Trivial DataBase on-disk format is 32 bits; with usage cases
+heading towards the 4G limit, that must change. This required
+breakage provides an opportunity to revisit TDB's other design
+decisions and reassess them.
+
+1 Introduction
+
+The Trivial DataBase was originally written by Andrew Tridgell as
+a simple key/data pair storage system with the same API as dbm,
+but allowing multiple readers and writers while being small
+enough (< 1000 lines of C) to include in SAMBA. The simple design
+created in 1999 has proven surprisingly robust and performant,
+used in Samba versions 3 and 4 as well as numerous other
+projects. Its useful life was greatly increased by the
+(backwards-compatible!) addition of transaction support in 2005.
+
+The wider variety and greater demands of TDB-using code have led
+to some organic growth of the API, as well as some compromises on
+the implementation. None of these, by themselves, are seen as
+show-stoppers, but the cumulative effect is a loss of elegance
+over the initial, simple TDB implementation. Here is a table of
+the approximate number of lines of implementation code and number
+of API functions at the end of each year:
+
+
++-----------+----------------+--------------------------------+
+| Year End | API Functions | Lines of C Code Implementation |
++-----------+----------------+--------------------------------+
++-----------+----------------+--------------------------------+
+| 1999 | 13 | 1195 |
++-----------+----------------+--------------------------------+
+| 2000 | 24 | 1725 |
++-----------+----------------+--------------------------------+
+| 2001 | 32 | 2228 |
++-----------+----------------+--------------------------------+
+| 2002 | 35 | 2481 |
++-----------+----------------+--------------------------------+
+| 2003 | 35 | 2552 |
++-----------+----------------+--------------------------------+
+| 2004 | 40 | 2584 |
++-----------+----------------+--------------------------------+
+| 2005 | 38 | 2647 |
++-----------+----------------+--------------------------------+
+| 2006 | 52 | 3754 |
++-----------+----------------+--------------------------------+
+| 2007 | 66 | 4398 |
++-----------+----------------+--------------------------------+
+| 2008 | 71 | 4768 |
++-----------+----------------+--------------------------------+
+| 2009 | 73 | 5715 |
++-----------+----------------+--------------------------------+
+
+
+This review is an attempt to catalog and address all the known
+issues with TDB and create solutions which address the problems
+without significantly increasing complexity; all involved are far
+too aware of the dangers of second system syndrome in rewriting a
+successful project like this.
+
+2 API Issues
+
+2.1 tdb_open_ex Is Not Expandable
+
+The tdb_open() call was expanded to tdb_open_ex(), which added an
+optional hashing function and an optional logging function
+argument. Additional arguments to open would require the
+introduction of a tdb_open_ex2 call etc.
+
+2.1.1 Proposed Solution<attributes>
+
+tdb_open() will take a linked-list of attributes:
+
+enum tdb_attribute {
+
+ TDB_ATTRIBUTE_LOG = 0,
+
+ TDB_ATTRIBUTE_HASH = 1
+
+};
+
+struct tdb_attribute_base {
+
+ enum tdb_attribute attr;
+
+ union tdb_attribute *next;
+
+};
+
+struct tdb_attribute_log {
+
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG
+*/
+
+ tdb_log_func log_fn;
+
+ void *log_private;
+
+};
+
+struct tdb_attribute_hash {
+
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH
+*/
+
+ tdb_hash_func hash_fn;
+
+ void *hash_private;
+
+};
+
+union tdb_attribute {
+
+ struct tdb_attribute_base base;
+
+ struct tdb_attribute_log log;
+
+ struct tdb_attribute_hash hash;
+
+};
+
+This allows future attributes to be added, even if this expands
+the size of the union.
+
+2.1.2 Status
+
+Complete.
+
+2.2 tdb_traverse Makes Impossible Guarantees
+
+tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions,
+and it was thought that it was important to guarantee that all
+records which exist at the start and end of the traversal would
+be included, and no record would be included twice.
+
+This adds complexity (see [Reliable-Traversal-Adds]) and does not
+work anyway for records which are altered (in particular, those
+which are expanded may be effectively deleted and re-added behind
+the traversal).
+
+2.2.1 <traverse-Proposed-Solution>Proposed Solution
+
+Abandon the guarantee. You will see every record if no changes
+occur during your traversal, otherwise you will see some subset.
+You can prevent changes by using a transaction or the locking
+API.
+
+2.2.2 Status
+
+Complete. Delete-during-traverse will still delete every record,
+too (assuming no other changes).
+
+2.3 Nesting of Transactions Is Fraught
+
+TDB has alternated between allowing nested transactions and not
+allowing them. Various paths in the Samba codebase assume that
+transactions will nest, and in a sense they can: the operation is
+only committed to disk when the outer transaction is committed.
+There are two problems, however:
+
+1. Canceling the inner transaction will cause the outer
+ transaction commit to fail, and will not undo any operations
+ since the inner transaction began. This problem is soluble with
+ some additional internal code.
+
+2. An inner transaction commit can be cancelled by the outer
+ transaction. This is desirable in the way which Samba's
+ database initialization code uses transactions, but could be a
+ surprise to any users expecting a successful transaction commit
+ to expose changes to others.
+
+The current solution is to specify the behavior at tdb_open(),
+with the default currently that nested transactions are allowed.
+This flag can also be changed at runtime.
+
+2.3.1 Proposed Solution
+
+Given the usage patterns, it seems that the “least-surprise”
+behavior of disallowing nested transactions should become the
+default. Additionally, it seems the outer transaction is the only
+code which knows whether inner transactions should be allowed, so
+a flag to indicate this could be added to tdb_transaction_start.
+However, this behavior can be simulated with a wrapper which uses
+tdb_add_flags() and tdb_remove_flags(), so the API should not be
+expanded for this relatively-obscure case.
+
+2.3.2 Status
+
+Incomplete; nesting flag is still defined as per tdb1.
+
+2.4 Incorrect Hash Function is Not Detected
+
+tdb_open_ex() allows the calling code to specify a different hash
+function to use, but does not check that all other processes
+accessing this tdb are using the same hash function. The result
+is that records are missing from tdb_fetch().
+
+2.4.1 Proposed Solution
+
+The header should contain an example hash result (eg. the hash of
+0xdeadbeef), and tdb_open_ex() should check that the given hash
+function produces the same answer, or fail the tdb_open call.
+
+2.4.2 Status
+
+Complete.
+
+2.5 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
+
+In response to scalability issues with the free list ([TDB-Freelist-Is]
+) two API workarounds have been incorporated in TDB:
+tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
+latter actually calls the former with an argument of “5”.
+
+This code allows deleted records to accumulate without putting
+them in the free list. On delete we iterate through each chain
+and free them in a batch if there are more than max_dead entries.
+These are never otherwise recycled except as a side-effect of a
+tdb_repack.
+
+2.5.1 Proposed Solution
+
+With the scalability problems of the freelist solved, this API
+can be removed. The TDB_VOLATILE flag may still be useful as a
+hint that store and delete of records will be at least as common
+as fetch in order to allow some internal tuning, but initially
+will become a no-op.
+
+2.5.2 Status
+
+Incomplete. TDB_VOLATILE still defined, but implementation should
+fail on unknown flags to be future-proof.
+
+2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
+ In The Same Process
+
+No process can open the same TDB twice; we check and disallow it.
+This is an unfortunate side-effect of fcntl locks, which operate
+on a per-file rather than per-file-descriptor basis, and do not
+nest. Thus, closing any file descriptor on a file clears all the
+locks obtained by this process, even if they were placed using a
+different file descriptor!
+
+Note that even if this were solved, deadlock could occur if
+operations were nested: this is a more manageable programming
+error in most cases.
+
+2.6.1 Proposed Solution
+
+We could lobby POSIX to fix the perverse rules, or at least lobby
+Linux to violate them so that the most common implementation does
+not have this restriction. This would be a generally good idea
+for other fcntl lock users.
+
+Samba uses a wrapper which hands out the same tdb_context to
+multiple callers if this happens, and does simple reference
+counting. We should do this inside the tdb library, which already
+emulates lock nesting internally; it would need to recognize when
+deadlock occurs within a single process. This would create a new
+failure mode for tdb operations (while we currently handle
+locking failures, they are impossible in normal use and a process
+encountering them can do little but give up).
+
+I do not see benefit in an additional tdb_open flag to indicate
+whether re-opening is allowed, although there may be some
+benefit to adding a call to detect when a tdb_context is shared,
+to allow others to create such an API.
+
+2.6.2 Status
+
+Incomplete.
+
+2.7 TDB API Is Not POSIX Thread-safe
+
+The TDB API uses an error code which can be queried after an
+operation to determine what went wrong. This programming model
+does not work with threads, unless specific additional guarantees
+are given by the implementation. In addition, even
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+).
+
+2.7.1 Proposed Solution
+
+Rearchitecting the API to include a tdb_errcode pointer would be a
+great deal of churn; we are better to guarantee that the
+tdb_errcode is per-thread so the current programming model can be
+maintained.
+
+This requires dynamic per-thread allocations, which is awkward
+with POSIX threads (pthread_key_create space is limited and we
+cannot simply allocate a key for every TDB).
+
+Internal locking is required to make sure that fcntl locks do not
+overlap between threads, and also that the global list of tdbs is
+maintained.
+
+The aim is that building tdb with -DTDB_PTHREAD will result in a
+pthread-safe version of the library, and otherwise no overhead
+will exist. Alternatively, a hooking mechanism similar to that
+proposed for [Proposed-Solution-locking-hook] could be used to
+enable pthread locking at runtime.
+
+2.7.2 Status
+
+Incomplete.
+
+2.8 *_nonblock Functions And *_mark Functions Expose
+ Implementation
+
+CTDB[footnote:
+Clustered TDB, see http://ctdb.samba.org
+] wishes to operate on TDB in a non-blocking manner. This is
+currently done as follows:
+
+1. Call the _nonblock variant of an API function (eg.
+ tdb_lockall_nonblock). If this fails:
+
+2. Fork a child process, and wait for it to call the normal
+ variant (eg. tdb_lockall).
+
+3. If the child succeeds, call the _mark variant to indicate we
+ already have the locks (eg. tdb_lockall_mark).
+
+4. Upon completion, tell the child to release the locks (eg.
+ tdb_unlockall).
+
+5. Indicate to tdb that it should consider the locks removed (eg.
+ tdb_unlockall_mark).
+
+There are several issues with this approach. Firstly, adding two
+new variants of each function clutters the API for an obscure
+use, and so not all functions have three variants. Secondly, it
+assumes that all paths of the functions ask for the same locks,
+otherwise the parent process will have to get a lock which the
+child doesn't have under some circumstances. I don't believe this
+is currently the case, but it constrains the implementation.
+
+2.8.1 <Proposed-Solution-locking-hook>Proposed Solution
+
+Implement a hook for locking methods, so that the caller can
+control the calls to create and remove fcntl locks. In this
+scenario, ctdbd would operate as follows:
+
+1. Call the normal API function, eg tdb_lockall().
+
+2. When the lock callback comes in, check if the child has the
+ lock (initially, this is always false). If it does, return 0.
+ Otherwise, try to obtain it in non-blocking mode. If that
+ fails, return EWOULDBLOCK.
+
+3. Release locks in the unlock callback as normal.
+
+4. If tdb_lockall() fails, see if we recorded a lock failure; if
+ so, call the child to repeat the operation.
+
+5. The child records what locks it obtains, and returns that
+ information to the parent.
+
+6. When the child has succeeded, goto 1.
+
+This is flexible enough to handle any potential locking scenario,
+even when lock requirements change. It can be optimized so that
+the parent does not release locks, just tells the child which
+locks it doesn't need to obtain.
+
+It also keeps the complexity out of the API, and in ctdbd where
+it is needed.
+
+2.8.2 Status
+
+Incomplete.
+
+2.9 tdb_chainlock Functions Expose Implementation
+
+tdb_chainlock locks some number of records, including the record
+indicated by the given key. This gave atomicity guarantees;
+no-one can start a transaction, alter, read or delete that key
+while the lock is held.
+
+It also makes the same guarantee for any other key in the chain,
+which is an internal implementation detail and potentially a
+cause for deadlock.
+
+2.9.1 Proposed Solution
+
+None. It would be nice to have an explicit single entry lock
+which affected no other keys. Unfortunately, this won't work for
+an entry which doesn't exist. Thus while chainlock may be
+implemented more efficiently for the existing case, it will still
+have overlap issues with the non-existing case. So it is best to
+keep the current (lack of) guarantee about which records will be
+affected to avoid constraining our implementation.
+
+2.10 Signal Handling is Not Race-Free
+
+The tdb_setalarm_sigptr() call allows the caller's signal handler
+to indicate that the tdb locking code should return with a
+failure, rather than trying again when a signal is received (and
+errno == EAGAIN). This is usually used to implement timeouts.
+
+Unfortunately, this does not work in the case where the signal is
+received before the tdb code enters the fcntl() call to place the
+lock: the code will sleep within the fcntl() code, unaware that
+the signal wants it to exit. In the case of long timeouts, this
+does not happen in practice.
+
+2.10.1 Proposed Solution
+
+The locking hooks proposed in [Proposed-Solution-locking-hook]
+would allow the user to decide on whether to fail the lock
+acquisition on a signal. This allows the caller to choose their
+own compromise: they could narrow the race by checking
+immediately before the fcntl call.[footnote:
+It may be possible to make this race-free in some implementations
+by having the signal handler alter the struct flock to make it
+invalid. This will cause the fcntl() lock call to fail with
+EINVAL if the signal occurs before the kernel is entered,
+otherwise EAGAIN.
+]
+
+2.10.2 Status
+
+Incomplete.
+
+2.11 The API Uses Gratuitous Typedefs, Capitals
+
+typedefs are useful for providing source compatibility when types
+can differ across implementations, or arguably in the case of
+function pointer definitions which are hard for humans to parse.
+Otherwise it is simply obfuscation and pollutes the namespace.
+
+Capitalization is usually reserved for compile-time constants and
+macros.
+
+ TDB_CONTEXT There is no reason to use this over 'struct
+ tdb_context'; the definition isn't visible to the API user
+ anyway.
+
+ TDB_DATA There is no reason to use this over struct TDB_DATA;
+ the struct needs to be understood by the API user.
+
+ struct TDB_DATA This would normally be called 'struct
+ tdb_data'.
+
+ enum TDB_ERROR Similarly, this would normally be enum
+ tdb_error.
+
+2.11.1 Proposed Solution
+
+None. Introducing lower case variants would please pedants like
+myself, but if it were done the existing ones should be kept.
+There is little point forcing a purely cosmetic change upon tdb
+users.
+
+2.12 <tdb_log_func-Doesnt-Take>tdb_log_func Doesn't Take The
+ Private Pointer
+
+For API compatibility reasons, the logging function needs to call
+tdb_get_logging_private() to retrieve the pointer registered by
+the tdb_open_ex for logging.
+
+2.12.1 Proposed Solution
+
+It should simply take an extra argument, since we are prepared to
+break the API/ABI.
+
+2.12.2 Status
+
+Complete.
+
+2.13 Various Callback Functions Are Not Typesafe
+
+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
+ is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
+and tdb_check all take void * and must internally convert it to
+the argument type they were expecting.
+
+If this type changes, the compiler will not produce warnings on
+the callers, since it only sees void *.
+
+2.13.1 Proposed Solution
+
+With careful use of macros, we can create callback functions
+which give a warning when used on gcc and the types of the
+callback and its private argument differ. Unsupported compilers
+will not give a warning, which is no worse than now. In addition,
+the callbacks become clearer, as they need not use void * for
+their parameter.
+
+See CCAN's typesafe_cb module at
+http://ccan.ozlabs.org/info/typesafe_cb.html
+
+2.13.2 Status
+
+Incomplete.
+
+2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
+ tdb_reopen_all Problematic
+
+The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB
+file should be cleared if the caller discovers it is the only
+process with the TDB open. However, if any caller does not
+specify TDB_CLEAR_IF_FIRST it will not be detected, so will have
+the TDB erased underneath them (usually resulting in a crash).
+
+There is a similar issue on fork(); if the parent exits (or
+otherwise closes the tdb) before the child calls tdb_reopen_all()
+to establish the lock used to indicate the TDB is opened by
+someone, a TDB_CLEAR_IF_FIRST opener at that moment will believe
+it alone has opened the TDB and will erase it.
+
+2.14.1 Proposed Solution
+
+Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+
+2.14.2 Status
+
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
+implemented.
+
+2.15 Extending The Header Is Difficult
+
+We have reserved (zeroed) words in the TDB header, which can be
+used for future features. If the future features are compulsory,
+the version number must be updated to prevent old code from
+accessing the database. But if the future feature is optional, we
+have no way of telling if older code is accessing the database or
+not.
+
+2.15.1 Proposed Solution
+
+The header should contain a “format variant” value (64-bit). This
+is divided into two 32-bit parts:
+
+1. The lower part reflects the format variant understood by code
+ accessing the database.
+
+2. The upper part reflects the format variant you must understand
+ to write to the database (otherwise you can only open for
+ reading).
+
+The latter field can only be written at creation time, the former
+should be written under the OPEN_LOCK when opening the database
+for writing, if the variant of the code is lower than the current
+lowest variant.
+
+This should allow backwards-compatible features to be added, and
+detection if older code (which doesn't understand the feature)
+writes to the database.
+
+2.15.2 Status
+
+Incomplete.
+
+2.16 Record Headers Are Not Expandable
+
+If we later want to add (say) checksums on keys and data, it
+would require another format change, which we'd like to avoid.
+
+2.16.1 Proposed Solution
+
+We often have extra padding at the tail of a record. If we ensure
+that the first byte (if any) of this padding is zero, we will
+have a way for future changes to detect code which doesn't
+understand a new format: the new code would write (say) a 1 at
+the tail, and thus if there is no tail or the first byte is 0, we
+would know the extension is not present on that record.
+
+2.16.2 Status
+
+Incomplete.
+
+2.17 TDB Does Not Use Talloc
+
+Many users of TDB (particularly Samba) use the talloc allocator,
+and thus have to wrap TDB in a talloc context to use it
+conveniently.
+
+2.17.1 Proposed Solution
+
+The allocation within TDB is not complicated enough to justify
+the use of talloc, and I am reluctant to force another
+(excellent) library on TDB users. Nonetheless a compromise is
+possible. An attribute (see [attributes]) can be added later to
+tdb_open() to provide an alternate allocation mechanism,
+specifically for talloc but usable by any other allocator (which
+would ignore the “context” argument).
+
+This would form a talloc hierarchy as expected, but the caller
+would still have to attach a destructor to the tdb context
+returned from tdb_open to close it. All TDB_DATA fields would be
+children of the tdb_context, and the caller would still have to
+manage them (using talloc_free() or talloc_steal()).
+
+2.17.2 Status
+
+Deferred.
+
+3 Performance And Scalability Issues
+
+3.1 <TDB_CLEAR_IF_FIRST-Imposes-Performance>TDB_CLEAR_IF_FIRST
+ Imposes Performance Penalty
+
+When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is
+placed at offset 4 (aka. the ACTIVE_LOCK). While these locks
+never conflict in normal tdb usage, they do add substantial
+overhead for most fcntl lock implementations when the kernel
+scans to detect if a lock conflict exists. This is often a single
+linked list, making the time to acquire and release a fcntl lock
+O(N) where N is the number of processes with the TDB open, not
+the number actually doing work.
+
+In a Samba server it is common to have huge numbers of clients
+sitting idle, and thus they have weaned themselves off the
+TDB_CLEAR_IF_FIRST flag.[footnote:
+There is a flag to tdb_reopen_all() which is used for this
+optimization: if the parent process will outlive the child, the
+child does not need the ACTIVE_LOCK. This is a workaround for
+this very performance issue.
+]
+
+3.1.1 Proposed Solution
+
+Remove the flag. It was a neat idea, but even trivial servers
+tend to know when they are initializing for the first time and
+can simply unlink the old tdb at that point.
+
+3.1.2 Status
+
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+
+3.2 TDB Files Have a 4G Limit
+
+This seems to be becoming an issue (so much for “trivial”!),
+particularly for ldb.
+
+3.2.1 Proposed Solution
+
+A new, incompatible TDB format which uses 64 bit offsets
+internally rather than 32 bit as now. For simplicity of endian
+conversion (which TDB does on the fly if required), all values
+will be 64 bit on disk. In practice, some upper bits may be used
+for other purposes, but at least 56 bits will be available for
+file offsets.
+
+tdb_open() will automatically detect the old version, and even
+create them if TDB_VERSION6 is specified to tdb_open.
+
+32 bit processes will still be able to access TDBs larger than 4G
+(assuming that their off_t allows them to seek to 64 bits), they
+will gracefully fall back as they fail to mmap. This can happen
+already with large TDBs.
+
+Old versions of tdb will fail to open the new TDB files (since 28
+August 2009, commit 398d0c29290: prior to that any unrecognized
+file format would be erased and initialized as a fresh tdb!)
+
+3.2.2 Status
+
+Complete.
+
+3.3 TDB Records Have a 4G Limit
+
+This has not been a reported problem, and the API uses size_t
+which can be 64 bit on 64 bit platforms. However, other limits
+may have made such an issue moot.
+
+3.3.1 Proposed Solution
+
+Record sizes will be 64 bit, with an error returned on 32 bit
+platforms which try to access such records (the current
+implementation would return TDB_ERR_OOM in a similar case). It
+seems unlikely that 32 bit keys will be a limitation, so the
+implementation may not support this (see [sub:Records-Incur-A]).
+
+3.3.2 Status
+
+Complete.
+
+3.4 Hash Size Is Determined At TDB Creation Time
+
+TDB contains a number of hash chains in the header; the number is
+specified at creation time, and defaults to 131. This is such a
+bottleneck on large databases (as each hash chain gets quite
+long), that LDB uses 10,000 for this hash. In general it is
+impossible to know what the 'right' answer is at database
+creation time.
+
+3.4.1 <sub:Hash-Size-Solution>Proposed Solution
+
+After comprehensive performance testing on various scalable hash
+variants[footnote:
+http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94
+This was annoying because I was previously convinced that an
+expanding tree of hashes would be very close to optimal.
+], it became clear that it is hard to beat a straight linear hash
+table which doubles in size when it reaches saturation.
+Unfortunately, altering the hash table introduces serious locking
+complications: the entire hash table needs to be locked to
+enlarge the hash table, and others might be holding locks.
+Particularly insidious are insertions done under tdb_chainlock.
+
+Thus an expanding layered hash will be used: an array of hash
+groups, with each hash group exploding into pointers to lower
+hash groups once it fills, turning into a hash tree. This has
+implications for locking: we must lock the entire group in case
+we need to expand it, yet we don't know how deep the tree is at
+that point.
+
+Note that bits from the hash table entries should be stolen to
+hold more hash bits to reduce the penalty of collisions. We can
+use the otherwise-unused lower 3 bits. If we limit the size of
+the database to 64 exabytes, we can use the top 8 bits of the
+hash entry as well. These 11 bits would reduce false positives
+down to 1 in 2000 which is more than we need: we can use one of
+the bits to indicate that the extra hash bits are valid. This
+means we can choose not to re-hash all entries when we expand a
+hash group; simply use the next bits we need and mark them
+invalid.
+
+3.4.2 Status
+
+Complete.
+
+3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
+
+TDB uses a single linked list for the free list. Allocation
+occurs as follows, using heuristics which have evolved over time:
+
+1. Get the free list lock for this whole operation.
+
+2. Multiply length by 1.25, so we always over-allocate by 25%.
+
+3. Set the slack multiplier to 1.
+
+4. Examine the current freelist entry: if it is > length but <
+ the current best case, remember it as the best case.
+
+5. Multiply the slack multiplier by 1.05.
+
+6. If our best fit so far is less than length * slack multiplier,
+ return it. The slack will be turned into a new free record if
+ it's large enough.
+
+7. Otherwise, go onto the next freelist entry.
+
+Deleting a record occurs as follows:
+
+1. Lock the hash chain for this whole operation.
+
+2. Walk the chain to find the record, keeping the prev pointer
+ offset.
+
+3. If max_dead is non-zero:
+
+ (a) Walk the hash chain again and count the dead records.
+
+ (b) If it's more than max_dead, bulk free all the dead ones
+ (similar to steps 4 and below, but the lock is only obtained
+ once).
+
+ (c) Simply mark this record as dead and return.
+
+4. Get the free list lock for the remainder of this operation.
+
+5. <right-merging>Examine the following block to see if it is
+ free; if so, enlarge the current block and remove that block
+ from the free list. This was disabled, as removal from the free
+ list was O(entries-in-free-list).
+
+6. Examine the preceding block to see if it is free: for this
+ reason, each block has a 32-bit tailer which indicates its
+ length. If it is free, expand it to cover our new block and
+ return.
+
+7. Otherwise, prepend ourselves to the free list.
+
+Disabling right-merging (step [right-merging]) causes
+fragmentation; the other heuristics proved insufficient to
+address this, so the final answer to this was that when we expand
+the TDB file inside a transaction commit, we repack the entire
+tdb.
+
+The single list lock limits our allocation rate; due to the other
+issues this is not currently seen as a bottleneck.
+
+3.5.1 Proposed Solution
+
+The first step is to remove all the current heuristics, as they
+obviously interact, then examine them once the lock contention is
+addressed.
+
+The free list must be split to reduce contention. Assuming
+perfect free merging, we can at most have 1 free list entry for
+each entry. This implies that the number of free lists is related
+to the size of the hash table, but as it is rare to walk a large
+number of free list entries we can use far fewer, say 1/32 of the
+number of hash buckets.
+
+It seems tempting to try to reuse the hash implementation which
+we use for records here, but we have two ways of searching for
+free entries: for allocation we search by size (and possibly
+zone) which produces too many clashes for our hash table to
+handle well, and for coalescing we search by address. Thus an
+array of doubly-linked free lists seems preferable.
+
+There are various benefits in using per-size free lists (see
+[sub:TDB-Becomes-Fragmented]) but it's not clear this would
+reduce contention in the common case where all processes are
+allocating/freeing the same size. Thus we almost certainly need
+to divide in other ways: the most obvious is to divide the file
+into zones, and use a free list (or table of free lists) for
+each. This approximates address ordering.
+
+Unfortunately it is difficult to know what heuristics should be
+used to determine zone sizes, and our transaction code relies on
+being able to create a “recovery area” by simply appending to the
+file (difficult if it would need to create a new zone header).
+Thus we use a linked-list of free tables; currently we only ever
+create one, but if there is more than one we choose one at random
+to use. In future we may use heuristics to add new free tables on
+contention. We only expand the file when all free tables are
+exhausted.
+
+The basic algorithm is as follows. Freeing is simple:
+
+1. Identify the correct free list.
+
+2. Lock the corresponding list.
+
+3. Re-check the list (we didn't have a lock, sizes could have
+ changed): relock if necessary.
+
+4. Place the freed entry in the list.
+
+Allocation is a little more complicated, as we perform delayed
+coalescing at this point:
+
+1. Pick a free table; usually the previous one.
+
+2. Lock the corresponding list.
+
+3. If the top entry is large enough, remove it from the list and
+ return it.
+
+4. Otherwise, coalesce entries in the list. If there was no entry
+ large enough, unlock the list and try the next largest list.
+
+5. If no list has an entry which meets our needs, try the next
+ free table.
+
+6. If no free table satisfies, expand the file.
+
+This optimizes rapid insert/delete of free list entries by not
+coalescing them all the time. First-fit address ordering
+seems to be fairly good for keeping fragmentation low
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+does not need a tailer to coalesce, though if we needed one we
+could have one cheaply: see [sub:Records-Incur-A].
+
+Each free entry has the free table number in the header: less
+than 255. It also contains a doubly-linked list for easy
+deletion.
+
+3.6 <sub:TDB-Becomes-Fragmented>TDB Becomes Fragmented
+
+Much of this is a result of allocation strategy[footnote:
+The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995
+ftp://ftp.cs.utexas.edu/pub/garbage/malloc/ismm98.ps
+] and deliberate hobbling of coalescing; internal fragmentation
+(aka overallocation) is deliberately set at 25%, and external
+fragmentation is only cured by the decision to repack the entire
+db when a transaction commit needs to enlarge the file.
+
+3.6.1 Proposed Solution
+
+The 25% overhead on allocation works in practice for ldb because
+indexes tend to expand by one record at a time. This internal
+fragmentation can be resolved by having an “expanded” bit in the
+header to note entries that have previously expanded, and
+allocating more space for them.
+
+There is a spectrum of possible solutions for external
+fragmentation: one is to use a fragmentation-avoiding allocation
+strategy such as best-fit address-order allocator. The other end
+of the spectrum would be to use a bump allocator (very fast and
+simple) and simply repack the file when we reach the end.
+
+There are three problems with efficient fragmentation-avoiding
+allocators: they are non-trivial, they tend to use a single free
+list for each size, and there's no evidence that tdb allocation
+patterns will match those recorded for general allocators (though
+it seems likely).
+
+Thus we don't spend too much effort on external fragmentation; we
+will be no worse than the current code if we need to repack on
+occasion. More effort is spent on reducing freelist contention,
+and reducing overhead.
+
+3.7 <sub:Records-Incur-A>Records Incur A 28-Byte Overhead
+
+Each TDB record has a header as follows:
+
+struct tdb_record {
+
+ tdb_off_t next; /* offset of the next record in the list
+*/
+
+ tdb_len_t rec_len; /* total byte length of record */
+
+ tdb_len_t key_len; /* byte length of key */
+
+ tdb_len_t data_len; /* byte length of data */
+
+ uint32_t full_hash; /* the full 32 bit hash of the key */
+
+ uint32_t magic; /* try to catch errors */
+
+ /* the following union is implied:
+
+ union {
+
+ char record[rec_len];
+
+ struct {
+
+ char key[key_len];
+
+ char data[data_len];
+
+ }
+
+ uint32_t totalsize; (tailer)
+
+ }
+
+ */
+
+};
+
+Naively, this would double to a 56-byte overhead on a 64 bit
+implementation.
+
+3.7.1 Proposed Solution
+
+We can use various techniques to reduce this for an allocated
+block:
+
+1. The 'next' pointer is not required, as we are using a flat
+ hash table.
+
+2. 'rec_len' can instead be expressed as an addition to key_len
+ and data_len (it accounts for wasted or overallocated length in
+ the record). Since the record length is always a multiple of 8,
+ we can conveniently fit it in 32 bits (representing up to 35
+ bits).
+
+3. 'key_len' and 'data_len' can be reduced. I'm unwilling to
+ restrict 'data_len' to 32 bits, but instead we can combine the
+ two into one 64-bit field and using a 5 bit value which
+ indicates at what bit to divide the two. Keys are unlikely to
+ scale as fast as data, so I'm assuming a maximum key size of 32
+ bits.
+
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+ this is diminishing returns after a handful of bits (at 10
+ bits, it reduces 99.9% of false memcmp). As an aside, as the
+ lower bits are already incorporated in the hash table
+ resolution, the upper bits should be used here. Note that it's
+ not clear that these bits will be a win, given the extra bits
+ in the hash table itself (see [sub:Hash-Size-Solution]).
+
+5. 'magic' does not need to be enlarged: it currently reflects
+ one of 5 values (used, free, dead, recovery, and
+ unused_recovery). It is useful for quick sanity checking
+ however, and should not be eliminated.
+
+6. 'tailer' is only used to coalesce free blocks (so a block to
+ the right can find the header to check if this block is free).
+ This can be replaced by a single 'free' bit in the header of
+ the following block (and the tailer only exists in free
+ blocks).[footnote:
+This technique from Thomas Standish. Data Structure Techniques.
+Addison-Wesley, Reading, Massachusetts, 1980.
+] The current proposed coalescing algorithm doesn't need this,
+ however.
+
+This produces a 16 byte used header like this:
+
+struct tdb_used_record {
+
+ uint32_t used_magic : 16,
+
+
+
+ key_data_divide: 5,
+
+ top_hash: 11;
+
+ uint32_t extra_octets;
+
+ uint64_t key_and_data_len;
+
+};
+
+And a free record like this:
+
+struct tdb_free_record {
+
+ uint64_t free_magic: 8,
+
+ prev : 56;
+
+
+
+ uint64_t free_table: 8,
+
+ total_length : 56;
+
+ uint64_t next;
+
+};
+
+Note that by limiting valid offsets to 56 bits, we can pack
+everything we need into 3 64-bit words, meaning our minimum
+record size is 8 bytes.
+
+3.7.2 Status
+
+Complete.
+
+3.8 Transaction Commit Requires 4 fdatasync
+
+The current transaction algorithm is:
+
+1. write_recovery_data();
+
+2. sync();
+
+3. write_recovery_header();
+
+4. sync();
+
+5. overwrite_with_new_data();
+
+6. sync();
+
+7. remove_recovery_header();
+
+8. sync();
+
+On current ext3, each sync flushes all data to disk, so the next
+3 syncs are relatively expensive. But this could become a
+performance bottleneck on other filesystems such as ext4.
+
+3.8.1 Proposed Solution
+
+Neil Brown points out that this is overzealous, and only one sync
+is needed:
+
+1. Bundle the recovery data, a transaction counter and a strong
+ checksum of the new data.
+
+2. Strong checksum that whole bundle.
+
+3. Store the bundle in the database.
+
+4. Overwrite the oldest of the two recovery pointers in the
+ header (identified using the transaction counter) with the
+ offset of this bundle.
+
+5. sync.
+
+6. Write the new data to the file.
+
+Checking for recovery means identifying the latest bundle with a
+valid checksum and using the new data checksum to ensure that it
+has been applied. This is more expensive than the current check,
+but need only be done at open. For running databases, a separate
+header field can be used to indicate a transaction in progress;
+we need only check for recovery if this is set.
+
+3.8.2 Status
+
+Deferred.
+
+3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
+
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database” (but see
+[replay-attribute]).
+
+But as a thought experiment, if we implemented transactions to
+only overwrite free entries (this is tricky: there must not be a
+header in each entry which indicates whether it is free, but use
+of presence in metadata elsewhere), and a pointer to the hash
+table, we could create an entirely new commit without destroying
+existing data. Then it would be easy to implement snapshots in a
+similar way.
+
+This would not allow arbitrary changes to the database, such as
+tdb_repack does, and would require more space (since we have to
+preserve the current and future entries at once). If we used hash
+trees rather than one big hash table, we might only have to
+rewrite some sections of the hash, too.
+
+We could then implement snapshots using a similar method, using
+multiple different hash tables/free tables.
+
+3.9.2 Status
+
+Deferred.
+
+3.10 Transactions Cannot Operate in Parallel
+
+This would be useless for ldb, as it hits the index records with
+just about every update. It would add significant complexity in
+resolving clashes, and cause all the transaction callers to write
+their code to loop in the case where the transactions spuriously
+failed.
+
+3.10.1 Proposed Solution
+
+None (but see [replay-attribute]). We could solve a small part of
+the problem by providing read-only transactions. These would
+allow one write transaction to begin, but it could not commit
+until all r/o transactions are done. This would require a new
+RO_TRANSACTION_LOCK, which would be upgraded on commit.
+
+3.10.2 Status
+
+Deferred.
+
+3.11 Default Hash Function Is Suboptimal
+
+The Knuth-inspired multiplicative hash used by tdb is fairly slow
+(especially if we expand it to 64 bits), and works best when the
+hash bucket size is a prime number (which also means a slow
+modulus). In addition, it is highly predictable which could
+potentially lead to a Denial of Service attack in some TDB uses.
+
+3.11.1 Proposed Solution
+
+The Jenkins lookup3 hash[footnote:
+http://burtleburtle.net/bob/c/lookup3.c
+] is a fast and superbly-mixing hash. It's used by the Linux
+kernel and almost everything else. This has the particular
+properties that it takes an initial seed, and produces two 32 bit
+hash numbers, which we can combine into a 64-bit hash.
+
+The seed should be created at tdb-creation time from some random
+source, and placed in the header. This is far from foolproof, but
+adds a little bit of protection against hash bombing.
+
+3.11.2 Status
+
+Complete.
+
+3.12 <Reliable-Traversal-Adds>Reliable Traversal Adds Complexity
+
+We lock a record during traversal iteration, and try to grab that
+lock in the delete code. If that grab on delete fails, we simply
+mark it deleted and continue onwards; traversal checks for this
+condition and does the delete when it moves off the record.
+
+If traversal terminates, the dead record may be left
+indefinitely.
+
+3.12.1 Proposed Solution
+
+Remove reliability guarantees; see [traverse-Proposed-Solution].
+
+3.12.2 Status
+
+Complete.
+
+3.13 Fcntl Locking Adds Overhead
+
+Placing a fcntl lock means a system call, as does removing one.
+This is actually one reason why transactions can be faster
+(everything is locked once at transaction start). In the
+uncontended case, this overhead can theoretically be eliminated.
+
+3.13.1 Proposed Solution
+
+None.
+
+We tried this before with spinlock support, in the early days of
+TDB, and it didn't make much difference except in manufactured
+benchmarks.
+
+We could use spinlocks (with futex kernel support under Linux),
+but it means that we lose automatic cleanup when a process dies
+with a lock. There is a method of auto-cleanup under Linux, but
+it's not supported by other operating systems. We could
+reintroduce a clear-if-first-style lock and sweep for dead
+futexes on open, but that wouldn't help the normal case of one
+concurrent opener dying. Increasingly elaborate repair schemes
+could be considered, but they require an ABI change (everyone
+must use them) anyway, so there's no need to do this at the same
+time as everything else.
+
+3.14 Some Transactions Don't Require Durability
+
+Volker points out that gencache uses a CLEAR_IF_FIRST tdb for
+normal (fast) usage, and occasionally empties the results into a
+transactional TDB. This kind of usage prioritizes performance
+over durability: as long as we are consistent, data can be lost.
+
+This would be more neatly implemented inside tdb: a “soft”
+transaction commit (ie. syncless) which meant that data may be
+reverted on a crash.
+
+3.14.1 Proposed Solution
+
+None.
+
+Unfortunately any transaction scheme which overwrites old data
+requires a sync before that overwrite to avoid the possibility of
+corruption.
+
+It seems possible to use a scheme similar to that described in
+[sub:TDB-Does-Not], where transactions are committed without
+overwriting existing data, and an array of top-level pointers
+were available in the header. If the transaction is “soft” then
+we would not need a sync at all: existing processes would pick up
+the new hash table and free list and work with that.
+
+At some later point, a sync would allow recovery of the old data
+into the free lists (perhaps when the array of top-level pointers
+filled). On crash, tdb_open() would examine the array of top
+levels, and apply the transactions until it encountered an
+invalid checksum.
+
+3.15 Tracing Is Fragile, Replay Is External
+
+The current TDB has compile-time-enabled tracing code, but it
+often breaks as it is not enabled by default. In a similar way,
+the ctdb code has an external wrapper which does replay tracing
+so it can coordinate cluster-wide transactions.
+
+3.15.1 Proposed Solution<replay-attribute>
+
+Tridge points out that an attribute can be later added to
+tdb_open (see [attributes]) to provide replay/trace hooks, which
+could become the basis for this and future parallel transactions
+and snapshot support.
+
+3.15.2 Status
+
+Deferred.
diff --git a/lib/tdb2/free.c b/lib/tdb2/free.c
new file mode 100644
index 00000000000..a770751dc02
--- /dev/null
+++ b/lib/tdb2/free.c
@@ -0,0 +1,968 @@
+ /*
+ Trivial Database 2: free list/block handling
+ Copyright (C) Rusty Russell 2010
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+#include <ccan/ilog/ilog.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+/* Local fls64() name for ccan's ilog64(); the result is returned unchanged. */
+static unsigned fls64(uint64_t val)
+{
+	unsigned result = ilog64(val);
+
+	return result;
+}
+
+/* Map a record's data length (header not counted) to the index of the
+ * free bucket that holds records of that size. */
+unsigned int size_to_bucket(tdb_len_t data_len)
+{
+	tdb_len_t excess;
+	unsigned int idx;
+
+	/* Records below the minimum data length cannot exist. */
+	assert(data_len >= TDB_MIN_DATA_LEN);
+
+	/* Bucketing is done on the amount above the minimum. */
+	excess = data_len - TDB_MIN_DATA_LEN;
+
+	if (excess > 64) {
+		/* Beyond the linear range, buckets grow by powers of 2. */
+		idx = fls64(excess) + 2;
+	} else {
+		/* Steps of 8: 0 -> bucket 0, 8 -> bucket 1 ... 64 -> bucket 8. */
+		idx = excess / 8;
+	}
+
+	/* Anything too large shares the final bucket. */
+	if (unlikely(idx >= TDB_FREE_BUCKETS))
+		return TDB_FREE_BUCKETS - 1;
+	return idx;
+}
+
+/* Read the free_table field of the database header: the offset of the
+ * first free table (or whatever tdb_read_off() reports on failure). */
+tdb_off_t first_ftable(struct tdb_context *tdb)
+{
+	const tdb_off_t field = offsetof(struct tdb_header, free_table);
+
+	return tdb_read_off(tdb, field);
+}
+
+/* Follow the 'next' link of the free table located at ftable. */
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable)
+{
+	const tdb_off_t link = ftable + offsetof(struct tdb_freetable, next);
+
+	return tdb_read_off(tdb, link);
+}
+
+/* Choose which free table this context will allocate from.
+ *
+ * Every table in the chain gets a random() draw and the table with the
+ * largest draw so far is remembered (a later table wins ties), leaving
+ * its offset in tdb->ftable_off and its index in tdb->ftable.
+ * Returns TDB_SUCCESS, or the error seen while walking the chain. */
+enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb)
+{
+	unsigned int draw, best_draw = 0, index;
+	tdb_off_t cur;
+
+	cur = first_ftable(tdb);
+	tdb->ftable_off = cur;
+	tdb->ftable = 0;
+
+	for (index = 0; cur != 0; index++) {
+		if (TDB_OFF_IS_ERR(cur)) {
+			return cur;
+		}
+
+		draw = random();
+		if (draw >= best_draw) {
+			/* New favourite. */
+			best_draw = draw;
+			tdb->ftable = index;
+			tdb->ftable_off = cur;
+		}
+
+		cur = next_ftable(tdb, cur);
+	}
+
+	return TDB_SUCCESS;
+}
+
+/* File offset of entry 'bucket' in the buckets[] array of the free table
+ * that starts at ftable_off. */
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket)
+{
+	const tdb_off_t array_start
+		= ftable_off + offsetof(struct tdb_freetable, buckets);
+
+	return array_start + bucket * sizeof(tdb_off_t);
+}
+
+/* Find a non-empty bucket in a free table, starting the search at 'bucket'.
+ * The search itself is tdb_find_nonzero_off() over TDB_FREE_BUCKETS entries
+ * beginning at bucket 0's offset; its result (a bucket number, a value past
+ * the last bucket, or a -ve error) is passed straight back.  No lock is
+ * held, so the answer is only a hint. */
+static tdb_off_t find_free_head(struct tdb_context *tdb,
+				tdb_off_t ftable_off,
+				tdb_off_t bucket)
+{
+	const tdb_off_t first_bucket = bucket_off(ftable_off, 0);
+
+	return tdb_find_nonzero_off(tdb, first_bucket, bucket,
+				    TDB_FREE_BUCKETS);
+}
+
+/* Debug-only consistency check of the free list headed at b_off; compiles
+ * to nothing unless CCAN_TDB2_DEBUG is defined.  Aborts if a record lacks
+ * the free magic, if a record's prev does not name its predecessor, or if
+ * the head's prev does not name the last record on the list. */
+static void check_list(struct tdb_context *tdb, tdb_off_t b_off)
+{
+#ifdef CCAN_TDB2_DEBUG
+	struct tdb_free_record rec;
+	tdb_off_t head, cur, last = 0;
+
+	head = (tdb_read_off(tdb, b_off) & TDB_OFF_MASK);
+
+	for (cur = head; cur != 0; cur = rec.next) {
+		tdb_read_convert(tdb, cur, &rec, sizeof(rec));
+		if (frec_magic(&rec) != TDB_FREE_MAGIC)
+			abort();
+		if (last && frec_prev(&rec) != last)
+			abort();
+		last = cur;
+	}
+
+	/* Head's prev must point at the tail we ended on. */
+	if (head) {
+		tdb_read_convert(tdb, head, &rec, sizeof(rec));
+		if (frec_prev(&rec) != last)
+			abort();
+	}
+#endif
+}
+
+/* Remove from free bucket.
+ *
+ * Unlinks the free record at r_off (whose contents the caller has already
+ * read into *r) from the list whose head pointer is stored at b_off.
+ *
+ * As used here, the list is terminated by next == 0 in the forward
+ * direction, while the head record's prev field names the tail; a lone
+ * record therefore has prev == itself.
+ *
+ * Returns TDB_SUCCESS, an error from the read/write helpers, or
+ * TDB_ERR_CORRUPT (logged) if the record looks like the head but the
+ * bucket does not point at it. */
+static enum TDB_ERROR remove_from_list(struct tdb_context *tdb,
+ tdb_off_t b_off, tdb_off_t r_off,
+ const struct tdb_free_record *r)
+{
+ tdb_off_t off, prev_next, head;
+ enum TDB_ERROR ecode;
+
+ /* Is this only element in list? Zero out bucket, and we're done. */
+ if (frec_prev(r) == r_off)
+ return tdb_write_off(tdb, b_off, 0);
+
+ /* off = &r->prev->next */
+ off = frec_prev(r) + offsetof(struct tdb_free_record, next);
+
+ /* Get prev->next */
+ prev_next = tdb_read_off(tdb, off);
+ if (TDB_OFF_IS_ERR(prev_next))
+ return prev_next;
+
+ /* If prev->next == 0, we were head: update bucket to point to next. */
+ if (prev_next == 0) {
+ /* We must preserve upper bits. */
+ head = tdb_read_off(tdb, b_off);
+ if (TDB_OFF_IS_ERR(head))
+ return head;
+
+ if ((head & TDB_OFF_MASK) != r_off) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+ "remove_from_list:"
+ " %llu head %llu on list %llu",
+ (long long)r_off,
+ (long long)head,
+ (long long)b_off);
+ }
+ head = ((head & ~TDB_OFF_MASK) | r->next); /* keep the bits above TDB_OFF_MASK, swap in the new head offset */
+ ecode = tdb_write_off(tdb, b_off, head);
+ if (ecode != TDB_SUCCESS)
+ return ecode;
+ } else {
+ /* r->prev->next = r->next */
+ ecode = tdb_write_off(tdb, off, r->next);
+ if (ecode != TDB_SUCCESS)
+ return ecode;
+ }
+
+ /* If we were the tail, off = &head->prev. */
+ if (r->next == 0) {
+ head = tdb_read_off(tdb, b_off); /* re-read: the bucket may have just been rewritten above */
+ if (TDB_OFF_IS_ERR(head))
+ return head;
+ head &= TDB_OFF_MASK;
+ off = head + offsetof(struct tdb_free_record, magic_and_prev);
+ } else {
+ /* off = &r->next->prev */
+ off = r->next + offsetof(struct tdb_free_record,
+ magic_and_prev);
+ }
+
+#ifdef CCAN_TDB2_DEBUG
+ /* *off == r */
+ if ((tdb_read_off(tdb, off) & TDB_OFF_MASK) != r_off) {
+ return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+ "remove_from_list:"
+ " %llu bad prev in list %llu",
+ (long long)r_off, (long long)b_off);
+ }
+#endif
+ /* r->next->prev = r->prev */
+ return tdb_write_off(tdb, off, r->magic_and_prev); /* r's whole magic_and_prev word is copied, magic bits included */
+}
+
+/* Enqueue in this free bucket: sets coalesce if we've added 128
+ * entries to it.
+ *
+ * Writes a free record of data length 'len' at 'off' and makes it the new
+ * head of the list whose head pointer lives at b_off.  The old head (if
+ * any) becomes new->next, and the new record inherits the old head's prev
+ * (the tail) so that head->prev keeps naming the tail.
+ *
+ * *coalesce is in/out: when true on entry, the enqueue counter kept in the
+ * bits of the bucket word above TDB_OFF_MASK is bumped, and *coalesce is
+ * left true only if that counter wrapped to zero.  When false on entry
+ * the counter is untouched and *coalesce stays false.
+ *
+ * Returns TDB_SUCCESS, an I/O error from the read/write helpers, or
+ * TDB_ERR_CORRUPT if the current head lacks the free magic. */
+static enum TDB_ERROR enqueue_in_free(struct tdb_context *tdb,
+				      tdb_off_t b_off,
+				      tdb_off_t off,
+				      tdb_len_t len,
+				      bool *coalesce)
+{
+	struct tdb_free_record new;
+	enum TDB_ERROR ecode;
+	tdb_off_t prev, head;
+	uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
+
+	head = tdb_read_off(tdb, b_off);
+	if (TDB_OFF_IS_ERR(head))
+		return head;
+
+	/* Only ftable_and_len is set here; next and magic_and_prev are
+	 * filled in below. */
+	new.ftable_and_len = ((uint64_t)tdb->ftable << (64 - TDB_OFF_UPPER_STEAL))
+		| len;
+
+	/* new->next = head. */
+	new.next = (head & TDB_OFF_MASK);
+
+	/* First element? Prev points to ourselves. */
+	if (!new.next) {
+		new.magic_and_prev = (magic | off);
+	} else {
+		/* new->prev = next->prev */
+		prev = tdb_read_off(tdb,
+				    new.next + offsetof(struct tdb_free_record,
+							magic_and_prev));
+		/* A failed read must be reported as such, not misdiagnosed
+		 * as a corrupt head by the magic check below. */
+		if (TDB_OFF_IS_ERR(prev))
+			return prev;
+		new.magic_and_prev = prev;
+		if (frec_magic(&new) != TDB_FREE_MAGIC) {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "enqueue_in_free: %llu bad head"
+					  " prev %llu",
+					  (long long)new.next,
+					  (long long)prev);
+		}
+		/* next->prev = new. */
+		ecode = tdb_write_off(tdb, new.next
+				      + offsetof(struct tdb_free_record,
+						 magic_and_prev),
+				      off | magic);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+
+#ifdef CCAN_TDB2_DEBUG
+		prev = tdb_read_off(tdb, frec_prev(&new)
+				    + offsetof(struct tdb_free_record, next));
+		if (prev != 0) {
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "enqueue_in_free:"
+					  " %llu bad tail next ptr %llu",
+					  (long long)frec_prev(&new)
+					  + offsetof(struct tdb_free_record,
+						     next),
+					  (long long)prev);
+		}
+#endif
+	}
+
+	/* Update enqueue count, but don't set high bit: see TDB_OFF_IS_ERR */
+	if (*coalesce)
+		head += (1ULL << (64 - TDB_OFF_UPPER_STEAL));
+	head &= ~(TDB_OFF_MASK | (1ULL << 63));
+	head |= off;
+
+	ecode = tdb_write_off(tdb, b_off, head);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* It's time to coalesce if counter wrapped. */
+	if (*coalesce)
+		*coalesce = ((head & ~TDB_OFF_MASK) == 0);
+
+	/* The record itself goes out last, after the list points at it. */
+	return tdb_write_convert(tdb, off, &new, sizeof(new));
+}
+
+/* Translate a free-table index into that table's file offset.  The table
+ * this context currently uses is answered from the cached value; any other
+ * index is found by walking the chain from the first table.  A read error
+ * met on the way stops the walk and is returned in place of an offset. */
+static tdb_off_t ftable_offset(struct tdb_context *tdb, unsigned int ftable)
+{
+	tdb_off_t cur;
+	unsigned int hops;
+
+	if (likely(tdb->ftable == ftable))
+		return tdb->ftable_off;
+
+	cur = first_ftable(tdb);
+	for (hops = ftable; hops != 0 && !TDB_OFF_IS_ERR(cur); hops--)
+		cur = next_ftable(tdb, cur);
+
+	return cur;
+}
+
+/* Note: we unlock the current bucket if fail (-ve), or coalesce (+ve) and
+ * need to blatt the *protect record (which is set to an error).
+ *
+ * Tries to merge the free record at 'off' (data length data_len, sitting
+ * on the list at b_off) with the free records that immediately follow it
+ * in the file.  Each following record is taken off its own list under a
+ * non-blocking bucket lock; the scan stops at the first record that is not
+ * free, is marked TDB_FTABLE_NONE, cannot be locked, or changed under us.
+ * If anything was absorbed, the record at 'off' is removed from b_off and
+ * re-added through add_free_record() with the combined length.
+ *
+ * *protect names a record the caller intends to look at next; it is
+ * overwritten with an error value if that record was consumed or if the
+ * bucket lock had to be dropped.
+ *
+ * Returns 0 if nothing adjacent could be merged, the new usable length on
+ * success, or an error code. */
+static tdb_len_t coalesce(struct tdb_context *tdb,
+ tdb_off_t off, tdb_off_t b_off,
+ tdb_len_t data_len,
+ tdb_off_t *protect)
+{
+ tdb_off_t end;
+ struct tdb_free_record rec;
+ enum TDB_ERROR ecode;
+
+ tdb->stats.alloc_coalesce_tried++;
+ end = off + sizeof(struct tdb_used_record) + data_len; /* offset just past this record: where a neighbour would start */
+
+ while (end < tdb->file->map_size) {
+ const struct tdb_free_record *r;
+ tdb_off_t nb_off;
+ unsigned ftable, bucket;
+
+ r = tdb_access_read(tdb, end, sizeof(*r), true);
+ if (TDB_PTR_IS_ERR(r)) {
+ ecode = TDB_PTR_ERR(r);
+ goto err;
+ }
+
+ if (frec_magic(r) != TDB_FREE_MAGIC
+ || frec_ftable(r) == TDB_FTABLE_NONE) {
+ tdb_access_release(tdb, r);
+ break;
+ }
+
+ ftable = frec_ftable(r);
+ bucket = size_to_bucket(frec_len(r));
+ nb_off = ftable_offset(tdb, ftable);
+ if (TDB_OFF_IS_ERR(nb_off)) {
+ tdb_access_release(tdb, r);
+ ecode = nb_off;
+ goto err;
+ }
+ nb_off = bucket_off(nb_off, bucket); /* from here nb_off is the neighbour's bucket, not its table */
+ tdb_access_release(tdb, r);
+
+ /* We may be violating lock order here, so best effort. */
+ if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT)
+ != TDB_SUCCESS) {
+ tdb->stats.alloc_coalesce_lockfail++;
+ break;
+ }
+
+ /* Now we have lock, re-check. */
+ ecode = tdb_read_convert(tdb, end, &rec, sizeof(rec));
+ if (ecode != TDB_SUCCESS) {
+ tdb_unlock_free_bucket(tdb, nb_off);
+ goto err;
+ }
+
+ if (unlikely(frec_magic(&rec) != TDB_FREE_MAGIC)) {
+ tdb->stats.alloc_coalesce_race++;
+ tdb_unlock_free_bucket(tdb, nb_off);
+ break;
+ }
+
+ if (unlikely(frec_ftable(&rec) != ftable)
+ || unlikely(size_to_bucket(frec_len(&rec)) != bucket)) {
+ tdb->stats.alloc_coalesce_race++;
+ tdb_unlock_free_bucket(tdb, nb_off);
+ break;
+ }
+
+ /* Did we just mess up a record you were hoping to use? */
+ if (end == *protect) {
+ tdb->stats.alloc_coalesce_iterate_clash++;
+ *protect = TDB_ERR_NOEXIST;
+ }
+
+ ecode = remove_from_list(tdb, nb_off, end, &rec);
+ check_list(tdb, nb_off);
+ if (ecode != TDB_SUCCESS) {
+ tdb_unlock_free_bucket(tdb, nb_off);
+ goto err;
+ }
+
+ end += sizeof(struct tdb_used_record) + frec_len(&rec); /* absorb the neighbour, header included */
+ tdb_unlock_free_bucket(tdb, nb_off);
+ tdb->stats.alloc_coalesce_num_merged++;
+ }
+
+ /* Didn't find any adjacent free? */
+ if (end == off + sizeof(struct tdb_used_record) + data_len)
+ return 0;
+
+ /* Before we expand, check this isn't one you wanted protected? */
+ if (off == *protect) {
+ *protect = TDB_ERR_EXISTS;
+ tdb->stats.alloc_coalesce_iterate_clash++;
+ }
+
+ /* OK, expand initial record */
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+ if (ecode != TDB_SUCCESS) {
+ goto err;
+ }
+
+ if (frec_len(&rec) != data_len) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+ "coalesce: expected data len %zu not %zu",
+ (size_t)data_len, (size_t)frec_len(&rec));
+ goto err;
+ }
+
+ ecode = remove_from_list(tdb, b_off, off, &rec);
+ check_list(tdb, b_off);
+ if (ecode != TDB_SUCCESS) {
+ goto err;
+ }
+
+ /* Try locking violation first. We don't allow coalesce recursion! */
+ ecode = add_free_record(tdb, off, end - off, TDB_LOCK_NOWAIT, false);
+ if (ecode != TDB_SUCCESS) {
+ /* Need to drop lock. Can't rely on anything stable. */
+ tdb->stats.alloc_coalesce_lockfail++;
+ *protect = TDB_ERR_CORRUPT;
+
+ /* We have to drop this to avoid deadlocks, so make sure record
+ * doesn't get coalesced by someone else! */
+ rec.ftable_and_len = (TDB_FTABLE_NONE
+ << (64 - TDB_OFF_UPPER_STEAL))
+ | (end - off - sizeof(struct tdb_used_record));
+ ecode = tdb_write_off(tdb,
+ off + offsetof(struct tdb_free_record,
+ ftable_and_len),
+ rec.ftable_and_len);
+ if (ecode != TDB_SUCCESS) {
+ goto err;
+ }
+
+ tdb_unlock_free_bucket(tdb, b_off);
+
+ ecode = add_free_record(tdb, off, end - off, TDB_LOCK_WAIT,
+ false);
+ if (ecode != TDB_SUCCESS) {
+ return ecode; /* b_off was already unlocked above, so do not go through err */
+ }
+ } else if (TDB_OFF_IS_ERR(*protect)) {
+ /* For simplicity, we always drop lock if they can't continue */
+ tdb_unlock_free_bucket(tdb, b_off);
+ }
+ tdb->stats.alloc_coalesce_succeeded++;
+
+ /* Return usable length. */
+ return end - off - sizeof(struct tdb_used_record);
+
+err:
+ /* To unify error paths, we *always* unlock bucket on error. */
+ tdb_unlock_free_bucket(tdb, b_off);
+ return ecode;
+}
+
+/* List is locked: we unlock it.
+ *
+ * Walks the free list at b_off from its head, calling coalesce() on up to
+ * 'limit' records; the allowance is topped up after each call according
+ * to the size coalesce() returned.  If the walk stops with records still
+ * ahead, the list is rotated so the record reached becomes the new head
+ * and the records already visited end up at the tail.
+ *
+ * ftable_off is not used by this function's body.
+ *
+ * Returns TDB_SUCCESS or an error code.  The bucket is unlocked on every
+ * return path (by coalesce() itself on the paths that return early). */
+static enum TDB_ERROR coalesce_list(struct tdb_context *tdb,
+ tdb_off_t ftable_off,
+ tdb_off_t b_off,
+ unsigned int limit)
+{
+ enum TDB_ERROR ecode;
+ tdb_off_t off;
+
+ off = tdb_read_off(tdb, b_off);
+ if (TDB_OFF_IS_ERR(off)) {
+ ecode = off;
+ goto unlock_err;
+ }
+ /* A little bit of paranoia: counter should be 0. */
+ off &= TDB_OFF_MASK;
+
+ while (off && limit--) {
+ struct tdb_free_record rec;
+ tdb_len_t coal;
+ tdb_off_t next;
+
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ next = rec.next; /* doubles as the 'protect' record handed to coalesce() */
+ coal = coalesce(tdb, off, b_off, frec_len(&rec), &next);
+ if (TDB_OFF_IS_ERR(coal)) {
+ /* This has already unlocked on error. */
+ return coal;
+ }
+ if (TDB_OFF_IS_ERR(next)) {
+ /* Coalescing had to unlock, so stop. */
+ return TDB_SUCCESS;
+ }
+ /* Keep going if we're doing well... */
+ limit += size_to_bucket(coal / 16 + TDB_MIN_DATA_LEN);
+ off = next;
+ }
+
+ /* Now, move those elements to the tail of the list so we get something
+ * else next time. */
+ if (off) {
+ struct tdb_free_record oldhrec, newhrec, oldtrec, newtrec;
+ tdb_off_t oldhoff, oldtoff, newtoff;
+
+ /* The record we were up to is the new head. */
+ ecode = tdb_read_convert(tdb, off, &newhrec, sizeof(newhrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ /* Get the new tail. */
+ newtoff = frec_prev(&newhrec);
+ ecode = tdb_read_convert(tdb, newtoff, &newtrec,
+ sizeof(newtrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ /* Get the old head. */
+ oldhoff = tdb_read_off(tdb, b_off); /* NOTE(review): used unmasked below; relies on the counter bits being 0 */
+ if (TDB_OFF_IS_ERR(oldhoff)) {
+ ecode = oldhoff;
+ goto unlock_err;
+ }
+
+ /* This could happen if they all coalesced away. */
+ if (oldhoff == off)
+ goto out;
+
+ ecode = tdb_read_convert(tdb, oldhoff, &oldhrec,
+ sizeof(oldhrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ /* Get the old tail. */
+ oldtoff = frec_prev(&oldhrec);
+ ecode = tdb_read_convert(tdb, oldtoff, &oldtrec,
+ sizeof(oldtrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ /* Old tail's next points to old head. */
+ oldtrec.next = oldhoff;
+
+ /* Old head's prev points to old tail. */
+ oldhrec.magic_and_prev
+ = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL))
+ | oldtoff;
+
+ /* New tail's next is 0. */
+ newtrec.next = 0;
+
+ /* Write out the modified versions. */
+ ecode = tdb_write_convert(tdb, oldtoff, &oldtrec,
+ sizeof(oldtrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ ecode = tdb_write_convert(tdb, oldhoff, &oldhrec,
+ sizeof(oldhrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ ecode = tdb_write_convert(tdb, newtoff, &newtrec,
+ sizeof(newtrec));
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+
+ /* And finally link in new head. */
+ ecode = tdb_write_off(tdb, b_off, off); /* the new head's prev already names newtoff, the new tail */
+ if (ecode != TDB_SUCCESS)
+ goto unlock_err;
+ }
+out:
+ tdb_unlock_free_bucket(tdb, b_off);
+ return TDB_SUCCESS;
+
+unlock_err:
+ tdb_unlock_free_bucket(tdb, b_off);
+ return ecode;
+}
+
+/* Put the region [off, off + len_with_header) onto the free list of the
+ * current free table.
+ *
+ * The bucket is chosen from the length minus the used-record header and is
+ * locked with 'waitflag' for the duration.  When 'coalesce' is set (the
+ * list must then not already be locked by the caller) and the enqueue
+ * reports that a coalescing pass is due, coalesce_list() is run; it
+ * releases the bucket lock itself.  Otherwise the lock is released here.
+ * Returns TDB_SUCCESS or the first error met. */
+enum TDB_ERROR add_free_record(struct tdb_context *tdb,
+			       tdb_off_t off, tdb_len_t len_with_header,
+			       enum tdb_lock_flags waitflag,
+			       bool coalesce)
+{
+	enum TDB_ERROR ecode;
+	tdb_len_t data_len;
+	tdb_off_t bucket;
+
+	assert(len_with_header >= sizeof(struct tdb_free_record));
+
+	data_len = len_with_header - sizeof(struct tdb_used_record);
+	bucket = bucket_off(tdb->ftable_off, size_to_bucket(data_len));
+
+	ecode = tdb_lock_free_bucket(tdb, bucket, waitflag);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	ecode = enqueue_in_free(tdb, bucket, off, data_len, &coalesce);
+	check_list(tdb, bucket);
+
+	if (ecode || !coalesce) {
+		/* No coalescing pass: the unlock is ours to do. */
+		tdb_unlock_free_bucket(tdb, bucket);
+		return ecode;
+	}
+
+	/* Coalescing unlocks free list. */
+	return coalesce_list(tdb, tdb->ftable_off, bucket, 2);
+}
+
+/* Space to reserve for a key plus its data: never less than
+ * TDB_MIN_DATA_LEN, rounded up to a multiple of sizeof(uint64_t). */
+static size_t adjust_size(size_t keylen, size_t datalen)
+{
+	const unsigned long long align_mask = sizeof(uint64_t) - 1ULL;
+	size_t total = keylen + datalen;
+
+	if (total < TDB_MIN_DATA_LEN)
+		total = TDB_MIN_DATA_LEN;
+
+	/* Round to next uint64_t boundary. */
+	return (total + align_mask) & ~align_mask;
+}
+
+/* How many bytes of a total_len-byte free record can be split off again
+ * once the key and data (plus 50% extra data room when want_extra is set)
+ * have been accommodated.  A remainder too small to hold a struct
+ * tdb_free_record is not worth splitting and yields 0. */
+static size_t record_leftover(size_t keylen, size_t datalen,
+			      bool want_extra, size_t total_len)
+{
+	size_t padded_data = datalen;
+	ssize_t spare;
+
+	if (want_extra)
+		padded_data += datalen / 2;
+
+	spare = total_len - adjust_size(keylen, padded_data);
+	if (spare >= (ssize_t)sizeof(struct tdb_free_record))
+		return spare;
+
+	return 0;
+}
+
+/* We need size bytes to put our key and data in.
+ *
+ * Locks one bucket of the free table at ftable_off and scans its list for
+ * the smallest record that fits adjust_size(keylen, datalen).  The scan
+ * stops early once the best candidate is within 'multiplier' times the
+ * wanted size; the multiplier starts at 1.0 (1.5 with want_extra) and
+ * grows 1% per record examined, so the search gets less fussy as it goes.
+ *
+ * A chosen record is unlinked, stamped with a used-record header built
+ * from magic/keylen/datalen/hashlow, and any worthwhile remainder is given
+ * back via add_free_record().
+ *
+ * Returns the record's offset, 0 if nothing in this bucket fits, or a -ve
+ * error.  The bucket is unlocked on every return path. */
+static tdb_off_t lock_and_alloc(struct tdb_context *tdb,
+				tdb_off_t ftable_off,
+				tdb_off_t bucket,
+				size_t keylen, size_t datalen,
+				bool want_extra,
+				unsigned magic,
+				unsigned hashlow)
+{
+	tdb_off_t off, b_off, best_off;
+	struct tdb_free_record best = { 0 };
+	double multiplier;
+	size_t size = adjust_size(keylen, datalen);
+	enum TDB_ERROR ecode;
+
+	tdb->stats.allocs++;
+	b_off = bucket_off(ftable_off, bucket);
+
+	/* FIXME: Try non-blocking wait first, to measure contention. */
+	/* Lock this bucket. */
+	ecode = tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Start with an impossibly large "best" so any fit beats it. */
+	best.ftable_and_len = -1ULL;
+	best_off = 0;
+
+	/* Get slack if we're after extra. */
+	if (want_extra)
+		multiplier = 1.5;
+	else
+		multiplier = 1.0;
+
+	/* Walk the list to see if any are large enough, getting less fussy
+	 * as we go. */
+	off = tdb_read_off(tdb, b_off);
+	if (TDB_OFF_IS_ERR(off)) {
+		ecode = off;
+		goto unlock_err;
+	}
+	off &= TDB_OFF_MASK;
+
+	while (off) {
+		const struct tdb_free_record *r;
+		tdb_off_t next;
+
+		r = tdb_access_read(tdb, off, sizeof(*r), true);
+		if (TDB_PTR_IS_ERR(r)) {
+			ecode = TDB_PTR_ERR(r);
+			goto unlock_err;
+		}
+
+		if (frec_magic(r) != TDB_FREE_MAGIC) {
+			ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					   "lock_and_alloc:"
+					   " %llu non-free 0x%llx",
+					   (long long)off,
+					   (long long)r->magic_and_prev);
+			tdb_access_release(tdb, r);
+			goto unlock_err;
+		}
+
+		/* Smallest record that still fits wins. */
+		if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
+			best_off = off;
+			best = *r;
+		}
+
+		/* Good enough for the current tolerance? Stop looking. */
+		if (frec_len(&best) <= size * multiplier && best_off) {
+			tdb_access_release(tdb, r);
+			break;
+		}
+
+		multiplier *= 1.01;
+
+		/* Fetch the link before giving up access to the record. */
+		next = r->next;
+		tdb_access_release(tdb, r);
+		off = next;
+	}
+
+	/* If we found anything at all, use it. */
+	if (best_off) {
+		struct tdb_used_record rec;
+		size_t leftover;
+
+		/* We're happy with this size: take it. */
+		ecode = remove_from_list(tdb, b_off, best_off, &best);
+		check_list(tdb, b_off);
+		if (ecode != TDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		leftover = record_leftover(keylen, datalen, want_extra,
+					   frec_len(&best));
+
+		assert(keylen + datalen + leftover <= frec_len(&best));
+		/* We need to mark non-free before we drop lock, otherwise
+		 * coalesce() could try to merge it! */
+		ecode = set_header(tdb, &rec, magic, keylen, datalen,
+				   frec_len(&best) - leftover, hashlow);
+		if (ecode != TDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		ecode = tdb_write_convert(tdb, best_off, &rec, sizeof(rec));
+		if (ecode != TDB_SUCCESS) {
+			goto unlock_err;
+		}
+
+		/* For futureproofing, we put a 0 in any unused space. */
+		if (rec_extra_padding(&rec)) {
+			ecode = tdb->methods->twrite(tdb, best_off + sizeof(rec)
+						     + keylen + datalen, "", 1);
+			if (ecode != TDB_SUCCESS) {
+				goto unlock_err;
+			}
+		}
+
+		/* Bucket of leftover will be <= current bucket, so nested
+		 * locking is allowed. */
+		if (leftover) {
+			tdb->stats.alloc_leftover++;
+			ecode = add_free_record(tdb,
+						best_off + sizeof(rec)
+						+ frec_len(&best) - leftover,
+						leftover, TDB_LOCK_WAIT, false);
+			if (ecode != TDB_SUCCESS) {
+				/* Report the failure in place of the offset. */
+				best_off = ecode;
+			}
+		}
+		tdb_unlock_free_bucket(tdb, b_off);
+
+		return best_off;
+	}
+
+	tdb_unlock_free_bucket(tdb, b_off);
+	return 0;
+
+unlock_err:
+	tdb_unlock_free_bucket(tdb, b_off);
+	return ecode;
+}
+
+/* Get a free block from current free list, or 0 if none, -ve on error.
+ *
+ * Starts with the context's current free table and the bucket matching
+ * the requested size (computed with 50% more data room when want_extra is
+ * set).  Every non-empty bucket from there upward is offered to
+ * lock_and_alloc().  If a table yields nothing, the next table in the
+ * chain is tried, wrapping round to the first, until the walk is back at
+ * the table it started from.  The table that finally satisfies the request
+ * is recorded in tdb->ftable_off / tdb->ftable for subsequent calls. */
+static tdb_off_t get_free(struct tdb_context *tdb,
+ size_t keylen, size_t datalen, bool want_extra,
+ unsigned magic, unsigned hashlow)
+{
+ tdb_off_t off, ftable_off;
+ tdb_off_t start_b, b, ftable;
+ bool wrapped = false;
+
+ /* If they are growing, add 50% to get to higher bucket. */
+ if (want_extra)
+ start_b = size_to_bucket(adjust_size(keylen,
+ datalen + datalen / 2));
+ else
+ start_b = size_to_bucket(adjust_size(keylen, datalen));
+
+ ftable_off = tdb->ftable_off;
+ ftable = tdb->ftable;
+ while (!wrapped || ftable_off != tdb->ftable_off) { /* stop once we have wrapped and come back to the starting table */
+ /* Start at exact size bucket, and search up... */
+ for (b = find_free_head(tdb, ftable_off, start_b);
+ b < TDB_FREE_BUCKETS;
+ b = find_free_head(tdb, ftable_off, b + 1)) {
+ /* Try getting one from list. */
+ off = lock_and_alloc(tdb, ftable_off,
+ b, keylen, datalen, want_extra,
+ magic, hashlow);
+ if (TDB_OFF_IS_ERR(off))
+ return off;
+ if (off != 0) {
+ if (b == start_b)
+ tdb->stats.alloc_bucket_exact++;
+ if (b == TDB_FREE_BUCKETS - 1)
+ tdb->stats.alloc_bucket_max++;
+ /* Worked? Stay using this list. */
+ tdb->ftable_off = ftable_off;
+ tdb->ftable = ftable;
+ return off;
+ }
+ /* Didn't work. Try next bucket. */
+ }
+
+ if (TDB_OFF_IS_ERR(b)) { /* the bucket search itself failed, as opposed to running off the end */
+ return b;
+ }
+
+ /* Hmm, try next table. */
+ ftable_off = next_ftable(tdb, ftable_off);
+ if (TDB_OFF_IS_ERR(ftable_off)) {
+ return ftable_off;
+ }
+ ftable++;
+
+ if (ftable_off == 0) {
+ wrapped = true;
+ ftable_off = first_ftable(tdb);
+ if (TDB_OFF_IS_ERR(ftable_off)) {
+ return ftable_off;
+ }
+ ftable = 0;
+ }
+ }
+
+ return 0;
+}
+
+/* Fill in a used-record header.
+ *
+ * magic_and_meta packs, from the low bits up: the bottom 11 bits of
+ * hashlow, the padding (actuallen minus key and data lengths) from bit 11,
+ * the key-bits count from bit 43, and the magic from bit 48.
+ * key_and_data_len holds keylen in the low 2*keybits bits with datalen
+ * above it.
+ *
+ * The header is then decoded again; if any field fails to round-trip
+ * (values too big to encode) TDB_ERR_IO is logged and returned, otherwise
+ * TDB_SUCCESS. */
+enum TDB_ERROR set_header(struct tdb_context *tdb,
+			  struct tdb_used_record *rec,
+			  unsigned magic, uint64_t keylen, uint64_t datalen,
+			  uint64_t actuallen, unsigned hashlow)
+{
+	const uint64_t padding = actuallen - (keylen + datalen);
+	const uint64_t keybits = (fls64(keylen) + 1) / 2;
+	bool roundtrips;
+
+	/* Use bottom bits of hash, so it's independent of hash table size. */
+	rec->magic_and_meta = (hashlow & ((1 << 11)-1))
+		| (padding << 11)
+		| (keybits << 43)
+		| ((uint64_t)magic << 48);
+	rec->key_and_data_len = (keylen | (datalen << (keybits*2)));
+
+	/* Encoding can fail on big values. */
+	roundtrips = rec_key_length(rec) == keylen
+		&& rec_data_length(rec) == datalen
+		&& rec_extra_padding(rec) == padding;
+	if (roundtrips)
+		return TDB_SUCCESS;
+
+	return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+			  "Could not encode k=%llu,d=%llu,a=%llu",
+			  (long long)keylen, (long long)datalen,
+			  (long long)actuallen);
+}
+
+/* Grow the database file so that a record of 'size' bytes can be placed.
+ *
+ * Unless TDB_NOLOCK is set, the caller must already hold the allrecord
+ * lock or a hash lock (TDB_ERR_LOCK otherwise).  The expand lock is held
+ * while the file grows; if another process has already enlarged the file
+ * this returns TDB_SUCCESS without doing anything so the caller can
+ * retry.  The new space is handed to the free list as one record after
+ * the expand lock has been dropped. */
+static enum TDB_ERROR tdb_expand(struct tdb_context *tdb, tdb_len_t size)
+{
+	uint64_t old_size, rec_size, map_size;
+	tdb_len_t wanted;
+	enum TDB_ERROR ecode;
+
+	/* Need to hold a hash lock to expand DB: transactions rely on it. */
+	if (!(tdb->flags & TDB_NOLOCK)
+	    && !tdb->file->allrecord_lock.count && !tdb_has_hash_locks(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_expand: must hold lock during expand");
+	}
+
+	/* Only one person can expand file at a time. */
+	ecode = tdb_lock_expand(tdb, F_WRLCK);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* Someone else may have expanded the file, so retry. */
+	old_size = tdb->file->map_size;
+	tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+	if (tdb->file->map_size != old_size) {
+		tdb_unlock_expand(tdb, F_WRLCK);
+		return TDB_SUCCESS;
+	}
+
+	/* Room for 100 records of this size, or just 2 when the record is
+	 * over 100KiB, so an oddball huge record does not balloon an
+	 * in-memory tdb. */
+	rec_size = (size > 100 * 1024) ? size * 2 : size * 100;
+
+	/* Proportional growth: 25% of the current size, or 10% once the
+	 * database exceeds 100MiB. */
+	map_size = (old_size > 100 * 1024 * 1024) ? old_size / 10
+						   : old_size / 4;
+
+	/* Take whichever of the two is larger. */
+	wanted = (map_size > rec_size) ? map_size : rec_size;
+
+	/* We need room for the record header too. */
+	wanted = adjust_size(0, sizeof(struct tdb_used_record) + wanted);
+
+	ecode = tdb->methods->expand_file(tdb, wanted);
+
+	/* We need to drop this lock before adding free record. */
+	tdb_unlock_expand(tdb, F_WRLCK);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	tdb->stats.expands++;
+	return add_free_record(tdb, old_size, wanted, TDB_LOCK_WAIT, true);
+}
+
+/* Allocate space for a record with the given key and data lengths.
+ *
+ * Running out of free space is not a failure here: the database is
+ * expanded and the search repeated for as long as get_free() reports
+ * "none".  Returns the new record's offset; an error from get_free() or
+ * tdb_expand() is returned in place of the offset. */
+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
+		uint64_t hash, unsigned magic, bool growing)
+{
+	tdb_off_t found;
+
+	/* We can't hold pointers during this: we could unmap! */
+	assert(!tdb->direct_access);
+
+	found = get_free(tdb, keylen, datalen, growing, magic, hash);
+	while (unlikely(found == 0)) {
+		enum TDB_ERROR ecode;
+
+		ecode = tdb_expand(tdb, adjust_size(keylen, datalen));
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+		found = get_free(tdb, keylen, datalen, growing, magic, hash);
+	}
+
+	return found;
+}
diff --git a/lib/tdb2/hash.c b/lib/tdb2/hash.c
new file mode 100644
index 00000000000..1359cfecd66
--- /dev/null
+++ b/lib/tdb2/hash.c
@@ -0,0 +1,881 @@
+ /*
+ Trivial Database 2: hash handling
+ Copyright (C) Rusty Russell 2010
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+
+/* Hash len bytes at ptr with the hash function, seed and private data
+ * configured on this context. */
+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len)
+{
+	uint64_t digest;
+
+	digest = tdb->hash_fn(ptr, len, tdb->hash_seed, tdb->hash_data);
+	return digest;
+}
+
+/* Hash the key of the used record stored at off.
+ * The header is read to learn the key length, then the key bytes that
+ * follow the header are hashed.  If either read fails, 0 is returned;
+ * callers cannot tell that apart from a genuine hash of 0 (FIXME). */
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off)
+{
+	const struct tdb_used_record *hdr;
+	const void *keyptr;
+	uint64_t keylen, result = 0;
+
+	hdr = tdb_access_read(tdb, off, sizeof(*hdr), true);
+	if (TDB_PTR_IS_ERR(hdr)) {
+		/* FIXME */
+		return 0;
+	}
+	keylen = rec_key_length(hdr);
+	tdb_access_release(tdb, hdr);
+
+	keyptr = tdb_access_read(tdb, off + sizeof(*hdr), keylen, false);
+	if (!TDB_PTR_IS_ERR(keyptr)) {
+		result = tdb_hash(tdb, keyptr, keylen);
+		tdb_access_release(tdb, keyptr);
+	}
+	return result;
+}
+
+/* Get bits from a value.
+ *
+ * Returns the num-bit field of val whose least significant bit sits at
+ * bit position 'start'.  num may be anything from 0 to 32 inclusive. */
+static uint32_t bits_from(uint64_t val, unsigned start, unsigned num)
+{
+	assert(num <= 32);
+	/* Build the mask in 64 bits: "1U << 32" is undefined behaviour, yet
+	 * num == 32 is explicitly permitted by the assert above. */
+	return (uint32_t)((val >> start) & ((1ULL << num) - 1));
+}
+
+/* Consume the next 'num' bits of the hash in h, working down from the top
+ * bit (taking from the top lets whole sections of the hash be locked with
+ * lock ranges).  h->hash_used is advanced by num. */
+static uint32_t use_bits(struct hash_info *h, unsigned num)
+{
+	unsigned start;
+
+	h->hash_used += num;
+	start = 64 - h->hash_used;
+
+	return bits_from(h->h, start, num);
+}
+
+/* Compare 'key' against the key stored in the record at off, whose header
+ * has already been read into *rec.  Returns true on an exact match, false
+ * on a length or content mismatch (each counted in tdb->stats), or a -ve
+ * error if the stored key cannot be read. */
+static tdb_bool_err key_matches(struct tdb_context *tdb,
+				const struct tdb_used_record *rec,
+				tdb_off_t off,
+				const struct tdb_data *key)
+{
+	const char *stored;
+	tdb_bool_err same;
+
+	/* Different lengths can never match: skip reading the key. */
+	if (rec_key_length(rec) != key->dsize) {
+		tdb->stats.compare_wrong_keylen++;
+		return false;
+	}
+
+	stored = tdb_access_read(tdb, off + sizeof(*rec), key->dsize, false);
+	if (TDB_PTR_IS_ERR(stored)) {
+		return TDB_PTR_ERR(stored);
+	}
+
+	same = (memcmp(stored, key->dptr, key->dsize) == 0);
+	if (!same)
+		tdb->stats.compare_wrong_keycmp++;
+
+	tdb_access_release(tdb, stored);
+	return same;
+}
+
+/* Decide whether hash-table entry 'val' refers to the record for 'key'.
+ *
+ * Cheap rejections come first, each with its own statistics counter: the
+ * home bucket stored in the entry, then the extra hash bits stored in the
+ * entry's upper bits, then (after reading the header into *rec) the low
+ * 11 hash bits kept in the record.  Only then is the key itself compared.
+ * Returns true, false, or a -ve error from reading the record. */
+static tdb_bool_err match(struct tdb_context *tdb,
+			  struct hash_info *h,
+			  const struct tdb_data *key,
+			  tdb_off_t val,
+			  struct tdb_used_record *rec)
+{
+	tdb_off_t rec_off;
+	enum TDB_ERROR ecode;
+	uint32_t entry_bits, hash_bits;
+
+	tdb->stats.compares++;
+
+	/* Desired bucket must match. */
+	if ((val & TDB_OFF_HASH_GROUP_MASK) != h->home_bucket) {
+		tdb->stats.compare_wrong_bucket++;
+		return false;
+	}
+
+	/* Top bits of offset == next bits of hash. */
+	entry_bits = bits_from(val, TDB_OFF_HASH_EXTRA_BIT,
+			       TDB_OFF_UPPER_STEAL_EXTRA);
+	hash_bits = bits_from(h->h,
+			      64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
+			      TDB_OFF_UPPER_STEAL_EXTRA);
+	if (entry_bits != hash_bits) {
+		tdb->stats.compare_wrong_offsetbits++;
+		return false;
+	}
+
+	rec_off = val & TDB_OFF_MASK;
+	ecode = tdb_read_convert(tdb, rec_off, rec, sizeof(*rec));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if ((h->h & ((1 << 11)-1)) != rec_hash(rec)) {
+		tdb->stats.compare_wrong_rechash++;
+		return false;
+	}
+
+	return key_matches(tdb, rec, rec_off, key);
+}
+
+/* Offset of a bucket within the hash group starting at group_start; the
+ * bucket number is taken modulo the group size. */
+static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned bucket)
+{
+	const unsigned slot = bucket % (1 << TDB_HASH_GROUP_BITS);
+
+	return group_start + slot * sizeof(tdb_off_t);
+}
+
+/* True if the hash entry has its subhash flag bit
+ * (TDB_OFF_UPPER_STEAL_SUBHASH_BIT) set. */
+bool is_subhash(tdb_off_t val)
+{
+	const tdb_off_t flag = (val >> TDB_OFF_UPPER_STEAL_SUBHASH_BIT) & 1;
+
+	return flag != 0;
+}
+
+/* Compute the hash-lock range covering one top-level group: stores the
+ * range length in *size and returns its start.  Both are derived from the
+ * number of hash bits left below the group-selecting bits.
+ * FIXME: Guess the depth, don't over-lock! */
+static tdb_off_t hlock_range(tdb_off_t group, tdb_off_t *size)
+{
+	const unsigned shift
+		= 64 - (TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
+
+	*size = 1ULL << shift;
+	return group << shift;
+}
+
+static tdb_off_t COLD find_in_chain(struct tdb_context *tdb,
+ struct tdb_data key,
+ tdb_off_t chain,
+ struct hash_info *h,
+ struct tdb_used_record *rec,
+ struct traverse_info *tinfo)
+{ /* Linear search for key through a linked chain of hash groups starting
+ * at 'chain'.  Every occupied slot is compared by full key.  On a hit
+ * the record's offset is returned with its header in *rec and h (and
+ * tinfo, if given) describing where it was found.  On a miss 0 is
+ * returned and h names the last empty slot seen in the final group
+ * read (bucket 0 if no empty slot was seen at all).  -ve on read
+ * error. */
+ tdb_off_t off, next;
+ enum TDB_ERROR ecode;
+
+ /* In case nothing is free, we set these to zero. */
+ h->home_bucket = h->found_bucket = 0;
+
+ for (off = chain; off; off = next) {
+ unsigned int i;
+
+ h->group_start = off;
+ ecode = tdb_read_convert(tdb, off, h->group, sizeof(h->group));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+ tdb_off_t recoff;
+ if (!h->group[i]) {
+ /* Remember this empty bucket. */
+ h->home_bucket = h->found_bucket = i;
+ continue;
+ }
+
+ /* We can insert extra bits via add_to_hash
+ * empty bucket logic. */
+ recoff = h->group[i] & TDB_OFF_MASK;
+ ecode = tdb_read_convert(tdb, recoff, rec,
+ sizeof(*rec));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ ecode = key_matches(tdb, rec, recoff, &key); /* ecode doubles as a tdb_bool_err here: <0 error, 1 match, 0 no match */
+ if (ecode < 0) {
+ return ecode;
+ }
+ if (ecode == 1) {
+ h->home_bucket = h->found_bucket = i;
+
+ if (tinfo) {
+ tinfo->levels[tinfo->num_levels]
+ .hashtable = off;
+ tinfo->levels[tinfo->num_levels]
+ .total_buckets
+ = 1 << TDB_HASH_GROUP_BITS;
+ tinfo->levels[tinfo->num_levels].entry
+ = i;
+ tinfo->num_levels++;
+ }
+ return recoff;
+ }
+ }
+ next = tdb_read_off(tdb, off
+ + offsetof(struct tdb_chain, next));
+ if (TDB_OFF_IS_ERR(next)) {
+ return next;
+ }
+ if (next)
+ next += sizeof(struct tdb_used_record); /* the link names a record; step over its header to reach the group */
+ }
+ return 0;
+}
+
+/* This is the core routine which searches the hashtable for an entry.
+ * On error, no locks are held and -ve is returned.
+ * Otherwise, hinfo is filled in (and the optional tinfo).
+ * If not found, the return value is 0.
+ * If found, the return value is the offset, and *rec is the record.
+ *
+ * The key is hashed and the top hash bits pick a group in the top-level
+ * table; the hash range covering that group is locked with ltype.  The
+ * search then descends through sub-hash tables, consuming further hash
+ * bits per level, for as long as the home bucket holds a subhash pointer.
+ * In the group finally reached, buckets are probed circularly from the
+ * home bucket until a match or an empty bucket is met.  If the loop runs
+ * past 64 bits of hash, the search falls back to find_in_chain(). */
+tdb_off_t find_and_lock(struct tdb_context *tdb,
+ struct tdb_data key,
+ int ltype,
+ struct hash_info *h,
+ struct tdb_used_record *rec,
+ struct traverse_info *tinfo)
+{
+ uint32_t i, group;
+ tdb_off_t hashtable;
+ enum TDB_ERROR ecode;
+
+ h->h = tdb_hash(tdb, key.dptr, key.dsize);
+ h->hash_used = 0;
+ group = use_bits(h, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
+ h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
+
+ h->hlock_start = hlock_range(group, &h->hlock_range);
+ ecode = tdb_lock_hashes(tdb, h->hlock_start, h->hlock_range, ltype,
+ TDB_LOCK_WAIT);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ hashtable = offsetof(struct tdb_header, hashtable);
+ if (tinfo) {
+ tinfo->toplevel_group = group;
+ tinfo->num_levels = 1;
+ tinfo->levels[0].entry = 0;
+ tinfo->levels[0].hashtable = hashtable
+ + (group << TDB_HASH_GROUP_BITS) * sizeof(tdb_off_t);
+ tinfo->levels[0].total_buckets = 1 << TDB_HASH_GROUP_BITS;
+ }
+
+ while (h->hash_used <= 64) {
+ /* Read in the hash group. */
+ h->group_start = hashtable
+ + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+
+ ecode = tdb_read_convert(tdb, h->group_start, &h->group,
+ sizeof(h->group));
+ if (ecode != TDB_SUCCESS) {
+ goto fail;
+ }
+
+ /* Pointer to another hash table? Go down... */
+ if (is_subhash(h->group[h->home_bucket])) {
+ hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
+ + sizeof(struct tdb_used_record); /* the table proper starts after the record header */
+ if (tinfo) {
+ /* When we come back, use *next* bucket */
+ tinfo->levels[tinfo->num_levels-1].entry
+ += h->home_bucket + 1;
+ }
+ group = use_bits(h, TDB_SUBLEVEL_HASH_BITS
+ - TDB_HASH_GROUP_BITS);
+ h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
+ if (tinfo) {
+ tinfo->levels[tinfo->num_levels].hashtable
+ = hashtable;
+ tinfo->levels[tinfo->num_levels].total_buckets
+ = 1 << TDB_SUBLEVEL_HASH_BITS;
+ tinfo->levels[tinfo->num_levels].entry
+ = group << TDB_HASH_GROUP_BITS;
+ tinfo->num_levels++;
+ }
+ continue;
+ }
+
+ /* It's in this group: search (until 0 or all searched) */
+ for (i = 0, h->found_bucket = h->home_bucket;
+ i < (1 << TDB_HASH_GROUP_BITS);
+ i++, h->found_bucket = ((h->found_bucket+1)
+ % (1 << TDB_HASH_GROUP_BITS))) {
+ tdb_bool_err berr;
+ if (is_subhash(h->group[h->found_bucket]))
+ continue;
+
+ if (!h->group[h->found_bucket])
+ break;
+
+ berr = match(tdb, h, &key, h->group[h->found_bucket],
+ rec);
+ if (berr < 0) {
+ ecode = berr;
+ goto fail;
+ }
+ if (berr) {
+ if (tinfo) {
+ tinfo->levels[tinfo->num_levels-1].entry
+ += h->found_bucket;
+ }
+ return h->group[h->found_bucket] & TDB_OFF_MASK;
+ }
+ }
+ /* Didn't find it: h indicates where it would go. */
+ return 0; /* the hash lock stays held on this path, as on a hit */
+ }
+
+ return find_in_chain(tdb, key, hashtable, h, rec, tinfo); /* NOTE(review): an error returned from here does not pass through 'fail', so the hash lock appears to stay held - confirm against callers */
+
+fail:
+ tdb_unlock_hashes(tdb, h->hlock_start, h->hlock_range, ltype);
+ return ecode;
+}
+
+/* Pick the bucket of a group under which a new subhash should be attached.
+ *
+ * A simple test expanding a hash to 2GB compared three strategies:
+ * 1) Expanding all the buckets at once,
+ * 2) Expanding the bucket we wanted to place the new entry into,
+ * 3) Expanding the most-populated bucket.
+ *
+ * Worst/average/best density measured during that process:
+ * 1) 3%/16%/30%
+ * 2) 4%/20%/38%
+ * 3) 6%/22%/41%
+ *
+ * Hence we return the home bucket shared by the most entries, counting
+ * the entry about to be added (whose home is new_bucket).
+ */
+static unsigned fullest_bucket(struct tdb_context *tdb,
+			       const tdb_off_t *group,
+			       unsigned new_bucket)
+{
+	unsigned tally[1 << TDB_HASH_GROUP_BITS] = { 0 };
+	unsigned int slot, winner;
+
+	/* The pending entry counts towards its own home bucket. */
+	tally[new_bucket]++;
+	winner = new_bucket;
+
+	for (slot = 0; slot < (1 << TDB_HASH_GROUP_BITS); slot++) {
+		unsigned home;
+
+		/* Subhash pointers carry no home-bucket information. */
+		if (!is_subhash(group[slot])) {
+			home = group[slot] & TDB_OFF_HASH_GROUP_MASK;
+			tally[home]++;
+			if (tally[home] > tally[winner])
+				winner = home;
+		}
+	}
+
+	return winner;
+}
+
+/* Store encoded in the first empty slot of group, probing linearly (with
+ * wrap-around) from bucket.  Returns false if every slot is occupied. */
+static bool put_into_group(tdb_off_t *group,
+			   unsigned bucket, tdb_off_t encoded)
+{
+	unsigned int probe;
+
+	for (probe = 0; probe < (1 << TDB_HASH_GROUP_BITS); probe++) {
+		tdb_off_t *slot;
+
+		slot = &group[(bucket + probe) % (1 << TDB_HASH_GROUP_BITS)];
+		if (*slot != 0)
+			continue;
+
+		*slot = encoded;
+		return true;
+	}
+	return false;
+}
+
+/* Like put_into_group(), but a full group is treated as fatal: abort(). */
+static void force_into_group(tdb_off_t *group,
+			     unsigned bucket, tdb_off_t encoded)
+{
+	bool placed = put_into_group(group, bucket, encoded);
+
+	if (placed)
+		return;
+	abort();
+}
+
+/* Build the value stored in a hash bucket: the record offset OR-ed with
+ * the home bucket number and with TDB_OFF_UPPER_STEAL_EXTRA further hash
+ * bits shifted up to TDB_OFF_HASH_EXTRA_BIT. */
+static tdb_off_t encode_offset(tdb_off_t new_off, struct hash_info *h)
+{
+	uint64_t extra;
+
+	/* The next not-yet-consumed bits of the hash. */
+	extra = bits_from(h->h,
+			  64 - h->hash_used - TDB_OFF_UPPER_STEAL_EXTRA,
+			  TDB_OFF_UPPER_STEAL_EXTRA);
+
+	return h->home_bucket
+		| new_off
+		| (extra << TDB_OFF_HASH_EXTRA_BIT);
+}
+
+/* Point the bucket located by the previous search (h->found_bucket of the
+ * group at h->group_start) at new_off instead. */
+enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
+			       struct hash_info *h,
+			       tdb_off_t new_off)
+{
+	tdb_off_t encoded = encode_offset(new_off, h);
+
+	return tdb_write_off(tdb,
+			     hbucket_off(h->group_start, h->found_bucket),
+			     encoded);
+}
+
+/* We slot in anywhere that's empty in the chain.
+ *
+ * subhash is the offset of a chain's *contents* (just past its
+ * struct tdb_used_record header).  If all 1 << TDB_HASH_GROUP_BITS slots
+ * are taken we move on to the next chain element, allocating, zeroing and
+ * linking a new one if there is none.
+ *
+ * Returns TDB_SUCCESS or a -ve error. */
+static enum TDB_ERROR COLD add_to_chain(struct tdb_context *tdb,
+					tdb_off_t subhash,
+					tdb_off_t new_off)
+{
+	tdb_off_t entry;
+	enum TDB_ERROR ecode;
+
+	entry = tdb_find_zero_off(tdb, subhash, 1<<TDB_HASH_GROUP_BITS);
+	if (TDB_OFF_IS_ERR(entry)) {
+		return entry;
+	}
+
+	if (entry == 1 << TDB_HASH_GROUP_BITS) {
+		tdb_off_t next;
+
+		next = tdb_read_off(tdb, subhash
+				    + offsetof(struct tdb_chain, next));
+		if (TDB_OFF_IS_ERR(next)) {
+			return next;
+		}
+
+		if (!next) {
+			next = alloc(tdb, 0, sizeof(struct tdb_chain), 0,
+				     TDB_CHAIN_MAGIC, false);
+			if (TDB_OFF_IS_ERR(next))
+				return next;
+			ecode = zero_out(tdb,
+					 next+sizeof(struct tdb_used_record),
+					 sizeof(struct tdb_chain));
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+			ecode = tdb_write_off(tdb, subhash
+					      + offsetof(struct tdb_chain,
+							 next),
+					      next);
+			if (ecode != TDB_SUCCESS) {
+				return ecode;
+			}
+		}
+		/* The link stores the offset of the record header (that is
+		 * what alloc() returned and what the readers of ->next
+		 * expect), so skip the header to reach the bucket array. */
+		return add_to_chain(tdb,
+				    next + sizeof(struct tdb_used_record),
+				    new_off);
+	}
+
+	return tdb_write_off(tdb, subhash + entry * sizeof(tdb_off_t),
+			     new_off);
+}
+
+/* Insert an existing bucket value into a freshly created subhash.
+ *
+ * subhash is the offset of the subhash contents, hash_used the number of
+ * hash bits consumed by the levels above it, and val the encoded bucket
+ * value being moved down (only its offset part is reused). */
+static enum TDB_ERROR add_to_subhash(struct tdb_context *tdb, tdb_off_t subhash,
+				     unsigned hash_used, tdb_off_t val)
+{
+	struct hash_info hinfo;
+	tdb_off_t *slots;
+	tdb_off_t rec_off = (val & TDB_OFF_MASK);
+	unsigned int group_idx;
+
+	hinfo.hash_used = hash_used;
+
+	/* Not enough hash bits left for another level: it is a chain. */
+	if (hash_used + TDB_SUBLEVEL_HASH_BITS > 64)
+		return add_to_chain(tdb, subhash, rec_off);
+
+	/* Re-hash the record to learn its group and home bucket here. */
+	hinfo.h = hash_record(tdb, rec_off);
+	group_idx = use_bits(&hinfo,
+			     TDB_SUBLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS);
+	hinfo.home_bucket = use_bits(&hinfo, TDB_HASH_GROUP_BITS);
+	hinfo.group_start = subhash
+		+ group_idx * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+
+	slots = tdb_access_write(tdb, hinfo.group_start,
+				 sizeof(*slots) << TDB_HASH_GROUP_BITS, true);
+	if (TDB_PTR_IS_ERR(slots))
+		return TDB_PTR_ERR(slots);
+
+	force_into_group(slots, hinfo.home_bucket,
+			 encode_offset(rec_off, &hinfo));
+	return tdb_access_commit(tdb, slots);
+}
+
+static enum TDB_ERROR expand_group(struct tdb_context *tdb, struct hash_info *h)
+{ /* Make room in the in-memory group h->group: allocate and zero a new
+ * sub-hashtable (or a chain once h->hash_used == 64), hang it off the
+ * bucket chosen by fullest_bucket(), push every entry whose home is that
+ * bucket down into it, and re-place the entries that were sitting away
+ * from their home bucket.  Only the in-memory copy h->group is updated
+ * here; this function does not write the group back itself.
+ * Returns TDB_SUCCESS or the first error encountered.
+ */
+ unsigned bucket, num_vals, i, magic;
+ size_t subsize;
+ tdb_off_t subhash;
+ tdb_off_t vals[1 << TDB_HASH_GROUP_BITS];
+ enum TDB_ERROR ecode;
+
+ /* Attach new empty subhash under fullest bucket. */
+ bucket = fullest_bucket(tdb, h->group, h->home_bucket);
+
+ if (h->hash_used == 64) { /* no hash bits left: allocate a chain */
+ tdb->stats.alloc_chain++;
+ subsize = sizeof(struct tdb_chain);
+ magic = TDB_CHAIN_MAGIC;
+ } else {
+ tdb->stats.alloc_subhash++;
+ subsize = (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS);
+ magic = TDB_HTABLE_MAGIC;
+ }
+
+ subhash = alloc(tdb, 0, subsize, 0, magic, false);
+ if (TDB_OFF_IS_ERR(subhash)) {
+ return subhash;
+ }
+
+ ecode = zero_out(tdb, subhash + sizeof(struct tdb_used_record),
+ subsize);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* Remove any which are destined for bucket or are in wrong place.
+ * They are parked in vals[] and re-inserted below. */
+ num_vals = 0;
+ for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+ unsigned home_bucket = h->group[i] & TDB_OFF_HASH_GROUP_MASK;
+ if (!h->group[i] || is_subhash(h->group[i]))
+ continue;
+ if (home_bucket == bucket || home_bucket != i) {
+ vals[num_vals++] = h->group[i];
+ h->group[i] = 0;
+ }
+ }
+ /* FIXME: This assert is valid, but we do this during unit test :( */
+ /* assert(num_vals); */
+
+ /* Overwrite expanded bucket with subhash pointer. */
+ h->group[bucket] = subhash | (1ULL << TDB_OFF_UPPER_STEAL_SUBHASH_BIT);
+
+ /* Point to actual contents of record. */
+ subhash += sizeof(struct tdb_used_record);
+
+ /* Put values back: those homed at the expanded bucket go down into
+ * the new subhash, the rest return to the group. */
+ for (i = 0; i < num_vals; i++) {
+ unsigned this_bucket = vals[i] & TDB_OFF_HASH_GROUP_MASK;
+
+ if (this_bucket == bucket) {
+ ecode = add_to_subhash(tdb, subhash, h->hash_used,
+ vals[i]);
+ if (ecode != TDB_SUCCESS)
+ return ecode;
+ } else {
+ /* There should be room to put this back. */
+ force_into_group(h->group, this_bucket, vals[i]);
+ }
+ }
+ return TDB_SUCCESS;
+}
+
+enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h)
+{ /* Clear the bucket h->found_bucket in h->group, then re-place the
+ * entries after it (up to the next empty bucket) that are not in their
+ * home bucket, so the run of occupied buckets has no hole in it.
+ * Finally the whole group is written back at h->group_start.
+ * Returns the result of that write.
+ */
+ unsigned int i, num_movers = 0;
+ tdb_off_t movers[1 << TDB_HASH_GROUP_BITS];
+
+ h->group[h->found_bucket] = 0;
+ for (i = 1; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+ unsigned this_bucket;
+
+ this_bucket = (h->found_bucket+i) % (1 << TDB_HASH_GROUP_BITS);
+ /* Empty bucket? We're done. */
+ if (!h->group[this_bucket])
+ break;
+
+ /* Ignore subhashes. */
+ if (is_subhash(h->group[this_bucket]))
+ continue;
+
+ /* If this one is not happy where it is, we'll move it. */
+ if ((h->group[this_bucket] & TDB_OFF_HASH_GROUP_MASK)
+ != this_bucket) {
+ movers[num_movers++] = h->group[this_bucket];
+ h->group[this_bucket] = 0;
+ }
+ }
+
+ /* Put back the ones we erased, probing from each one's home bucket. */
+ for (i = 0; i < num_movers; i++) {
+ force_into_group(h->group, movers[i] & TDB_OFF_HASH_GROUP_MASK,
+ movers[i]);
+ }
+
+ /* Now we write back the hash group */
+ return tdb_write_convert(tdb, h->group_start,
+ h->group, sizeof(h->group));
+}
+
+enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
+ tdb_off_t new_off)
+{ /* Insert new_off at the position described by h.
+ * If h->found_bucket is empty the entry goes there.  If more than 64
+ * hash bits are already used the entry goes into the chain at
+ * h->group_start.  Otherwise the group is expanded and, if our home
+ * bucket became a subhash, h is moved down one level before retrying.
+ * Returns TDB_SUCCESS or the first error encountered.
+ */
+ enum TDB_ERROR ecode;
+
+ /* We hit an empty bucket during search? That's where it goes. */
+ if (!h->group[h->found_bucket]) {
+ h->group[h->found_bucket] = encode_offset(new_off, h);
+ /* Write back the modified group. */
+ return tdb_write_convert(tdb, h->group_start,
+ h->group, sizeof(h->group));
+ }
+
+ if (h->hash_used > 64) /* past the last hash level: we are in a chain */
+ return add_to_chain(tdb, h->group_start, new_off);
+
+ /* We're full. Expand. */
+ ecode = expand_group(tdb, h);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ if (is_subhash(h->group[h->home_bucket])) {
+ /* We were expanded! */
+ tdb_off_t hashtable;
+ unsigned int gnum;
+
+ /* Write back the modified group. */
+ ecode = tdb_write_convert(tdb, h->group_start, h->group,
+ sizeof(h->group));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* Move hashinfo down a level: consume more hash bits and load
+ * the matching group of the subhash into h->group. */
+ hashtable = (h->group[h->home_bucket] & TDB_OFF_MASK)
+ + sizeof(struct tdb_used_record);
+ gnum = use_bits(h,TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS);
+ h->home_bucket = use_bits(h, TDB_HASH_GROUP_BITS);
+ h->group_start = hashtable
+ + gnum * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+ ecode = tdb_read_convert(tdb, h->group_start, &h->group,
+ sizeof(h->group));
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ }
+
+ /* Expanding the group must have made room if it didn't choose this
+ * bucket. */
+ if (put_into_group(h->group, h->home_bucket, encode_offset(new_off,h))){
+ return tdb_write_convert(tdb, h->group_start,
+ h->group, sizeof(h->group));
+ }
+
+ /* This can happen if all hashes in group (and us) dropped into same
+ * group in subhash.  Recurse to expand again at the new level. */
+ return add_to_hash(tdb, h, new_off);
+}
+
+/* Traverse support: returns offset of record, or 0 or -ve error.
+ *
+ * Resumes scanning at the position saved in tinfo (tinfo->levels[] up to
+ * tinfo->num_levels, plus tinfo->prev, the offset returned last time).
+ * Descends into subhashes and chains as they are met and climbs back up
+ * when a level is exhausted.  A return of 0 means the top level recorded
+ * in tinfo->levels[0] has nothing more to offer. */
+static tdb_off_t iterate_hash(struct tdb_context *tdb,
+ struct traverse_info *tinfo)
+{
+ tdb_off_t off, val, i;
+ struct traverse_level *tlevel;
+
+ tlevel = &tinfo->levels[tinfo->num_levels-1];
+
+again:
+ for (i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
+ tlevel->entry, tlevel->total_buckets);
+ i != tlevel->total_buckets;
+ i = tdb_find_nonzero_off(tdb, tlevel->hashtable,
+ i+1, tlevel->total_buckets)) {
+ if (TDB_OFF_IS_ERR(i)) {
+ return i;
+ }
+
+ val = tdb_read_off(tdb, tlevel->hashtable+sizeof(tdb_off_t)*i);
+ if (TDB_OFF_IS_ERR(val)) {
+ return val;
+ }
+
+ off = val & TDB_OFF_MASK;
+
+ /* This makes the delete-all-in-traverse case work
+ * (and simplifies our logic a little). */
+ if (off == tinfo->prev)
+ continue;
+
+ tlevel->entry = i;
+
+ if (!is_subhash(val)) {
+ /* Found one. */
+ tinfo->prev = off;
+ return off;
+ }
+
+ /* When we come back, we want the next one */
+ tlevel->entry++;
+ tinfo->num_levels++;
+ tlevel++;
+ tlevel->hashtable = off + sizeof(struct tdb_used_record);
+ tlevel->entry = 0;
+ /* Next level is a chain?  Chains hold one group's worth of
+ * buckets, subhashes a full sublevel table. */
+ if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1))
+ tlevel->total_buckets = (1 << TDB_HASH_GROUP_BITS);
+ else
+ tlevel->total_buckets = (1 << TDB_SUBLEVEL_HASH_BITS);
+ goto again;
+ }
+
+ /* Nothing there? */
+ if (tinfo->num_levels == 1)
+ return 0;
+
+ /* Handle chained entries: follow the chain's next link, if any. */
+ if (unlikely(tinfo->num_levels == TDB_MAX_LEVELS + 1)) {
+ tlevel->hashtable = tdb_read_off(tdb, tlevel->hashtable
+ + offsetof(struct tdb_chain,
+ next));
+ if (TDB_OFF_IS_ERR(tlevel->hashtable)) {
+ return tlevel->hashtable;
+ }
+ if (tlevel->hashtable) {
+ tlevel->hashtable += sizeof(struct tdb_used_record);
+ tlevel->entry = 0;
+ goto again;
+ }
+ }
+
+ /* Go back up and keep searching. */
+ tinfo->num_levels--;
+ tlevel--;
+ goto again;
+}
+
+/* Return success if we find something, TDB_ERR_NOEXIST if none.
+ *
+ * Advances the traversal described by tinfo one top-level group at a
+ * time, holding a read lock on that group's hash range while
+ * iterate_hash() runs.  On success kbuf->dsize is the key length and
+ * kbuf->dptr a buffer obtained from tdb_alloc_read().  If dlen is
+ * non-NULL, *dlen receives the data length and the buffer is read with
+ * key length + *dlen bytes starting just after the record header.
+ * The hash lock is released before returning, on every path.
+ *
+ * NOTE(review): hl_start/hl_range are computed by hand here, whereas
+ * find_and_lock() and chainlock() obtain theirs from hlock_range();
+ * confirm that both give the same range. */
+enum TDB_ERROR next_in_hash(struct tdb_context *tdb,
+ struct traverse_info *tinfo,
+ TDB_DATA *kbuf, size_t *dlen)
+{
+ const unsigned group_bits = TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS;
+ tdb_off_t hl_start, hl_range, off;
+ enum TDB_ERROR ecode;
+
+ while (tinfo->toplevel_group < (1 << group_bits)) {
+ hl_start = (tdb_off_t)tinfo->toplevel_group
+ << (64 - group_bits);
+ hl_range = 1ULL << group_bits;
+ ecode = tdb_lock_hashes(tdb, hl_start, hl_range, F_RDLCK,
+ TDB_LOCK_WAIT);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ off = iterate_hash(tdb, tinfo);
+ if (off) { /* either a record offset or a -ve error */
+ struct tdb_used_record rec;
+
+ if (TDB_OFF_IS_ERR(off)) {
+ ecode = off;
+ goto fail;
+ }
+
+ ecode = tdb_read_convert(tdb, off, &rec, sizeof(rec));
+ if (ecode != TDB_SUCCESS) {
+ goto fail;
+ }
+ if (rec_magic(&rec) != TDB_USED_MAGIC) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT,
+ TDB_LOG_ERROR,
+ "next_in_hash:"
+ " corrupt record at %llu",
+ (long long)off);
+ goto fail;
+ }
+
+ kbuf->dsize = rec_key_length(&rec);
+
+ /* They want data as well? */
+ if (dlen) {
+ *dlen = rec_data_length(&rec);
+ kbuf->dptr = tdb_alloc_read(tdb,
+ off + sizeof(rec),
+ kbuf->dsize
+ + *dlen);
+ } else {
+ kbuf->dptr = tdb_alloc_read(tdb,
+ off + sizeof(rec),
+ kbuf->dsize);
+ }
+ tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
+ if (TDB_PTR_IS_ERR(kbuf->dptr)) {
+ return TDB_PTR_ERR(kbuf->dptr);
+ }
+ return TDB_SUCCESS;
+ }
+
+ tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
+
+ /* This group is exhausted: step tinfo to the next top-level group. */
+ tinfo->toplevel_group++;
+ tinfo->levels[0].hashtable
+ += (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+ tinfo->levels[0].entry = 0;
+ }
+ return TDB_ERR_NOEXIST;
+
+fail:
+ tdb_unlock_hashes(tdb, hl_start, hl_range, F_RDLCK);
+ return ecode;
+
+}
+
+/* Start a traversal: reset tinfo to the first group of the top-level
+ * hashtable, then fetch the first entry via next_in_hash(). */
+enum TDB_ERROR first_in_hash(struct tdb_context *tdb,
+			     struct traverse_info *tinfo,
+			     TDB_DATA *kbuf, size_t *dlen)
+{
+	struct traverse_level *top = &tinfo->levels[0];
+
+	top->hashtable = offsetof(struct tdb_header, hashtable);
+	top->entry = 0;
+	top->total_buckets = (1 << TDB_HASH_GROUP_BITS);
+
+	tinfo->num_levels = 1;
+	tinfo->toplevel_group = 0;
+	tinfo->prev = 0;
+
+	return next_in_hash(tdb, tinfo, kbuf, dlen);
+}
+
+/* Lock the hash range of the top-level group that key hashes to.  Even if
+ * the entry isn't in this hash bucket, you'd have to lock this bucket to
+ * find it.  func is the caller's name, used for tracing. */
+static enum TDB_ERROR chainlock(struct tdb_context *tdb, const TDB_DATA *key,
+				int ltype, enum tdb_lock_flags waitflag,
+				const char *func)
+{
+	const unsigned int gbits
+		= TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
+	tdb_off_t lockstart, locksize;
+	enum TDB_ERROR ecode;
+	unsigned int group;
+	uint64_t hash;
+
+	/* The top gbits bits of the hash select the group. */
+	hash = tdb_hash(tdb, key->dptr, key->dsize);
+	group = bits_from(hash, 64 - gbits, gbits);
+	lockstart = hlock_range(group, &locksize);
+
+	ecode = tdb_lock_hashes(tdb, lockstart, locksize, ltype, waitflag);
+	tdb_trace_1rec(tdb, func, *key);
+	return ecode;
+}
+
+/* lock/unlock one hash chain. This is meant to be used to reduce
+   contention - it cannot guarantee how many records will be locked.
+   Takes a write lock, waiting if necessary; the result is also stored
+   in tdb->last_error. */
+enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
+{
+	tdb->last_error = chainlock(tdb, &key, F_WRLCK, TDB_LOCK_WAIT,
+				    "tdb_chainlock");
+	return tdb->last_error;
+}
+
+/* Release the write lock on the hash range that key maps to. */
+void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
+{
+	const unsigned int gbits
+		= TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
+	tdb_off_t lockstart, locksize;
+	unsigned int group;
+	uint64_t hash;
+
+	/* Recompute the range exactly as chainlock() derived it. */
+	hash = tdb_hash(tdb, key.dptr, key.dsize);
+	group = bits_from(hash, 64 - gbits, gbits);
+	lockstart = hlock_range(group, &locksize);
+
+	tdb_trace_1rec(tdb, "tdb_chainunlock", key);
+	tdb_unlock_hashes(tdb, lockstart, locksize, F_WRLCK);
+}
+
+/* Read-lock variant of tdb_chainlock(): waits for a read lock on the hash
+ * range of key and records the result in tdb->last_error. */
+enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
+	tdb->last_error = chainlock(tdb, &key, F_RDLCK, TDB_LOCK_WAIT,
+				    "tdb_chainlock_read");
+	return tdb->last_error;
+}
+
+/* Release the read lock on the hash range that key maps to. */
+void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
+	const unsigned int gbits
+		= TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS;
+	tdb_off_t lockstart, locksize;
+	unsigned int group;
+	uint64_t hash;
+
+	/* Recompute the range exactly as chainlock() derived it. */
+	hash = tdb_hash(tdb, key.dptr, key.dsize);
+	group = bits_from(hash, 64 - gbits, gbits);
+	lockstart = hlock_range(group, &locksize);
+
+	tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
+	tdb_unlock_hashes(tdb, lockstart, locksize, F_RDLCK);
+}
diff --git a/lib/tdb2/io.c b/lib/tdb2/io.c
new file mode 100644
index 00000000000..8c5f45f3082
--- /dev/null
+++ b/lib/tdb2/io.c
@@ -0,0 +1,615 @@
+ /*
+ Unix SMB/CIFS implementation.
+
+ trivial database library
+
+ Copyright (C) Andrew Tridgell 1999-2005
+ Copyright (C) Paul `Rusty' Russell 2000
+ Copyright (C) Jeremy Allison 2000-2003
+ Copyright (C) Rusty Russell 2010
+
+ ** NOTE! The following LGPL license applies to the tdb
+ ** library. This does NOT imply that all of Samba is released
+ ** under the LGPL
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+#include <ccan/likely/likely.h>
+
+/* Drop the memory mapping of file, if there is one.  Does nothing when
+ * file->fd is -1 or when nothing is mapped. */
+void tdb_munmap(struct tdb_file *file)
+{
+	if (file->fd == -1 || !file->map_ptr)
+		return;
+
+	munmap(file->map_ptr, file->map_size);
+	file->map_ptr = NULL;
+}
+
+/* Map the whole database file (map_size bytes) with tdb->mmap_flags.
+ * Skipped for TDB_INTERNAL and TDB_NOMMAP databases.  On failure map_ptr
+ * is left NULL and a warning is logged. */
+void tdb_mmap(struct tdb_context *tdb)
+{
+	void *map = MAP_FAILED;
+
+	if (tdb->flags & TDB_INTERNAL)
+		return;
+	if (tdb->flags & TDB_NOMMAP)
+		return;
+
+	/* size_t can be smaller than off_t: only try when it fits. */
+	if ((size_t)tdb->file->map_size == tdb->file->map_size) {
+		map = mmap(NULL, tdb->file->map_size,
+			   tdb->mmap_flags,
+			   MAP_SHARED, tdb->file->fd, 0);
+	}
+
+	/*
+	 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
+	 */
+	if (map != MAP_FAILED) {
+		tdb->file->map_ptr = map;
+		return;
+	}
+
+	tdb->file->map_ptr = NULL;
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+		   "tdb_mmap failed for size %lld (%s)",
+		   (long long)tdb->file->map_size, strerror(errno));
+}
+
+/* check for an out of bounds access - if it is out of bounds then
+   see if the database has been expanded by someone else and expand
+   if necessary
+   note that "len" is the minimum length needed for the db
+
+   If "probe" is true, an out-of-bounds result is returned without being
+   logged.  Returns TDB_SUCCESS when len lies within the (possibly
+   remapped) file, TDB_ERR_IO when it does not, or a locking error.
+*/
+static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
+			      bool probe)
+{
+	struct stat st;
+	enum TDB_ERROR ecode;
+
+	/* We can't hold pointers during this: we could unmap! */
+	assert(!tdb->direct_access
+	       || (tdb->flags & TDB_NOLOCK)
+	       || tdb_has_expansion_lock(tdb));
+
+	if (len <= tdb->file->map_size)
+		return TDB_SUCCESS;
+	if (tdb->flags & TDB_INTERNAL) {
+		if (!probe) {
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				 "tdb_oob len %lld beyond internal"
+				 " malloc size %lld",
+				 (long long)len,
+				 (long long)tdb->file->map_size);
+		}
+		return TDB_ERR_IO;
+	}
+
+	/* Hold the expansion lock while we look at the file size. */
+	ecode = tdb_lock_expand(tdb, F_RDLCK);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (fstat(tdb->file->fd, &st) != 0) {
+		tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+			   "Failed to fstat file: %s", strerror(errno));
+		tdb_unlock_expand(tdb, F_RDLCK);
+		return TDB_ERR_IO;
+	}
+
+	tdb_unlock_expand(tdb, F_RDLCK);
+
+	/* Compare at the offset type's width: casting len to size_t would
+	 * truncate it where size_t is narrower than tdb_off_t. */
+	if ((tdb_off_t)st.st_size < len) {
+		if (!probe) {
+			/* st_size is an off_t, not a size_t, so %zu would
+			 * be the wrong conversion for it. */
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_oob len %lld beyond eof at %lld",
+				   (long long)len, (long long)st.st_size);
+		}
+		return TDB_ERR_IO;
+	}
+
+	/* Unmap, update size, remap */
+	tdb_munmap(tdb->file);
+
+	tdb->file->map_size = st.st_size;
+	tdb_mmap(tdb);
+	return TDB_SUCCESS;
+}
+
+/* Endian conversion: we only ever deal with 8 byte quantities.
+ * Byte-swaps size bytes at buf in place when the database has TDB_CONVERT
+ * set and buf is non-NULL; returns buf either way. */
+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
+{
+	uint64_t *words = (uint64_t *)buf;
+	uint64_t n;
+
+	assert(size % 8 == 0);
+
+	if (likely(!(tdb->flags & TDB_CONVERT)) || !buf)
+		return buf;
+
+	for (n = 0; n < size / 8; n++)
+		words[n] = bswap_64(words[n]);
+	return buf;
+}
+
+/* Scan entries [start, end) of the offset array at base and return the
+ * index of the first non-zero one, or end if all are zero, or -ve error. */
+/* FIXME: Return the off? */
+uint64_t tdb_find_nonzero_off(struct tdb_context *tdb,
+			      tdb_off_t base, uint64_t start, uint64_t end)
+{
+	const uint64_t *slots;
+	uint64_t idx = 0;
+
+	/* Zero vs non-zero is the same unconverted: minor optimization. */
+	slots = tdb_access_read(tdb, base + start * sizeof(tdb_off_t),
+				(end - start) * sizeof(tdb_off_t), false);
+	if (TDB_PTR_IS_ERR(slots))
+		return TDB_PTR_ERR(slots);
+
+	while (idx < (end - start) && !slots[idx])
+		idx++;
+
+	tdb_access_release(tdb, slots);
+	return start + idx;
+}
+
+/* Scan the num-entry offset array at off and return the index of the
+ * first zero entry, or num if none is zero, or -ve error. */
+uint64_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
+			   uint64_t num)
+{
+	const uint64_t *slots;
+	uint64_t idx = 0;
+
+	/* Zero vs non-zero is the same unconverted: minor optimization. */
+	slots = tdb_access_read(tdb, off, num * sizeof(tdb_off_t), false);
+	if (TDB_PTR_IS_ERR(slots))
+		return TDB_PTR_ERR(slots);
+
+	while (idx < num && slots[idx])
+		idx++;
+
+	tdb_access_release(tdb, slots);
+	return idx;
+}
+
+/* Fill len bytes of the database starting at off with zeroes: through a
+ * direct pointer when the methods provide one, otherwise by writing a
+ * zero buffer in pieces. */
+enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len)
+{
+	char zeroes[8192] = { 0 };
+	void *direct = tdb->methods->direct(tdb, off, len, true);
+	enum TDB_ERROR ecode;
+
+	assert(!tdb->read_only);
+	if (TDB_PTR_IS_ERR(direct))
+		return TDB_PTR_ERR(direct);
+
+	if (direct) {
+		memset(direct, 0, len);
+		return TDB_SUCCESS;
+	}
+
+	/* No direct access: push the zeroes out a buffer at a time. */
+	for (; len; ) {
+		unsigned chunk = len < sizeof(zeroes) ? len : sizeof(zeroes);
+
+		ecode = tdb->methods->twrite(tdb, off, zeroes, chunk);
+		if (ecode != TDB_SUCCESS)
+			return ecode;
+		off += chunk;
+		len -= chunk;
+	}
+	return TDB_SUCCESS;
+}
+
+/* Read one tdb_off_t at off.  Uses a direct pointer when no conversion is
+ * needed and one is available, else falls back to tdb_read_convert().
+ * Returns the value, or a -ve error encoded as an offset. */
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off)
+{
+	enum TDB_ERROR ecode;
+	tdb_off_t value;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		tdb_off_t *direct;
+
+		direct = tdb->methods->direct(tdb, off, sizeof(*direct),
+					      false);
+		if (TDB_PTR_IS_ERR(direct))
+			return TDB_PTR_ERR(direct);
+		if (direct)
+			return *direct;
+	}
+
+	ecode = tdb_read_convert(tdb, off, &value, sizeof(value));
+	return (ecode != TDB_SUCCESS) ? ecode : value;
+}
+
+/* Write len bytes from buf at offset off: by memcpy into the mapping when
+ * the file is mapped, else with pwrite().  Refuses read-only databases
+ * and bounds-checks through the methods' oob hook first. */
+static enum TDB_ERROR tdb_write(struct tdb_context *tdb, tdb_off_t off,
+				const void *buf, tdb_len_t len)
+{
+	enum TDB_ERROR ecode;
+	ssize_t written;
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Write to read-only database");
+	}
+
+	ecode = tdb->methods->oob(tdb, off + len, 0);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	if (tdb->file->map_ptr) {
+		memcpy(off + (char *)tdb->file->map_ptr, buf, len);
+		return TDB_SUCCESS;
+	}
+
+	written = pwrite(tdb->file->fd, buf, len, off);
+	if (written == len)
+		return TDB_SUCCESS;
+
+	/* This shouldn't happen: we avoid sparse files. */
+	if (written >= 0)
+		errno = ENOSPC;
+
+	return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+			  "tdb_write: %zi at %zu len=%zu (%s)",
+			  written, (size_t)off, (size_t)len,
+			  strerror(errno));
+}
+
+/* Read len bytes at offset off into buf: by memcpy from the mapping when
+ * the file is mapped, else with pread().  The range is bounds-checked
+ * through the methods' oob hook first. */
+static enum TDB_ERROR tdb_read(struct tdb_context *tdb, tdb_off_t off,
+			       void *buf, tdb_len_t len)
+{
+	enum TDB_ERROR ecode;
+	ssize_t got;
+
+	ecode = tdb->methods->oob(tdb, off + len, 0);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	if (tdb->file->map_ptr) {
+		memcpy(buf, off + (char *)tdb->file->map_ptr, len);
+		return TDB_SUCCESS;
+	}
+
+	got = pread(tdb->file->fd, buf, len, off);
+	if (got == len)
+		return TDB_SUCCESS;
+
+	return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+			  "tdb_read failed with %zi at %zu "
+			  "len=%zu (%s) map_size=%zu",
+			  got, (size_t)off, (size_t)len,
+			  strerror(errno),
+			  (size_t)tdb->file->map_size);
+}
+
+/* Write len bytes of rec at off through the methods table.  For
+ * TDB_CONVERT databases a byte-swapped temporary copy is written so that
+ * the caller's buffer stays untouched. */
+enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+				 const void *rec, size_t len)
+{
+	enum TDB_ERROR ecode;
+	void *swapped;
+
+	if (likely(!(tdb->flags & TDB_CONVERT)))
+		return tdb->methods->twrite(tdb, off, rec, len);
+
+	swapped = malloc(len);
+	if (!swapped) {
+		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				  "tdb_write: no memory converting"
+				  " %zu bytes", len);
+	}
+	memcpy(swapped, rec, len);
+	ecode = tdb->methods->twrite(tdb, off,
+				     tdb_convert(tdb, swapped, len), len);
+	free(swapped);
+	return ecode;
+}
+
+/* Read len bytes at off into rec through the methods table, then apply
+ * endian conversion (a no-op unless the database has TDB_CONVERT set).
+ * Returns the result of the read; on failure rec is not converted. */
+enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+				void *rec, size_t len)
+{
+	enum TDB_ERROR ecode = tdb->methods->tread(tdb, off, rec, len);
+
+	/* Only byte-swap data that was actually read: after a failed read
+	 * the buffer may still be uninitialised. */
+	if (ecode == TDB_SUCCESS)
+		tdb_convert(tdb, rec, len);
+	return ecode;
+}
+
+/* Store the single offset val at off.  Writes through a direct pointer
+ * when no conversion is needed and one is available, else goes through
+ * tdb_write_convert().  Fails with TDB_ERR_RDONLY on a read-only db. */
+enum TDB_ERROR tdb_write_off(struct tdb_context *tdb,
+			     tdb_off_t off, tdb_off_t val)
+{
+	tdb_off_t *direct = NULL;
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Write to read-only database");
+	}
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		direct = tdb->methods->direct(tdb, off, sizeof(*direct),
+					      true);
+		if (TDB_PTR_IS_ERR(direct))
+			return TDB_PTR_ERR(direct);
+	}
+
+	if (!direct)
+		return tdb_write_convert(tdb, off, &val, sizeof(val));
+
+	*direct = val;
+	return TDB_SUCCESS;
+}
+
+/* malloc a buffer of prefix + len bytes and read len bytes from offset
+ * into it, starting prefix bytes in (the prefix itself is left
+ * uninitialised).  Returns the buffer, or an error pointer. */
+static void *_tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset,
+			     tdb_len_t len, unsigned int prefix)
+{
+	enum TDB_ERROR ecode;
+	unsigned char *mem;
+
+	/* some systems don't like zero length malloc */
+	mem = malloc(prefix + len ? prefix + len : 1);
+	if (!mem) {
+		tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_USE_ERROR,
+			   "tdb_alloc_read malloc failed len=%zu",
+			   (size_t)(prefix + len));
+		return TDB_ERR_PTR(TDB_ERR_OOM);
+	}
+
+	ecode = tdb->methods->tread(tdb, offset, mem+prefix, len);
+	if (likely(ecode == TDB_SUCCESS))
+		return mem;
+
+	free(mem);
+	return TDB_ERR_PTR(ecode);
+}
+
+/* Read len bytes at offset into a freshly malloc'ed buffer.  Returns the
+ * buffer or an error pointer. */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
+{
+	const unsigned int no_prefix = 0;
+
+	return _tdb_alloc_read(tdb, offset, len, no_prefix);
+}
+
+/* Write len bytes to the file at off by repeatedly pwrite()ing the
+ * size-byte pattern in buf.  A short write is reported as ENOSPC. */
+static enum TDB_ERROR fill(struct tdb_context *tdb,
+			   const void *buf, size_t size,
+			   tdb_off_t off, tdb_len_t len)
+{
+	for (; len != 0; ) {
+		size_t chunk = len > size ? size : len;
+		ssize_t written = pwrite(tdb->file->fd, buf, chunk, off);
+
+		if (written != chunk) {
+			if (written >= 0)
+				errno = ENOSPC;
+
+			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					  "fill failed:"
+					  " %zi at %zu len=%zu (%s)",
+					  written, (size_t)off, (size_t)len,
+					  strerror(errno));
+		}
+		off += chunk;
+		len -= chunk;
+	}
+	return TDB_SUCCESS;
+}
+
+/* Grow the database by addition bytes.  For TDB_INTERNAL databases the
+   in-memory buffer is realloc'ed; otherwise the file is extended.  We
+   prefer to use ftruncate, as that is what posix says to use for mmap
+   expansion, and then fill the new area so the file is not sparse. */
+static enum TDB_ERROR tdb_expand_file(struct tdb_context *tdb,
+				      tdb_len_t addition)
+{
+	char pattern[8192];
+	enum TDB_ERROR ecode;
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Expand on read-only database");
+	}
+
+	if (tdb->flags & TDB_INTERNAL) {
+		char *grown = realloc(tdb->file->map_ptr,
+				      tdb->file->map_size + addition);
+		if (!grown) {
+			return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					  "No memory to expand database");
+		}
+		tdb->file->map_ptr = grown;
+		tdb->file->map_size += addition;
+		return TDB_SUCCESS;
+	}
+
+	/* Unmap before trying to write; old TDB claimed OpenBSD had
+	 * problem with this otherwise. */
+	tdb_munmap(tdb->file);
+
+	/* If this fails, we try to fill anyway. */
+	if (ftruncate(tdb->file->fd, tdb->file->map_size + addition)) {
+		/* deliberately ignored */
+	}
+
+	/* now fill the file with something. This ensures that the
+	   file isn't sparse, which would be very bad if we ran out of
+	   disk. This must be done with write, not via mmap */
+	memset(pattern, 0x43, sizeof(pattern));
+	ecode = fill(tdb, pattern, sizeof(pattern), tdb->file->map_size,
+		     addition);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	tdb->file->map_size += addition;
+	tdb_mmap(tdb);
+	return TDB_SUCCESS;
+}
+
+/* Obtain read access to len bytes at off.  Hands back a direct pointer
+ * when possible (counted in tdb->direct_access); otherwise a malloc'ed
+ * copy tracked on the tdb->access list, byte-swapped if convert is set.
+ * Pair with tdb_access_release().  Returns an error pointer on failure. */
+const void *tdb_access_read(struct tdb_context *tdb,
+			    tdb_off_t off, tdb_len_t len, bool convert)
+{
+	struct tdb_access_hdr *hdr;
+	void *direct = NULL;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		direct = tdb->methods->direct(tdb, off, len, false);
+		if (TDB_PTR_IS_ERR(direct))
+			return direct;
+	}
+
+	if (direct) {
+		tdb->direct_access++;
+		return direct;
+	}
+
+	/* Fall back to a private copy prefixed by a tracking header. */
+	hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
+	if (TDB_PTR_IS_ERR(hdr))
+		return hdr;
+
+	hdr->next = tdb->access;
+	tdb->access = hdr;
+	if (convert)
+		tdb_convert(tdb, (void *)(hdr + 1), len);
+
+	return hdr + 1;
+}
+
+/* Obtain write access to len bytes at off.  Hands back a direct pointer
+ * when possible (counted in tdb->direct_access); otherwise a malloc'ed
+ * copy whose header records off, len and convert for the later
+ * tdb_access_commit().  Returns an error pointer on failure, including
+ * TDB_ERR_RDONLY for a read-only database. */
+void *tdb_access_write(struct tdb_context *tdb,
+		       tdb_off_t off, tdb_len_t len, bool convert)
+{
+	struct tdb_access_hdr *hdr;
+	void *direct = NULL;
+
+	if (tdb->read_only) {
+		tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+			   "Write to read-only database");
+		return TDB_ERR_PTR(TDB_ERR_RDONLY);
+	}
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		direct = tdb->methods->direct(tdb, off, len, true);
+		if (TDB_PTR_IS_ERR(direct))
+			return direct;
+	}
+
+	if (direct) {
+		tdb->direct_access++;
+		return direct;
+	}
+
+	/* Fall back to a private copy prefixed by a tracking header. */
+	hdr = _tdb_alloc_read(tdb, off, len, sizeof(*hdr));
+	if (TDB_PTR_IS_ERR(hdr))
+		return hdr;
+
+	hdr->off = off;
+	hdr->len = len;
+	hdr->convert = convert;
+	hdr->next = tdb->access;
+	tdb->access = hdr;
+	if (convert)
+		tdb_convert(tdb, (void *)(hdr + 1), len);
+
+	return hdr + 1;
+}
+
+/* Find the tdb->access list link whose buffer (the memory right after the
+ * header) is p.  Returns the address of that link, or NULL if p is not a
+ * tracked buffer. */
+static struct tdb_access_hdr **find_hdr(struct tdb_context *tdb, const void *p)
+{
+	struct tdb_access_hdr **link = &tdb->access;
+
+	while (*link) {
+		if (*link + 1 == p)
+			return link;
+		link = &(*link)->next;
+	}
+	return NULL;
+}
+
+/* Finish with a pointer from tdb_access_read(): free it if it is a
+ * tracked copy, otherwise just drop the direct-access count. */
+void tdb_access_release(struct tdb_context *tdb, const void *p)
+{
+	struct tdb_access_hdr **link = find_hdr(tdb, p);
+	struct tdb_access_hdr *hdr;
+
+	if (!link) {
+		tdb->direct_access--;
+		return;
+	}
+
+	/* Unlink the header from the list before freeing it. */
+	hdr = *link;
+	*link = hdr->next;
+	free(hdr);
+}
+
+/* Finish with a pointer from tdb_access_write().  A tracked copy is
+ * written back to the offset recorded in its header (converted if it was
+ * requested that way) and freed; a direct pointer only has the
+ * direct-access count dropped.  Returns the result of the write. */
+enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p)
+{
+	struct tdb_access_hdr **link = find_hdr(tdb, p);
+	struct tdb_access_hdr *hdr;
+	enum TDB_ERROR ecode;
+
+	if (!link) {
+		tdb->direct_access--;
+		return TDB_SUCCESS;
+	}
+
+	hdr = *link;
+	ecode = hdr->convert
+		? tdb_write_convert(tdb, hdr->off, p, hdr->len)
+		: tdb_write(tdb, hdr->off, p, hdr->len);
+
+	/* The copy is discarded whether or not the write succeeded. */
+	*link = hdr->next;
+	free(hdr);
+	return ecode;
+}
+
+/* Return a pointer into the mapping for [off, off + len), NULL when the
+ * file is not mapped, or an error pointer when the range fails the
+ * bounds probe.  write_mode is not consulted by this implementation. */
+static void *tdb_direct(struct tdb_context *tdb, tdb_off_t off, size_t len,
+			bool write_mode)
+{
+	enum TDB_ERROR ecode;
+
+	if (unlikely(!tdb->file->map_ptr))
+		return NULL;
+
+	/* Probe only: an out-of-bounds range is reported, not logged. */
+	ecode = tdb_oob(tdb, off + len, true);
+	if (likely(ecode == TDB_SUCCESS))
+		return (char *)tdb->file->map_ptr + off;
+
+	return TDB_ERR_PTR(ecode);
+}
+
+/* Increment the sequence number stored in the database header.
+ *
+ * The number is updated in place through a direct pointer when one is
+ * available; otherwise it is read, incremented and written back.  It is
+ * kept non-negative when viewed as a signed 64-bit value.  Errors are not
+ * reported to the caller. */
+void tdb_inc_seqnum(struct tdb_context *tdb)
+{
+	tdb_off_t seq;
+
+	if (likely(!(tdb->flags & TDB_CONVERT))) {
+		int64_t *direct;
+
+		direct = tdb->methods->direct(tdb,
+					      offsetof(struct tdb_header,
+						       seqnum),
+					      sizeof(*direct), true);
+		/* direct() can also hand back an error pointer; never
+		 * dereference that, use the slow path instead. */
+		if (likely(direct && !TDB_PTR_IS_ERR(direct))) {
+			/* Don't let it go negative, even briefly.  The
+			 * INT64_MAX test comes first so that the addition
+			 * can never overflow. */
+			if (unlikely(*direct == INT64_MAX
+				     || (*direct) + 1 < 0))
+				*direct = 0;
+			(*direct)++;
+			return;
+		}
+	}
+
+	seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
+	if (!TDB_OFF_IS_ERR(seq)) {
+		seq++;
+		if (unlikely((int64_t)seq < 0))
+			seq = 0;
+		tdb_write_off(tdb, offsetof(struct tdb_header, seqnum), seq);
+	}
+}
+
+static const struct tdb_methods io_methods = { /* default I/O operations, installed by tdb_io_init(); positional, so the order must match struct tdb_methods (declared elsewhere) */
+ tdb_read, /* read a lump of data (presumably the ->tread slot - confirm) */
+ tdb_write, /* write a lump of data (presumably the ->twrite slot - confirm) */
+ tdb_oob, /* bounds check (presumably the ->oob slot - confirm) */
+ tdb_expand_file, /* grow the file or the internal buffer */
+ tdb_direct, /* direct pointer into the mapping (presumably ->direct - confirm) */
+};
+
+/*
+ initialise the default methods table: point tdb->methods at the
+ file-local io_methods table defined above
+*/
+void tdb_io_init(struct tdb_context *tdb)
+{
+ tdb->methods = &io_methods;
+}
diff --git a/lib/tdb2/lock.c b/lib/tdb2/lock.c
new file mode 100644
index 00000000000..76b8bc31579
--- /dev/null
+++ b/lib/tdb2/lock.c
@@ -0,0 +1,875 @@
+ /*
+ Unix SMB/CIFS implementation.
+
+ trivial database library
+
+ Copyright (C) Andrew Tridgell 1999-2005
+ Copyright (C) Paul `Rusty' Russell 2000
+ Copyright (C) Jeremy Allison 2000-2003
+
+ ** NOTE! The following LGPL license applies to the tdb
+ ** library. This does NOT imply that all of Samba is released
+ ** under the LGPL
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#include <assert.h>
+#include <ccan/build_assert/build_assert.h>
+
+/* If we were threaded, we could wait for unlock, but we're not, so fail.
+ *
+ * Logs a TDB_ERR_LOCK usage error that names the calling function ("call")
+ * and returns the value tdb_logerr() hands back. */
+static enum TDB_ERROR owner_conflict(struct tdb_context *tdb, const char *call)
+{
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+ "%s: lock owned by another tdb in this process.",
+ call);
+}
+
+/* If we fork, we no longer really own locks.
+ *
+ * Returns true when the file has no locks recorded, or when the recorded
+ * locker pid is still getpid().  Otherwise returns false, first logging a
+ * usage error tagged with "call" if "log" is set. */
+static bool check_lock_pid(struct tdb_context *tdb,
+			   const char *call, bool log)
+{
+	bool holds_locks = tdb->file->allrecord_lock.count != 0
+		|| tdb->file->num_lockrecs != 0;
+
+	/* Nothing held, or still the process that took the locks: fine. */
+	if (!holds_locks || tdb->file->locker == getpid())
+		return true;
+
+	if (log) {
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			   "%s: fork() detected after lock acquisition!"
+			   " (%u vs %u)", call, tdb->file->locker, getpid());
+	}
+	return false;
+}
+
+/*
+ * Default lock function: take an fcntl() lock of type "rw" on
+ * [off, off + len) of fd, relative to the start of the file.
+ * Uses F_SETLKW when waitflag is set and F_SETLK otherwise, and retries
+ * for as long as the call fails with EINTR.  Returns fcntl()'s result.
+ * "unused" is ignored.
+ */
+int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		   void *unused)
+{
+	const int cmd = waitflag ? F_SETLKW : F_SETLK;
+	struct flock fl;
+	int ret;
+
+	for (;;) {
+		fl.l_type = rw;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, cmd, &fl);
+		if (ret == 0 || errno != EINTR)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * Default unlock function: release the fcntl() lock on [off, off + len)
+ * of fd by setting an F_UNLCK record with F_SETLKW, retrying while the
+ * call fails with EINTR.  Returns fcntl()'s result.  "rw" and "unused"
+ * are ignored.
+ */
+int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+	struct flock fl;
+	int ret;
+
+	for (;;) {
+		fl.l_type = F_UNLCK;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, F_SETLKW, &fl);
+		if (ret == 0 || errno != EINTR)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * Lowest-level lock call: forwards to tdb->lock_fn and keeps statistics.
+ * If the file has no locks recorded yet, the current pid is remembered as
+ * the locker first.  Returns whatever lock_fn returns.
+ */
+static int lock(struct tdb_context *tdb,
+		int rw, off_t off, off_t len, bool waitflag)
+{
+	int ret;
+
+	if (tdb->file->allrecord_lock.count == 0
+	    && tdb->file->num_lockrecs == 0)
+		tdb->file->locker = getpid();
+
+	tdb->stats.lock_lowlevel++;
+	ret = tdb->lock_fn(tdb->file->fd, rw, off, len, waitflag,
+			   tdb->lock_data);
+	if (waitflag)
+		return ret;
+
+	/* Non-blocking attempts are counted separately, with failures. */
+	tdb->stats.lock_nonblock++;
+	if (ret != 0)
+		tdb->stats.lock_nonblock_fail++;
+	return ret;
+}
+
+/*
+ * Lowest-level unlock call: forwards to tdb->unlock_fn and returns its
+ * result.
+ *
+ * The "#if 0" section is disabled debugging code: it scans /proc/locks for
+ * a POSIX advisory lock held by this pid starting at "off" and aborts if
+ * the length or type differs from the arguments, or if none is found.
+ */
+static int unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
+{
+#if 0 /* Check they matched up locks and unlocks correctly. */
+ char line[80];
+ FILE *locks;
+ bool found = false;
+
+ locks = fopen("/proc/locks", "r");
+
+ while (fgets(line, 80, locks)) {
+ char *p;
+ int type, start, l;
+
+ /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */
+ p = strchr(line, ':') + 1;
+ if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
+ continue;
+ p += strlen(" FLOCK ADVISORY ");
+ if (strncmp(p, "READ ", strlen("READ ")) == 0)
+ type = F_RDLCK;
+ else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+ type = F_WRLCK;
+ else
+ abort();
+ p += 6;
+ if (atoi(p) != getpid())
+ continue;
+ p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+ start = atoi(p);
+ p = strchr(p, ' ') + 1;
+ if (strncmp(p, "EOF", 3) == 0)
+ l = 0;
+ else
+ l = atoi(p) - start + 1;
+
+ if (off == start) {
+ if (len != l) {
+ fprintf(stderr, "Len %u should be %u: %s",
+ (int)len, l, line);
+ abort();
+ }
+ if (type != rw) {
+ fprintf(stderr, "Type %s wrong: %s",
+ rw == F_RDLCK ? "READ" : "WRITE", line);
+ abort();
+ }
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ fprintf(stderr, "Unlock on %u@%u not found!",
+ (int)off, (int)len);
+ abort();
+ }
+
+ fclose(locks);
+#endif
+
+ return tdb->unlock_fn(tdb->file->fd, rw, off, len, tdb->lock_data);
+}
+
+/* a byte range locking function - returns TDB_SUCCESS on success
+   this functions locks len bytes at the specified offset.
+
+   note that a len of zero means lock to end of file
+
+   With TDB_NOLOCK set nothing is locked and TDB_SUCCESS is returned.
+   A write lock on a read-only database gives TDB_ERR_RDONLY, an
+   offset + len that does not fit in size_t gives TDB_ERR_IO, and any
+   failure of the underlying lock call gives TDB_ERR_LOCK.  That last
+   failure is logged unless TDB_LOCK_PROBE is set or errno is EAGAIN or
+   EINTR.
+*/
+static enum TDB_ERROR tdb_brlock(struct tdb_context *tdb,
+				 int rw_type, tdb_off_t offset, tdb_off_t len,
+				 enum tdb_lock_flags flags)
+{
+	int ret;
+
+	if (tdb->flags & TDB_NOLOCK) {
+		return TDB_SUCCESS;
+	}
+
+	if (rw_type == F_WRLCK && tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_RDONLY, TDB_LOG_USE_ERROR,
+				  "Write lock attempted on read-only database");
+	}
+
+	/* A 32 bit system cannot open a 64-bit file, but it could have
+	 * expanded since then: check here. */
+	if ((size_t)(offset + len) != offset + len) {
+		/* %llu needs an unsigned long long argument. */
+		return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				  "tdb_brlock: lock on giant offset %llu",
+				  (unsigned long long)(offset + len));
+	}
+
+	ret = lock(tdb, rw_type, offset, len, flags & TDB_LOCK_WAIT);
+	if (ret != 0) {
+		/* Generic lock error. errno set by fcntl.
+		 * EAGAIN is an expected return from non-blocking
+		 * locks. */
+		if (!(flags & TDB_LOCK_PROBE)
+		    && (errno != EAGAIN && errno != EINTR)) {
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				   "tdb_brlock failed (fd=%d) at"
+				   " offset %zu rw_type=%d flags=%d len=%zu:"
+				   " %s",
+				   tdb->file->fd, (size_t)offset, rw_type,
+				   flags, (size_t)len, strerror(errno));
+		}
+		return TDB_ERR_LOCK;
+	}
+	return TDB_SUCCESS;
+}
+
+/*
+ * Release a byte-range lock of "len" bytes at "offset".
+ *
+ * Does nothing (TDB_SUCCESS) under TDB_NOLOCK.  Returns TDB_ERR_LOCK if
+ * check_lock_pid() reports that the locks belong to another pid, and
+ * logs an error through tdb_logerr() if the low-level unlock returns -1.
+ */
+static enum TDB_ERROR tdb_brunlock(struct tdb_context *tdb,
+				   int rw_type, tdb_off_t offset, size_t len)
+{
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	if (!check_lock_pid(tdb, "tdb_brunlock", true))
+		return TDB_ERR_LOCK;
+
+	if (unlock(tdb, rw_type, offset, len) != -1)
+		return TDB_SUCCESS;
+
+	return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+			  "tdb_brunlock failed (fd=%d) at offset %zu"
+			  " rw_type=%d len=%zu: %s",
+			  tdb->file->fd, (size_t)offset, rw_type,
+			  (size_t)len, strerror(errno));
+}
+
+/*
+ upgrade a read lock to a write lock. This needs to be handled in a
+ special way as some OSes (such as solaris) have too conservative
+ deadlock detection and claim a deadlock when progress can be
+ made. For those OSes we may loop for a while.
+
+ Preconditions checked here: exactly one allrecord lock is held, its
+ "off" field is 1 (tdb_allrecord_lock() stores its "upgradable" argument
+ there) and this context owns it. On success the recorded type becomes
+ F_WRLCK and "off" is cleared; every failure returns TDB_ERR_LOCK.
+*/
+enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb)
+{
+ /* Upper bound on retries of the upgrade attempt. */
+ int count = 1000;
+
+ if (!check_lock_pid(tdb, "tdb_transaction_prepare_commit", true))
+ return TDB_ERR_LOCK;
+
+ if (tdb->file->allrecord_lock.count != 1) {
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_allrecord_upgrade failed:"
+ " count %u too high",
+ tdb->file->allrecord_lock.count);
+ }
+
+ if (tdb->file->allrecord_lock.off != 1) {
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_allrecord_upgrade failed:"
+ " already upgraded?");
+ }
+
+ if (tdb->file->allrecord_lock.owner != tdb) {
+ return owner_conflict(tdb, "tdb_allrecord_upgrade");
+ }
+
+ while (count--) {
+ struct timeval tv;
+ /* TDB_LOCK_PROBE keeps tdb_brlock() from logging a failure;
+ * a len of 0 locks from TDB_HASH_LOCK_START to end of file. */
+ if (tdb_brlock(tdb, F_WRLCK,
+ TDB_HASH_LOCK_START, 0,
+ TDB_LOCK_WAIT|TDB_LOCK_PROBE) == TDB_SUCCESS) {
+ tdb->file->allrecord_lock.ltype = F_WRLCK;
+ tdb->file->allrecord_lock.off = 0;
+ return TDB_SUCCESS;
+ }
+ /* Only a reported deadlock is worth retrying; errno is the one
+ * left behind by the failed lock attempt. */
+ if (errno != EDEADLK) {
+ break;
+ }
+ /* sleep for as short a time as we can - more portable than usleep() */
+ tv.tv_sec = 0;
+ tv.tv_usec = 1;
+ select(0, NULL, NULL, NULL, &tv);
+ }
+
+ /* EAGAIN and EINTR are not logged, matching tdb_brlock(). */
+ if (errno != EAGAIN && errno != EINTR)
+ tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_allrecord_upgrade failed");
+ return TDB_ERR_LOCK;
+}
+
+/*
+ * Look up the lock record for "offset" among this file's lock records.
+ *
+ * Only the first record at that offset is considered.  With a non-NULL
+ * "owner" the record is returned only if it belongs to that context;
+ * with a NULL owner any record at the offset is returned.  Returns NULL
+ * when there is no suitable record.
+ */
+static struct tdb_lock *find_nestlock(struct tdb_context *tdb, tdb_off_t offset,
+				      const struct tdb_context *owner)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < tdb->file->num_lockrecs; idx++) {
+		struct tdb_lock *rec = &tdb->file->lockrecs[idx];
+
+		if (rec->off != offset)
+			continue;
+
+		/* Found the offset: it either matches the owner or fails. */
+		if (owner != NULL && rec->owner != owner)
+			return NULL;
+		return rec;
+	}
+	return NULL;
+}
+
+/*
+ * Take the locks needed for recovery and run tdb_transaction_recover().
+ *
+ * Acquires a write allrecord lock and then the open lock, both blocking
+ * and with TDB_LOCK_NOCHECK so that taking them does not itself trigger a
+ * recovery check.  Both are released before returning.  Returns the first
+ * lock failure, or else the result of tdb_transaction_recover().
+ */
+enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb)
+{
+	enum TDB_ERROR ecode;
+
+	/* Tag the fork-detection message with this function's own name;
+	 * the old label named tdb_transaction_prepare_commit. */
+	if (!check_lock_pid(tdb, "tdb_lock_and_recover", true))
+		return TDB_ERR_LOCK;
+
+	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK,
+				   false);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
+	if (ecode != TDB_SUCCESS) {
+		tdb_allrecord_unlock(tdb, F_WRLCK);
+		return ecode;
+	}
+	ecode = tdb_transaction_recover(tdb);
+	tdb_unlock_open(tdb, F_WRLCK);
+	tdb_allrecord_unlock(tdb, F_WRLCK);
+
+	return ecode;
+}
+
+/* lock an offset in the database.
+ *
+ * Takes a one-byte lock of type "ltype" at "offset", with nesting: if the
+ * file already has a record for that offset owned by this context, only
+ * its count is bumped. Otherwise the kernel lock is taken via tdb_brlock()
+ * and a new record is appended to tdb->file->lockrecs.
+ *
+ * Unless TDB_LOCK_NOCHECK is set, the first lock taken on a file also
+ * checks tdb_needs_recovery() and runs tdb_lock_and_recover() if needed.
+ *
+ * Returns TDB_SUCCESS, or an error code (invalid offset, fork detected,
+ * owned by another context, read->write nesting, OOM, lock failure). */
+static enum TDB_ERROR tdb_nest_lock(struct tdb_context *tdb,
+ tdb_off_t offset, int ltype,
+ enum tdb_lock_flags flags)
+{
+ struct tdb_lock *new_lck;
+ enum TDB_ERROR ecode;
+
+ /* Upper bound: end of the hash lock range plus map_size / 8. */
+ if (offset > (TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
+ + tdb->file->map_size / 8)) {
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_nest_lock: invalid offset %zu ltype=%d",
+ (size_t)offset, ltype);
+ }
+
+ if (tdb->flags & TDB_NOLOCK)
+ return TDB_SUCCESS;
+
+ if (!check_lock_pid(tdb, "tdb_nest_lock", true)) {
+ return TDB_ERR_LOCK;
+ }
+
+ tdb->stats.locks++;
+
+ /* NULL owner: find the record whoever holds it, so that a lock held
+ * by another context on the same file is reported as a conflict. */
+ new_lck = find_nestlock(tdb, offset, NULL);
+ if (new_lck) {
+ if (new_lck->owner != tdb) {
+ return owner_conflict(tdb, "tdb_nest_lock");
+ }
+
+ if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_nest_lock:"
+ " offset %zu has read lock",
+ (size_t)offset);
+ }
+ /* Just increment the struct, posix locks don't stack. */
+ new_lck->count++;
+ return TDB_SUCCESS;
+ }
+
+#if 0
+ if (tdb->file->num_lockrecs
+ && offset >= TDB_HASH_LOCK_START
+ && offset < TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE) {
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_nest_lock: already have a hash lock?");
+ }
+#endif
+
+ /* Grow the record array before taking the kernel lock, so an
+ * allocation failure leaves nothing to undo. The old array stays
+ * valid if realloc() fails. */
+ new_lck = (struct tdb_lock *)realloc(
+ tdb->file->lockrecs,
+ sizeof(*tdb->file->lockrecs) * (tdb->file->num_lockrecs+1));
+ if (new_lck == NULL) {
+ return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+ "tdb_nest_lock:"
+ " unable to allocate %zu lock struct",
+ tdb->file->num_lockrecs + 1);
+ }
+ tdb->file->lockrecs = new_lck;
+
+ /* Since fcntl locks don't nest, we do a lock for the first one,
+ and simply bump the count for future ones */
+ ecode = tdb_brlock(tdb, ltype, offset, 1, flags);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* First time we grab a lock, perhaps someone died in commit? */
+ if (!(flags & TDB_LOCK_NOCHECK)
+ && tdb->file->num_lockrecs == 0) {
+ tdb_bool_err berr = tdb_needs_recovery(tdb);
+ if (berr != false) {
+ /* Drop our lock before recovering (or failing);
+ * it is re-taken below once recovery succeeded. */
+ tdb_brunlock(tdb, ltype, offset, 1);
+
+ /* A negative value is returned to the caller as the
+ * error code. */
+ if (berr < 0)
+ return berr;
+ ecode = tdb_lock_and_recover(tdb);
+ if (ecode == TDB_SUCCESS) {
+ ecode = tdb_brlock(tdb, ltype, offset, 1,
+ flags);
+ }
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ }
+ }
+
+ /* Record the new lock in the slot reserved by the realloc above. */
+ tdb->file->lockrecs[tdb->file->num_lockrecs].owner = tdb;
+ tdb->file->lockrecs[tdb->file->num_lockrecs].off = offset;
+ tdb->file->lockrecs[tdb->file->num_lockrecs].count = 1;
+ tdb->file->lockrecs[tdb->file->num_lockrecs].ltype = ltype;
+ tdb->file->num_lockrecs++;
+
+ return TDB_SUCCESS;
+}
+
+/*
+ * Drop one level of the nested lock this context holds at "off".
+ *
+ * While the record's count is above 1 only the count is decremented.
+ * The last release unlocks the byte in the kernel and removes the record
+ * from the array; the result of that unlock is returned.  An error is
+ * logged if this context holds no lock at "off".
+ */
+static enum TDB_ERROR tdb_nest_unlock(struct tdb_context *tdb,
+				      tdb_off_t off, int ltype)
+{
+	struct tdb_lock *held;
+	enum TDB_ERROR ecode;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return TDB_SUCCESS;
+
+	held = find_nestlock(tdb, off, tdb);
+	if (held == NULL || held->count == 0) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_nest_unlock: no lock for %zu",
+				  (size_t)off);
+	}
+
+	if (held->count > 1) {
+		held->count--;
+		return TDB_SUCCESS;
+	}
+
+	/*
+	 * Last reference: release the kernel lock.  The record's count is
+	 * not touched because the record is overwritten just below.
+	 */
+	ecode = tdb_brunlock(tdb, ltype, off, 1);
+
+	/*
+	 * Remove the record by moving the array's last element into its
+	 * place and shrinking the element count by one.
+	 */
+	tdb->file->num_lockrecs--;
+	*held = tdb->file->lockrecs[tdb->file->num_lockrecs];
+
+	return ecode;
+}
+
+/*
+ get the transaction lock: a blocking (TDB_LOCK_WAIT) nested lock of type
+ ltype at TDB_TRANSACTION_LOCK. Returns the result of tdb_nest_lock().
+ */
+enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype)
+{
+ return tdb_nest_lock(tdb, TDB_TRANSACTION_LOCK, ltype, TDB_LOCK_WAIT);
+}
+
+/*
+ release the transaction lock (one nesting level at TDB_TRANSACTION_LOCK).
+ The result of tdb_nest_unlock() is discarded.
+ */
+void tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
+{
+ tdb_nest_unlock(tdb, TDB_TRANSACTION_LOCK, ltype);
+}
+
+/* We only need to lock individual bytes, but Linux merges consecutive locks
+ * so we lock in contiguous ranges.
+ *
+ * Strategy: try the whole range [off, off + len) without blocking; if
+ * that fails, recurse on the first half and then the second half.  A
+ * range of a single byte is locked with the caller's flags.  If the
+ * second half fails, the first half is unlocked again before the error
+ * is returned. */
+static enum TDB_ERROR tdb_lock_gradual(struct tdb_context *tdb,
+				       int ltype, enum tdb_lock_flags flags,
+				       tdb_off_t off, tdb_off_t len)
+{
+	enum tdb_lock_flags try_flags = (flags & ~TDB_LOCK_WAIT);
+	tdb_off_t first_half = len / 2;
+	enum TDB_ERROR ecode;
+
+	if (len <= 1) {
+		/* 0 would mean to end-of-file... */
+		assert(len != 0);
+		/* Single hash. Just do blocking lock. */
+		return tdb_brlock(tdb, ltype, off, len, flags);
+	}
+
+	/* Optimistic attempt on the whole range, never blocking. */
+	if (tdb_brlock(tdb, ltype, off, len, try_flags) == TDB_SUCCESS)
+		return TDB_SUCCESS;
+
+	/* Split: lower half first ... */
+	ecode = tdb_lock_gradual(tdb, ltype, flags, off, first_half);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	/* ... then the remainder, undoing the lower half on failure. */
+	ecode = tdb_lock_gradual(tdb, ltype, flags,
+				 off + first_half, len - first_half);
+	if (ecode != TDB_SUCCESS)
+		tdb_brunlock(tdb, ltype, off, first_half);
+
+	return ecode;
+}
+
+/* lock/unlock entire database. It can only be upgradable if you have some
+ * other way of guaranteeing exclusivity (ie. transaction write lock).
+ *
+ * Nests: if this context already holds the allrecord lock, a request for a
+ * read lock, or any request while a write lock is held, just bumps the
+ * count. Fails if another context in the process owns the lock, if hash
+ * chain locks are held, or if "upgradable" is combined with a type other
+ * than F_RDLCK.
+ *
+ * Unless TDB_LOCK_NOCHECK is set, the database is checked for a pending
+ * recovery once locked; if one is needed the lock is dropped, recovery is
+ * run and the locking is retried from the start. */
+enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+ enum tdb_lock_flags flags, bool upgradable)
+{
+ enum TDB_ERROR ecode;
+ tdb_bool_err berr;
+
+ if (tdb->flags & TDB_NOLOCK)
+ return TDB_SUCCESS;
+
+ if (!check_lock_pid(tdb, "tdb_allrecord_lock", true)) {
+ return TDB_ERR_LOCK;
+ }
+
+ if (tdb->file->allrecord_lock.count) {
+ if (tdb->file->allrecord_lock.owner != tdb) {
+ return owner_conflict(tdb, "tdb_allrecord_lock");
+ }
+
+ /* A read request nests under either type; a write request
+ * nests only under an existing write lock. */
+ if (ltype == F_RDLCK
+ || tdb->file->allrecord_lock.ltype == F_WRLCK) {
+ tdb->file->allrecord_lock.count++;
+ return TDB_SUCCESS;
+ }
+
+ /* a global lock of a different type exists */
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+ "tdb_allrecord_lock: already have %s lock",
+ tdb->file->allrecord_lock.ltype == F_RDLCK
+ ? "read" : "write");
+ }
+
+ if (tdb_has_hash_locks(tdb)) {
+ /* can't combine global and chain locks */
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+ "tdb_allrecord_lock:"
+ " already have chain lock");
+ }
+
+ if (upgradable && ltype != F_RDLCK) {
+ /* tdb error: you can't upgrade a write lock! */
+ return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+ "tdb_allrecord_lock:"
+ " can't upgrade a write lock");
+ }
+
+ tdb->stats.locks++;
+again:
+ /* Lock hashes, gradually. */
+ ecode = tdb_lock_gradual(tdb, ltype, flags, TDB_HASH_LOCK_START,
+ TDB_HASH_LOCK_RANGE);
+ if (ecode != TDB_SUCCESS)
+ return ecode;
+
+ /* Lock free tables: there to end of file. */
+ ecode = tdb_brlock(tdb, ltype,
+ TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE,
+ 0, flags);
+ if (ecode != TDB_SUCCESS) {
+ /* Undo the hash range locked above. */
+ tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START,
+ TDB_HASH_LOCK_RANGE);
+ return ecode;
+ }
+
+ tdb->file->allrecord_lock.owner = tdb;
+ tdb->file->allrecord_lock.count = 1;
+ /* If it's upgradable, it's actually exclusive so we can treat
+ * it as a write lock. */
+ tdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+ /* "off" doubles as the upgradable marker for this lock. */
+ tdb->file->allrecord_lock.off = upgradable;
+
+ /* Now check for needing recovery. */
+ if (flags & TDB_LOCK_NOCHECK)
+ return TDB_SUCCESS;
+
+ berr = tdb_needs_recovery(tdb);
+ if (likely(berr == false))
+ return TDB_SUCCESS;
+
+ /* Recovery needed, or the check failed: give the lock back first. */
+ tdb_allrecord_unlock(tdb, ltype);
+ if (berr < 0)
+ return berr;
+ ecode = tdb_lock_and_recover(tdb);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ goto again;
+}
+
+/* Take the open lock: a nested lock of type ltype at TDB_OPEN_LOCK, with
+ * the caller's flags. Returns the result of tdb_nest_lock(). */
+enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
+ int ltype, enum tdb_lock_flags flags)
+{
+ return tdb_nest_lock(tdb, TDB_OPEN_LOCK, ltype, flags);
+}
+
+/* Release one nesting level of the open lock; the result of
+ * tdb_nest_unlock() is discarded. */
+void tdb_unlock_open(struct tdb_context *tdb, int ltype)
+{
+ tdb_nest_unlock(tdb, TDB_OPEN_LOCK, ltype);
+}
+
+/* True when locking is enabled (TDB_NOLOCK not set) and this context owns
+ * a lock record at TDB_OPEN_LOCK. */
+bool tdb_has_open_lock(struct tdb_context *tdb)
+{
+ return !(tdb->flags & TDB_NOLOCK)
+ && find_nestlock(tdb, TDB_OPEN_LOCK, tdb) != NULL;
+}
+
+/* Take the expansion lock: a blocking nested lock of type ltype at
+ * TDB_EXPANSION_LOCK. Returns the result of tdb_nest_lock(). */
+enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype)
+{
+ /* Lock doesn't protect data, so don't check (we recurse if we do!) */
+ return tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype,
+ TDB_LOCK_WAIT | TDB_LOCK_NOCHECK);
+}
+
+/* Release one nesting level of the expansion lock; the result of
+ * tdb_nest_unlock() is discarded. */
+void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
+{
+ tdb_nest_unlock(tdb, TDB_EXPANSION_LOCK, ltype);
+}
+
+/* unlock entire db
+ *
+ * Drops one nesting level of the allrecord lock.  Logs and returns
+ * without unlocking if no such lock is held, if another context owns it,
+ * or if "ltype" does not match the recorded type (an upgradable lock may
+ * also be released as F_RDLCK).  The final release clears the recorded
+ * count and type and unlocks from TDB_HASH_LOCK_START to end of file. */
+void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
+{
+	if (tdb->flags & TDB_NOLOCK)
+		return;
+
+	if (tdb->file->allrecord_lock.count == 0) {
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			   "tdb_allrecord_unlock: not locked!");
+		return;
+	}
+
+	if (tdb->file->allrecord_lock.owner != tdb) {
+		tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_USE_ERROR,
+			   "tdb_allrecord_unlock: not locked by us!");
+		return;
+	}
+
+	if (tdb->file->allrecord_lock.ltype != ltype) {
+		/* Upgradable locks are marked as write locks, so the one
+		 * tolerated mismatch is releasing such a lock as a read
+		 * lock. */
+		bool upgradable_as_read = tdb->file->allrecord_lock.off
+			&& ltype == F_RDLCK;
+
+		if (!upgradable_as_read) {
+			tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				   "tdb_allrecord_unlock: have %s lock",
+				   tdb->file->allrecord_lock.ltype == F_RDLCK
+				   ? "read" : "write");
+			return;
+		}
+	}
+
+	if (tdb->file->allrecord_lock.count > 1) {
+		tdb->file->allrecord_lock.count--;
+		return;
+	}
+
+	/* Last reference: forget the lock, then release it in the kernel. */
+	tdb->file->allrecord_lock.count = 0;
+	tdb->file->allrecord_lock.ltype = 0;
+
+	tdb_brunlock(tdb, ltype, TDB_HASH_LOCK_START, 0);
+}
+
+/* True when this context owns a lock record at TDB_EXPANSION_LOCK. */
+bool tdb_has_expansion_lock(struct tdb_context *tdb)
+{
+ return find_nestlock(tdb, TDB_EXPANSION_LOCK, tdb) != NULL;
+}
+
+/*
+ * True when any lock record of this file (whichever context owns it) lies
+ * in [TDB_HASH_LOCK_START, TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE).
+ */
+bool tdb_has_hash_locks(struct tdb_context *tdb)
+{
+	unsigned int n;
+
+	for (n = 0; n < tdb->file->num_lockrecs; n++) {
+		if (tdb->file->lockrecs[n].off < TDB_HASH_LOCK_START)
+			continue;
+		if (tdb->file->lockrecs[n].off >= (TDB_HASH_LOCK_START
+						   + TDB_HASH_LOCK_RANGE))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * True when locking is enabled and any lock record of this file sits
+ * strictly above TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE, the region
+ * free_lock_off() maps free-bucket locks into.
+ */
+static bool tdb_has_free_lock(struct tdb_context *tdb)
+{
+	unsigned int n = 0;
+
+	if (tdb->flags & TDB_NOLOCK)
+		return false;
+
+	while (n < tdb->file->num_lockrecs) {
+		if (tdb->file->lockrecs[n].off
+		    > TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE)
+			return true;
+		n++;
+	}
+	return false;
+}
+
+/*
+ * Lock the hash chain that "hash_lock" maps to.
+ *
+ * The lock offset is TDB_HASH_LOCK_START plus the top
+ * TDB_HASH_LOCK_RANGE_BITS bits of hash_lock; hash_range is not used yet.
+ * If an allrecord lock is held by this context, no chain lock is taken:
+ * the call succeeds when ltype is F_RDLCK or equals the allrecord type,
+ * and fails otherwise.  Taking a chain lock while holding a free-bucket
+ * lock or the expansion lock is refused.
+ */
+enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
+			       tdb_off_t hash_lock,
+			       tdb_len_t hash_range,
+			       int ltype, enum tdb_lock_flags waitflag)
+{
+	/* FIXME: Do this properly, using hlock_range */
+	unsigned lockoff = TDB_HASH_LOCK_START
+		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
+
+	/* a allrecord lock allows us to avoid per chain locks */
+	if (tdb->file->allrecord_lock.count) {
+		if (!check_lock_pid(tdb, "tdb_lock_hashes", true))
+			return TDB_ERR_LOCK;
+
+		if (tdb->file->allrecord_lock.owner != tdb)
+			return owner_conflict(tdb, "tdb_lock_hashes");
+
+		if (ltype != F_RDLCK
+		    && ltype != tdb->file->allrecord_lock.ltype) {
+			return tdb_logerr(tdb, TDB_ERR_LOCK,
+					  TDB_LOG_USE_ERROR,
+					  "tdb_lock_hashes:"
+					  " already have %s allrecordlock",
+					  tdb->file->allrecord_lock.ltype
+					  == F_RDLCK
+					  ? "read" : "write");
+		}
+		return TDB_SUCCESS;
+	}
+
+	if (tdb_has_free_lock(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_hashes: already have free lock");
+	}
+
+	if (tdb_has_expansion_lock(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_hashes:"
+				  " already have expansion lock");
+	}
+
+	return tdb_nest_lock(tdb, lockoff, ltype, waitflag);
+}
+
+/*
+ * Release the hash chain lock that "hash_lock" maps to (same offset
+ * computation as tdb_lock_hashes(); hash_range is not used).
+ *
+ * Under TDB_NOLOCK this returns 0 at once.  While an allrecord lock is
+ * held there is no chain lock to drop: the call succeeds unless a write
+ * unlock is requested under a read-only allrecord lock.
+ */
+enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
+				 tdb_off_t hash_lock,
+				 tdb_len_t hash_range, int ltype)
+{
+	unsigned lockoff = TDB_HASH_LOCK_START
+		+ (hash_lock >> (64 - TDB_HASH_LOCK_RANGE_BITS));
+
+	if (tdb->flags & TDB_NOLOCK)
+		return 0;
+
+	/* No allrecord lock: a real per-chain lock has to be dropped. */
+	if (!tdb->file->allrecord_lock.count)
+		return tdb_nest_unlock(tdb, lockoff, ltype);
+
+	/* a allrecord lock allows us to avoid per chain locks */
+	if (ltype == F_WRLCK
+	    && tdb->file->allrecord_lock.ltype == F_RDLCK) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_unlock_hashes RO allrecord!");
+	}
+	return TDB_SUCCESS;
+}
+
+/* Hash locks use TDB_HASH_LOCK_START + the next 30 bits.
+ * Free-bucket locks start right after that range; bucket offsets are
+ * sizeof(tdb_off_t) apart, so we divide the bucket offset by that size.
+ * The result is that on 32 bit systems we don't use lock values > 2^31 on
+ * files that are less than 4GB.
+ */
+static tdb_off_t free_lock_off(tdb_off_t b_off)
+{
+	tdb_off_t bucket_index = b_off / sizeof(tdb_off_t);
+
+	return TDB_HASH_LOCK_START + TDB_HASH_LOCK_RANGE
+		+ bucket_index;
+}
+
+/*
+ * Write-lock the free bucket stored at file offset b_off (asserted to lie
+ * beyond the header).
+ *
+ * Under TDB_NOLOCK this returns 0 at once.  While an allrecord lock is
+ * held no bucket lock is taken: the call succeeds if that lock is a write
+ * lock and fails with a logged error otherwise.  In all other cases a
+ * nested F_WRLCK is taken at free_lock_off(b_off).
+ */
+enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
+				    enum tdb_lock_flags waitflag)
+{
+	assert(b_off >= sizeof(struct tdb_header));
+
+	if (tdb->flags & TDB_NOLOCK)
+		return 0;
+
+	/* a allrecord lock allows us to avoid per chain locks */
+	if (tdb->file->allrecord_lock.count) {
+		if (!check_lock_pid(tdb, "tdb_lock_free_bucket", true))
+			return TDB_ERR_LOCK;
+
+		if (tdb->file->allrecord_lock.ltype != F_WRLCK) {
+			return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+					  "tdb_lock_free_bucket with"
+					  " read-only allrecordlock!");
+		}
+		return 0;
+	}
+
+#if 0 /* FIXME */
+	if (tdb_has_expansion_lock(tdb)) {
+		return tdb_logerr(tdb, TDB_ERR_LOCK, TDB_LOG_ERROR,
+				  "tdb_lock_free_bucket:"
+				  " already have expansion lock");
+	}
+#endif
+
+	return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag);
+}
+
+/*
+ * Release the free-bucket lock for b_off.  While an allrecord lock is
+ * held nothing is unlocked.  The result of tdb_nest_unlock() is
+ * discarded.
+ */
+void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off)
+{
+	if (!tdb->file->allrecord_lock.count) {
+		tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK);
+	}
+}
+
+/* Write-lock the whole database: a blocking, non-upgradable allrecord lock
+ * of type F_WRLCK. Returns the result of tdb_allrecord_lock(). */
+enum TDB_ERROR tdb_lockall(struct tdb_context *tdb)
+{
+ return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+}
+
+/* Release one level of the whole-database write lock. */
+void tdb_unlockall(struct tdb_context *tdb)
+{
+ tdb_allrecord_unlock(tdb, F_WRLCK);
+}
+
+/* Read-lock the whole database: a blocking, non-upgradable allrecord lock
+ * of type F_RDLCK. Returns the result of tdb_allrecord_lock(). */
+enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb)
+{
+ return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+}
+
+/* Release one level of the whole-database read lock. */
+void tdb_unlockall_read(struct tdb_context *tdb)
+{
+ tdb_allrecord_unlock(tdb, F_RDLCK);
+}
+
+/*
+ * Release every lock this context still holds on its file.
+ *
+ * Does nothing if check_lock_pid() reports that the locks belong to
+ * another pid (no message is logged in that case).  Otherwise the
+ * allrecord lock is unlocked until its count reaches zero or it is no
+ * longer ours, and then every lock record owned by this context is
+ * unlocked.
+ */
+void tdb_lock_cleanup(struct tdb_context *tdb)
+{
+	unsigned int idx = 0;
+
+	/* We don't want to warn: they're allowed to close tdb after fork. */
+	if (!check_lock_pid(tdb, "tdb_close", false))
+		return;
+
+	while (tdb->file->allrecord_lock.count
+	       && tdb->file->allrecord_lock.owner == tdb) {
+		tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
+	}
+
+	/* Only step forward past records that are not ours.  After an
+	 * unlock the same slot is looked at again: it either still holds
+	 * our record with a lower count or now holds the record moved in
+	 * from the end of the array. */
+	while (idx < tdb->file->num_lockrecs) {
+		if (tdb->file->lockrecs[idx].owner == tdb) {
+			tdb_nest_unlock(tdb,
+					tdb->file->lockrecs[idx].off,
+					tdb->file->lockrecs[idx].ltype);
+		} else {
+			idx++;
+		}
+	}
+}
diff --git a/lib/tdb2/open.c b/lib/tdb2/open.c
new file mode 100644
index 00000000000..c35598cdcce
--- /dev/null
+++ b/lib/tdb2/open.c
@@ -0,0 +1,661 @@
+#include "private.h"
+#include <ccan/hash/hash.h>
+#include <assert.h>
+
+/* all lock info, to detect double-opens (fcntl locks don't nest!) */
+static struct tdb_file *files = NULL;
+
+/*
+ * Look for an already-open file with the given device and inode in the
+ * "files" list.  On a match its refcnt is incremented and the entry is
+ * returned; otherwise NULL is returned.
+ */
+static struct tdb_file *find_file(dev_t device, ino_t ino)
+{
+	struct tdb_file *f = files;
+
+	while (f != NULL && !(f->device == device && f->inode == ino))
+		f = f->next;
+
+	if (f != NULL)
+		f->refcnt++;
+	return f;
+}
+
+/*
+ * Read exactly len bytes from fd into buf, looping over short reads.
+ *
+ * Returns true once all bytes have arrived.  Returns false with errno
+ * left as set by read() on a read error, or with errno set to
+ * EWOULDBLOCK if end-of-file is reached first.  A read interrupted by a
+ * signal (EINTR) is retried rather than reported as a failure, in line
+ * with the fcntl lock wrappers in this patch.
+ */
+static bool read_all(int fd, void *buf, size_t len)
+{
+	char *dst = buf;
+
+	while (len) {
+		ssize_t ret = read(fd, dst, len);
+
+		if (ret < 0) {
+			if (errno == EINTR)
+				continue;
+			return false;
+		}
+		if (ret == 0) {
+			/* ETOOSHORT? */
+			errno = EWOULDBLOCK;
+			return false;
+		}
+		dst += ret;
+		len -= ret;
+	}
+	return true;
+}
+
+/*
+ * Produce a 64-bit random value, trying three sources in order:
+ * /dev/urandom, then an EGD socket at /dev/egd-pool, then a mix of pid
+ * and current time.  A warning is logged when the last one is used.
+ */
+static uint64_t random_number(struct tdb_context *tdb)
+{
+	uint64_t value = 0;
+	struct timeval tv;
+	int fd;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd >= 0) {
+		bool ok = read_all(fd, &value, sizeof(value));
+
+		close(fd);
+		if (ok)
+			return value;
+	}
+
+	/* FIXME: Untested! Based on Wikipedia protocol description! */
+	fd = open("/dev/egd-pool", O_RDWR);
+	if (fd >= 0) {
+		/* Command is 1, next byte is size we want to read. */
+		char cmd[2] = { 1, sizeof(uint64_t) };
+		bool complete = false;
+
+		if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
+			char reply[1 + sizeof(uint64_t)];
+			int r = read(fd, reply, sizeof(reply));
+
+			if (r > 1) {
+				/* Copy at least some bytes. */
+				memcpy(&value, reply+1, r - 1);
+				complete = (reply[0] == sizeof(uint64_t)
+					    && r == sizeof(reply));
+			}
+		}
+		close(fd);
+		if (complete)
+			return value;
+	}
+
+	/* Fallback: pid and time. */
+	gettimeofday(&tv, NULL);
+	value = getpid() * 100132289ULL + tv.tv_sec * 1000000ULL + tv.tv_usec;
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+		   "tdb_open: random from getpid and time");
+	return value;
+}
+
+/* In-memory image of a freshly created database: the header immediately
+ * followed by one free table. tdb_new_database() fills it in and either
+ * copies it into memory or writes it to the file. */
+struct new_database {
+ struct tdb_header hdr;
+ struct tdb_freetable ftable;
+};
+
+/* initialise a new database
+ *
+ * Builds a struct new_database in memory, using seed->seed as the hash
+ * seed when "seed" is given and random_number() otherwise, and copies the
+ * resulting header to *hdr. For TDB_INTERNAL databases the image is copied
+ * into a malloc'ed buffer that becomes map_ptr; otherwise the file is
+ * truncated and the image written at offset 0.
+ *
+ * Returns TDB_SUCCESS, or an error from set_header(), TDB_ERR_OOM, or
+ * TDB_ERR_IO for seek/truncate/write failures. */
+static enum TDB_ERROR tdb_new_database(struct tdb_context *tdb,
+ struct tdb_attribute_seed *seed,
+ struct tdb_header *hdr)
+{
+ /* We make it up in memory, then write it out if not internal */
+ struct new_database newdb;
+ unsigned int magic_len;
+ ssize_t rlen;
+ enum TDB_ERROR ecode;
+
+ /* Fill in the header */
+ newdb.hdr.version = TDB_VERSION;
+ if (seed)
+ newdb.hdr.hash_seed = seed->seed;
+ else
+ newdb.hdr.hash_seed = random_number(tdb);
+ /* hash_test starts as the magic value and is then replaced by the
+ * hash of that value under the chosen seed. */
+ newdb.hdr.hash_test = TDB_HASH_MAGIC;
+ newdb.hdr.hash_test = tdb->hash_fn(&newdb.hdr.hash_test,
+ sizeof(newdb.hdr.hash_test),
+ newdb.hdr.hash_seed,
+ tdb->hash_data);
+ newdb.hdr.recovery = 0;
+ newdb.hdr.features_used = newdb.hdr.features_offered = TDB_FEATURE_MASK;
+ newdb.hdr.seqnum = 0;
+ memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
+ /* Initial hashes are empty. */
+ memset(newdb.hdr.hashtable, 0, sizeof(newdb.hdr.hashtable));
+
+ /* Free is empty. */
+ newdb.hdr.free_table = offsetof(struct new_database, ftable);
+ memset(&newdb.ftable, 0, sizeof(newdb.ftable));
+ ecode = set_header(NULL, &newdb.ftable.hdr, TDB_FTABLE_MAGIC, 0,
+ sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+ sizeof(newdb.ftable) - sizeof(newdb.ftable.hdr),
+ 0);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* Magic food */
+ memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
+ strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
+
+ /* This creates an endian-converted database, as if read from disk */
+ /* Everything after the magic_food string is passed to tdb_convert(). */
+ magic_len = sizeof(newdb.hdr.magic_food);
+ tdb_convert(tdb,
+ (char *)&newdb.hdr + magic_len, sizeof(newdb) - magic_len);
+
+ /* The caller gets the header as it will appear on disk. */
+ *hdr = newdb.hdr;
+
+ if (tdb->flags & TDB_INTERNAL) {
+ tdb->file->map_size = sizeof(newdb);
+ tdb->file->map_ptr = malloc(tdb->file->map_size);
+ if (!tdb->file->map_ptr) {
+ return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+ "tdb_new_database:"
+ " failed to allocate");
+ }
+ memcpy(tdb->file->map_ptr, &newdb, tdb->file->map_size);
+ return TDB_SUCCESS;
+ }
+ if (lseek(tdb->file->fd, 0, SEEK_SET) == -1) {
+ return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+ "tdb_new_database:"
+ " failed to seek: %s", strerror(errno));
+ }
+
+ if (ftruncate(tdb->file->fd, 0) == -1) {
+ return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+ "tdb_new_database:"
+ " failed to truncate: %s", strerror(errno));
+ }
+
+ rlen = write(tdb->file->fd, &newdb, sizeof(newdb));
+ if (rlen != sizeof(newdb)) {
+ /* A short write leaves errno unset, so report it as ENOSPC. */
+ if (rlen >= 0)
+ errno = ENOSPC;
+ return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+ "tdb_new_database: %zi writing header: %s",
+ rlen, strerror(errno));
+ }
+ return TDB_SUCCESS;
+}
+
+/*
+ * Allocate tdb->file and initialise its lock bookkeeping: no lock
+ * records, no allrecord lock, and a reference count of 1.  Other members
+ * are left for the caller to set.  Returns TDB_SUCCESS, or logs and
+ * returns an out-of-memory error.
+ */
+static enum TDB_ERROR tdb_new_file(struct tdb_context *tdb)
+{
+	tdb->file = malloc(sizeof(*tdb->file));
+	if (tdb->file == NULL) {
+		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				  "tdb_open: cannot alloc tdb_file structure");
+	}
+
+	tdb->file->refcnt = 1;
+	tdb->file->allrecord_lock.count = 0;
+	tdb->file->lockrecs = NULL;
+	tdb->file->num_lockrecs = 0;
+	return TDB_SUCCESS;
+}
+
+/*
+ * Apply one attribute to an open context.
+ *
+ * TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK are stored and TDB_SUCCESS is
+ * returned.  HASH, SEED, OPENHOOK, STATS and unknown attribute types are
+ * rejected: the error is logged, stored in tdb->last_error and returned.
+ */
+enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
+				 const union tdb_attribute *attr)
+{
+	const char *which;
+
+	switch (attr->base.attr) {
+	case TDB_ATTRIBUTE_LOG:
+		tdb->log_fn = attr->log.fn;
+		tdb->log_data = attr->log.data;
+		return TDB_SUCCESS;
+
+	case TDB_ATTRIBUTE_FLOCK:
+		tdb->lock_fn = attr->flock.lock;
+		tdb->unlock_fn = attr->flock.unlock;
+		tdb->lock_data = attr->flock.data;
+		return TDB_SUCCESS;
+
+	case TDB_ATTRIBUTE_HASH:
+	case TDB_ATTRIBUTE_SEED:
+	case TDB_ATTRIBUTE_OPENHOOK:
+		/* These can only be chosen when the database is opened. */
+		if (attr->base.attr == TDB_ATTRIBUTE_HASH)
+			which = "TDB_ATTRIBUTE_HASH";
+		else if (attr->base.attr == TDB_ATTRIBUTE_SEED)
+			which = "TDB_ATTRIBUTE_SEED";
+		else
+			which = "TDB_ATTRIBUTE_OPENHOOK";
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_set_attribute:"
+					     " cannot set %s after opening",
+					     which);
+		return tdb->last_error;
+
+	case TDB_ATTRIBUTE_STATS:
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_set_attribute:"
+					     " cannot set TDB_ATTRIBUTE_STATS");
+		return tdb->last_error;
+
+	default:
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_set_attribute:"
+					     " unknown attribute type %u",
+					     attr->base.attr);
+		return tdb->last_error;
+	}
+}
+
+/*
+ * Default hash function: hash64_stable() over the key bytes with the
+ * given seed, with the two 32-bit halves of the result exchanged.
+ * "unused" is ignored.
+ */
+static uint64_t jenkins_hash(const void *key, size_t length, uint64_t seed,
+			     void *unused)
+{
+	/* hash64_stable assumes lower bits are more important; they are a
+	 * slightly better hash. We use the upper bits first, so swap them. */
+	const uint64_t h = hash64_stable((const unsigned char *)key,
+					 length, seed);
+
+	return (h << 32) | (h >> 32);
+}
+
+/*
+ * Fill *attr with the current value of the attribute named by
+ * attr->base.attr.
+ *
+ * On success attr->base.next is set to NULL and TDB_SUCCESS is returned.
+ * TDB_ATTRIBUTE_LOG fails with TDB_ERR_NOEXIST when no log function is
+ * set.  TDB_ATTRIBUTE_OPENHOOK and unknown types are logged and rejected.
+ * Every failure is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
+				 union tdb_attribute *attr)
+{
+	switch (attr->base.attr) {
+	case TDB_ATTRIBUTE_LOG:
+		if (tdb->log_fn == NULL) {
+			tdb->last_error = TDB_ERR_NOEXIST;
+			return tdb->last_error;
+		}
+		attr->log.fn = tdb->log_fn;
+		attr->log.data = tdb->log_data;
+		break;
+
+	case TDB_ATTRIBUTE_HASH:
+		attr->hash.fn = tdb->hash_fn;
+		attr->hash.data = tdb->hash_data;
+		break;
+
+	case TDB_ATTRIBUTE_SEED:
+		attr->seed.seed = tdb->hash_seed;
+		break;
+
+	case TDB_ATTRIBUTE_OPENHOOK:
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_get_attribute:"
+					     " cannot get TDB_ATTRIBUTE_OPENHOOK");
+		return tdb->last_error;
+
+	case TDB_ATTRIBUTE_STATS: {
+		/* Copy no more than the smaller of the caller's stated
+		 * size and our own; the copy starts at the beginning of
+		 * the stats structure. */
+		size_t nbytes = attr->stats.size;
+
+		if (nbytes > tdb->stats.size)
+			nbytes = tdb->stats.size;
+		memcpy(&attr->stats, &tdb->stats, nbytes);
+		break;
+	}
+
+	case TDB_ATTRIBUTE_FLOCK:
+		attr->flock.lock = tdb->lock_fn;
+		attr->flock.unlock = tdb->unlock_fn;
+		attr->flock.data = tdb->lock_data;
+		break;
+
+	default:
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_get_attribute:"
+					     " unknown attribute type %u",
+					     attr->base.attr);
+		return tdb->last_error;
+	}
+
+	attr->base.next = NULL;
+	return TDB_SUCCESS;
+}
+
+/*
+ * Remove an attribute from an open context.
+ *
+ * TDB_ATTRIBUTE_LOG clears the log function; TDB_ATTRIBUTE_FLOCK restores
+ * the default fcntl lock and unlock functions.  HASH, SEED, OPENHOOK,
+ * STATS and unknown types cannot be unset and only produce a logged usage
+ * error.  Nothing is returned.
+ */
+void tdb_unset_attribute(struct tdb_context *tdb,
+			 enum tdb_attribute_type type)
+{
+	switch (type) {
+	case TDB_ATTRIBUTE_LOG:
+		tdb->log_fn = NULL;
+		break;
+	case TDB_ATTRIBUTE_HASH:
+	case TDB_ATTRIBUTE_SEED:
+	case TDB_ATTRIBUTE_OPENHOOK:
+		tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+			   "tdb_unset_attribute: cannot unset %s after opening",
+			   type == TDB_ATTRIBUTE_HASH
+			   ? "TDB_ATTRIBUTE_HASH"
+			   : type == TDB_ATTRIBUTE_SEED
+			   ? "TDB_ATTRIBUTE_SEED"
+			   : "TDB_ATTRIBUTE_OPENHOOK");
+		break;
+	case TDB_ATTRIBUTE_STATS:
+		/* The second literal needs its leading space: the two are
+		 * concatenated, and without it the message ran together
+		 * as "tdb_unset_attribute:cannot unset ...". */
+		tdb_logerr(tdb, TDB_ERR_EINVAL,
+			   TDB_LOG_USE_ERROR,
+			   "tdb_unset_attribute:"
+			   " cannot unset TDB_ATTRIBUTE_STATS");
+		break;
+	case TDB_ATTRIBUTE_FLOCK:
+		tdb->lock_fn = tdb_fcntl_lock;
+		tdb->unlock_fn = tdb_fcntl_unlock;
+		break;
+	default:
+		tdb_logerr(tdb, TDB_ERR_EINVAL,
+			   TDB_LOG_USE_ERROR,
+			   "tdb_unset_attribute: unknown attribute type %u",
+			   type);
+	}
+}
+
+/*
+ * Open a tdb, creating it first if open_flags allows.
+ *
+ * name:       path of the database.  May be NULL (no name is stored); for
+ *             non-internal databases it is passed to stat(2) and open(2).
+ * tdb_flags:  TDB_* flags; any bit outside the known set is rejected.
+ * open_flags, mode: as for open(2).  O_WRONLY is rejected.
+ * attr:       optional linked list of attributes.  HASH, SEED and OPENHOOK
+ *             are consumed here, everything else goes to tdb_set_attribute().
+ *
+ * Returns the new context, or NULL with errno set on failure.
+ */
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+			     int open_flags, mode_t mode,
+			     union tdb_attribute *attr)
+{
+	struct tdb_context *tdb;
+	struct stat st;
+	int saved_errno = 0;
+	uint64_t hash_test;
+	unsigned v;
+	ssize_t rlen;
+	struct tdb_header hdr;
+	struct tdb_attribute_seed *seed = NULL;
+	struct tdb_attribute_openhook *openhook = NULL;
+	tdb_bool_err berr;
+	enum TDB_ERROR ecode;
+	int openlock;
+
+	/* The name (if any) lives in the same allocation, after the struct. */
+	tdb = malloc(sizeof(*tdb) + (name ? strlen(name) + 1 : 0));
+	if (!tdb) {
+		/* Can't log this */
+		errno = ENOMEM;
+		return NULL;
+	}
+	/* Set name immediately for logging functions. */
+	if (name) {
+		tdb->name = strcpy((char *)(tdb + 1), name);
+	} else {
+		tdb->name = NULL;
+	}
+	tdb->direct_access = 0;
+	tdb->flags = tdb_flags;
+	tdb->log_fn = NULL;
+	tdb->log_data = NULL;
+	tdb->transaction = NULL;
+	tdb->access = NULL;
+	tdb->last_error = TDB_SUCCESS;
+	tdb->file = NULL;
+	tdb->lock_fn = tdb_fcntl_lock;
+	tdb->unlock_fn = tdb_fcntl_unlock;
+	tdb->lock_data = NULL;
+	tdb->hash_fn = jenkins_hash;
+	/* Don't hand an indeterminate pointer to the default callbacks. */
+	tdb->hash_data = NULL;
+	memset(&tdb->stats, 0, sizeof(tdb->stats));
+	tdb->stats.base.attr = TDB_ATTRIBUTE_STATS;
+	tdb->stats.size = sizeof(tdb->stats);
+	tdb_io_init(tdb);
+
+	while (attr) {
+		switch (attr->base.attr) {
+		case TDB_ATTRIBUTE_HASH:
+			tdb->hash_fn = attr->hash.fn;
+			tdb->hash_data = attr->hash.data;
+			break;
+		case TDB_ATTRIBUTE_SEED:
+			seed = &attr->seed;
+			break;
+		case TDB_ATTRIBUTE_OPENHOOK:
+			openhook = &attr->openhook;
+			break;
+		default:
+			/* These are set as normal. */
+			ecode = tdb_set_attribute(tdb, attr);
+			if (ecode != TDB_SUCCESS)
+				goto fail;
+		}
+		attr = attr->base.next;
+	}
+
+	if (tdb_flags & ~(TDB_INTERNAL | TDB_NOLOCK | TDB_NOMMAP | TDB_CONVERT
+			  | TDB_NOSYNC | TDB_SEQNUM | TDB_ALLOW_NESTING)) {
+		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+				   "tdb_open: unknown flags %u", tdb_flags);
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_WRONLY) {
+		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+				   "tdb_open: can't open tdb %s write-only",
+				   name);
+		goto fail;
+	}
+
+	if ((open_flags & O_ACCMODE) == O_RDONLY) {
+		tdb->read_only = true;
+		tdb->mmap_flags = PROT_READ;
+		openlock = F_RDLCK;
+	} else {
+		tdb->read_only = false;
+		tdb->mmap_flags = PROT_READ | PROT_WRITE;
+		openlock = F_WRLCK;
+	}
+
+	/* internal databases don't need any of the rest. */
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
+		ecode = tdb_new_file(tdb);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+		tdb->file->fd = -1;
+		ecode = tdb_new_database(tdb, seed, &hdr);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+		tdb_convert(tdb, &hdr.hash_seed, sizeof(hdr.hash_seed));
+		tdb->hash_seed = hdr.hash_seed;
+		/* Check this just as the file-backed path below does. */
+		ecode = tdb_ftable_init(tdb);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+		return tdb;
+	}
+
+	/* Share the tdb_file if this device/inode is already open. */
+	if (stat(name, &st) != -1)
+		tdb->file = find_file(st.st_dev, st.st_ino);
+
+	if (!tdb->file) {
+		int fd;
+
+		if ((fd = open(name, open_flags, mode)) == -1) {
+			/* errno set by open(2) */
+			saved_errno = errno;
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: could not open file %s: %s",
+				   name, strerror(errno));
+			goto fail_errno;
+		}
+
+		/* on exec, don't inherit the fd */
+		v = fcntl(fd, F_GETFD, 0);
+		fcntl(fd, F_SETFD, v | FD_CLOEXEC);
+
+		if (fstat(fd, &st) == -1) {
+			saved_errno = errno;
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: could not stat open %s: %s",
+				   name, strerror(errno));
+			close(fd);
+			goto fail_errno;
+		}
+
+		ecode = tdb_new_file(tdb);
+		if (ecode != TDB_SUCCESS) {
+			close(fd);
+			goto fail;
+		}
+
+		tdb->file->next = files;
+		tdb->file->fd = fd;
+		tdb->file->device = st.st_dev;
+		tdb->file->inode = st.st_ino;
+		tdb->file->map_ptr = NULL;
+		tdb->file->map_size = sizeof(struct tdb_header);
+	}
+
+	/* ensure there is only one process initialising at once */
+	ecode = tdb_lock_open(tdb, openlock, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
+	if (ecode != TDB_SUCCESS) {
+		saved_errno = errno;
+		goto fail_errno;
+	}
+
+	/* call their open hook if they gave us one. */
+	if (openhook) {
+		ecode = openhook->fn(tdb->file->fd, openhook->data);
+		if (ecode != TDB_SUCCESS) {
+			tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				   "tdb_open: open hook failed");
+			goto fail;
+		}
+		open_flags |= O_CREAT;
+	}
+
+	/* If they used O_TRUNC, read will return 0. */
+	rlen = pread(tdb->file->fd, &hdr, sizeof(hdr), 0);
+	if (rlen == 0 && (open_flags & O_CREAT)) {
+		ecode = tdb_new_database(tdb, seed, &hdr);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+	} else if (rlen < 0) {
+		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: error %s reading %s",
+				   strerror(errno), name);
+		goto fail;
+	} else if ((size_t)rlen < sizeof(hdr)
+		   || strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0) {
+		/* rlen is known to be >= 0 here, so the cast is safe. */
+		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open: %s is not a tdb file", name);
+		goto fail;
+	}
+
+	if (hdr.version != TDB_VERSION) {
+		/* A byte-swapped version means the other endianness. */
+		if (hdr.version == bswap_64(TDB_VERSION))
+			tdb->flags |= TDB_CONVERT;
+		else {
+			/* wrong version */
+			ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					   "tdb_open:"
+					   " %s is unknown version 0x%llx",
+					   name, (long long)hdr.version);
+			goto fail;
+		}
+	}
+
+	tdb_convert(tdb, &hdr, sizeof(hdr));
+	tdb->hash_seed = hdr.hash_seed;
+	hash_test = TDB_HASH_MAGIC;
+	hash_test = tdb_hash(tdb, &hash_test, sizeof(hash_test));
+	if (hdr.hash_test != hash_test) {
+		/* wrong hash variant */
+		ecode = tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_open:"
+				   " %s uses a different hash function",
+				   name);
+		goto fail;
+	}
+
+	/* Clear any features we don't understand. */
+	if ((open_flags & O_ACCMODE) != O_RDONLY) {
+		hdr.features_used &= TDB_FEATURE_MASK;
+		/*
+		 * tdb_write_convert() returns an enum TDB_ERROR: keep it so
+		 * that every failure is caught and the fail path maps the
+		 * real error code to errno.
+		 */
+		ecode = tdb_write_convert(tdb, offsetof(struct tdb_header,
+							 features_used),
+					  &hdr.features_used,
+					  sizeof(hdr.features_used));
+		if (ecode != TDB_SUCCESS)
+			goto fail;
+	}
+
+	tdb_unlock_open(tdb, openlock);
+
+	/* This makes sure we have current map_size and mmap. */
+	tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+
+	/* Now it's fully formed, recover if necessary. */
+	berr = tdb_needs_recovery(tdb);
+	if (unlikely(berr != false)) {
+		if (berr < 0) {
+			ecode = berr;
+			goto fail;
+		}
+		ecode = tdb_lock_and_recover(tdb);
+		if (ecode != TDB_SUCCESS) {
+			goto fail;
+		}
+	}
+
+	ecode = tdb_ftable_init(tdb);
+	if (ecode != TDB_SUCCESS) {
+		goto fail;
+	}
+
+	/* Add to linked list if we're new. */
+	if (tdb->file->refcnt == 1)
+		files = tdb->file;
+	return tdb;
+
+ fail:
+	/* Map ecode to some logical errno. */
+	switch (ecode) {
+	case TDB_ERR_CORRUPT:
+	case TDB_ERR_IO:
+		saved_errno = EIO;
+		break;
+	case TDB_ERR_LOCK:
+		saved_errno = EWOULDBLOCK;
+		break;
+	case TDB_ERR_OOM:
+		saved_errno = ENOMEM;
+		break;
+	case TDB_ERR_EINVAL:
+		saved_errno = EINVAL;
+		break;
+	default:
+		saved_errno = EINVAL;
+		break;
+	}
+
+fail_errno:
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	if (tdb->file) {
+		tdb_lock_cleanup(tdb);
+		if (--tdb->file->refcnt == 0) {
+			assert(tdb->file->num_lockrecs == 0);
+			if (tdb->file->map_ptr) {
+				if (tdb->flags & TDB_INTERNAL) {
+					free(tdb->file->map_ptr);
+				} else
+					tdb_munmap(tdb->file);
+			}
+			/* Internal databases have no fd (-1) to close. */
+			if (tdb->file->fd != -1
+			    && close(tdb->file->fd) != 0)
+				tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					   "tdb_open: failed to close tdb fd"
+					   " on error: %s", strerror(errno));
+			free(tdb->file->lockrecs);
+			free(tdb->file);
+		}
+	}
+
+	free(tdb);
+	errno = saved_errno;
+	return NULL;
+}
+
+/*
+ * Close a tdb and free the context.
+ *
+ * Any transaction in progress is cancelled.  The underlying tdb_file is
+ * shared between contexts: its descriptor is closed and the structure freed
+ * only when the last reference is dropped.
+ *
+ * Returns the result of close(2) on the descriptor if this was the last
+ * reference to a file-backed database, otherwise 0.
+ */
+ int tdb_close(struct tdb_context *tdb)
+{
+	int ret = 0;
+
+	tdb_trace(tdb, "tdb_close");
+
+	if (tdb->transaction) {
+		tdb_transaction_cancel(tdb);
+	}
+
+	if (tdb->file) {
+		struct tdb_file **i;
+
+		/* Only look inside tdb->file once we know it is non-NULL. */
+		if (tdb->file->map_ptr) {
+			if (tdb->flags & TDB_INTERNAL)
+				free(tdb->file->map_ptr);
+			else
+				tdb_munmap(tdb->file);
+		}
+
+		tdb_lock_cleanup(tdb);
+		if (--tdb->file->refcnt == 0) {
+			/*
+			 * TDB_INTERNAL databases have fd == -1; calling
+			 * close() on that would fail with EBADF and make a
+			 * clean close look like an error.
+			 */
+			if (tdb->file->fd != -1)
+				ret = close(tdb->file->fd);
+
+			/* Remove from files list */
+			for (i = &files; *i; i = &(*i)->next) {
+				if (*i == tdb->file) {
+					*i = tdb->file->next;
+					break;
+				}
+			}
+			free(tdb->file->lockrecs);
+			free(tdb->file);
+		}
+	}
+
+#ifdef TDB_TRACE
+	close(tdb->tracefd);
+#endif
+	free(tdb);
+
+	return ret;
+}
diff --git a/lib/tdb2/private.h b/lib/tdb2/private.h
new file mode 100644
index 00000000000..135e3df9364
--- /dev/null
+++ b/lib/tdb2/private.h
@@ -0,0 +1,614 @@
+#ifndef TDB_PRIVATE_H
+#define TDB_PRIVATE_H
+ /*
+ Trivial Database 2: private types and prototypes
+ Copyright (C) Rusty Russell 2010
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "config.h"
+#if HAVE_FILE_OFFSET_BITS
+#define _FILE_OFFSET_BITS 64
+#endif
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <utime.h>
+#include <unistd.h>
+#include <ccan/tdb2/tdb2.h>
+#include <ccan/likely/likely.h>
+#include <ccan/compiler/compiler.h>
+#include <ccan/endian/endian.h>
+
+#ifndef TEST_IT
+#define TEST_IT(cond)
+#endif
+
+/* #define TDB_TRACE 1 */
+
+#ifndef __STRING
+#define __STRING(x) #x
+#endif
+
+#ifndef __STRINGSTRING
+#define __STRINGSTRING(x) __STRING(x)
+#endif
+
+#ifndef __location__
+#define __location__ __FILE__ ":" __STRINGSTRING(__LINE__)
+#endif
+
+typedef uint64_t tdb_len_t;
+typedef uint64_t tdb_off_t;
+
+#define TDB_MAGIC_FOOD "TDB file\n"
+#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
+#define TDB_USED_MAGIC ((uint64_t)0x1999)
+#define TDB_HTABLE_MAGIC ((uint64_t)0x1888)
+#define TDB_CHAIN_MAGIC ((uint64_t)0x1777)
+#define TDB_FTABLE_MAGIC ((uint64_t)0x1666)
+#define TDB_FREE_MAGIC ((uint64_t)0xFE)
+#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
+#define TDB_RECOVERY_MAGIC (0xf53bc0e7ad124589ULL)
+#define TDB_RECOVERY_INVALID_MAGIC (0x0ULL)
+
+#define TDB_OFF_IS_ERR(off) unlikely(off >= (tdb_off_t)TDB_ERR_LAST)
+
+/* Packing errors into pointers and v.v. */
+#define TDB_PTR_IS_ERR(ptr) \
+ unlikely((unsigned long)(ptr) >= (unsigned long)TDB_ERR_LAST)
+#define TDB_PTR_ERR(p) ((enum TDB_ERROR)(long)(p))
+#define TDB_ERR_PTR(err) ((void *)(long)(err))
+
+/* Common case of returning true, false or -ve error. */
+typedef int tdb_bool_err;
+
+/* Prevent others from opening the file. */
+#define TDB_OPEN_LOCK 0
+/* Doing a transaction. */
+#define TDB_TRANSACTION_LOCK 1
+/* Expanding file. */
+#define TDB_EXPANSION_LOCK 2
+/* Hash chain locks. */
+#define TDB_HASH_LOCK_START 64
+
+/* Range for hash locks. */
+#define TDB_HASH_LOCK_RANGE_BITS 30
+#define TDB_HASH_LOCK_RANGE (1 << TDB_HASH_LOCK_RANGE_BITS)
+
+/* We have 1024 entries in the top level. */
+#define TDB_TOPLEVEL_HASH_BITS 10
+/* And 64 entries in each sub-level: thus 64 bits exactly after 9 levels. */
+#define TDB_SUBLEVEL_HASH_BITS 6
+/* And 8 entries in each group, ie 8 groups per sublevel. */
+#define TDB_HASH_GROUP_BITS 3
+/* This is currently 10: beyond this we chain. */
+#define TDB_MAX_LEVELS (1+(64-TDB_TOPLEVEL_HASH_BITS) / TDB_SUBLEVEL_HASH_BITS)
+
+/* Extend file by at least 100 times what is needed. */
+#define TDB_EXTENSION_FACTOR 100
+
+/* We steal bits from the offsets to store hash info. */
+#define TDB_OFF_HASH_GROUP_MASK ((1ULL << TDB_HASH_GROUP_BITS) - 1)
+/* We steal this many upper bits, giving a maximum offset of 64 petabytes (2^56). */
+#define TDB_OFF_UPPER_STEAL 8
+#define TDB_OFF_UPPER_STEAL_EXTRA 7
+/* The bit number where we store extra hash bits. */
+#define TDB_OFF_HASH_EXTRA_BIT 57
+#define TDB_OFF_UPPER_STEAL_SUBHASH_BIT 56
+
+/* Additional features we understand. Currently: none. */
+#define TDB_FEATURE_MASK ((uint64_t)0)
+
+/* The bit number where we store the extra hash bits. */
+/* Convenience mask to get actual offset. */
+#define TDB_OFF_MASK \
+ (((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1) - TDB_OFF_HASH_GROUP_MASK)
+
+/* How many buckets in a free list: see size_to_bucket(). */
+#define TDB_FREE_BUCKETS (64 - TDB_OFF_UPPER_STEAL)
+
+/* We have to be able to fit a free record here. */
+#define TDB_MIN_DATA_LEN \
+ (sizeof(struct tdb_free_record) - sizeof(struct tdb_used_record))
+
+/* Indicates this entry is not on an flist (can happen during coalescing) */
+#define TDB_FTABLE_NONE ((1ULL << TDB_OFF_UPPER_STEAL) - 1)
+
+/* Header at the start of an in-use record.  set_header() fills it in for
+ * used, ftable, htable and chain records; the rec_*() helpers below unpack
+ * the individual fields. */
+struct tdb_used_record {
+ /* For on-disk compatibility, we avoid bitfields:
+ magic: 16, (highest)
+ key_len_bits: 5,
+ extra_padding: 32
+ hash_bits: 11
+ */
+ uint64_t magic_and_meta;
+ /* The bottom key_len_bits*2 are key length, rest is data length. */
+ uint64_t key_and_data_len;
+};
+
+/* How many low bits of key_and_data_len hold the key length: twice the
+ * 5-bit key_len_bits field stored at bit 43 of magic_and_meta. */
+static inline unsigned rec_key_bits(const struct tdb_used_record *r)
+{
+	const uint64_t key_len_bits = (r->magic_and_meta >> 43) & 0x1F;
+
+	return key_len_bits * 2;
+}
+
+/* Key length: the bottom rec_key_bits() bits of key_and_data_len. */
+static inline uint64_t rec_key_length(const struct tdb_used_record *r)
+{
+	const unsigned nbits = rec_key_bits(r);
+	const uint64_t key_mask = (1ULL << nbits) - 1;
+
+	return r->key_and_data_len & key_mask;
+}
+
+/* Data length: whatever sits above the key-length bits of key_and_data_len. */
+static inline uint64_t rec_data_length(const struct tdb_used_record *r)
+{
+	const unsigned nbits = rec_key_bits(r);
+
+	return r->key_and_data_len >> nbits;
+}
+
+/* Extra padding: the 32-bit field just above the 11 hash bits. */
+static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
+{
+	const uint64_t above_hash = r->magic_and_meta >> 11;
+
+	return above_hash & 0xFFFFFFFFULL;
+}
+
+/* The hash bits kept in the record header: the low 11 bits of magic_and_meta. */
+static inline uint32_t rec_hash(const struct tdb_used_record *r)
+{
+	const uint64_t hash_mask = (1ULL << 11) - 1;
+
+	return r->magic_and_meta & hash_mask;
+}
+
+/* Record magic: the top 16 bits of magic_and_meta. */
+static inline uint16_t rec_magic(const struct tdb_used_record *r)
+{
+	const uint64_t top = r->magic_and_meta >> 48;
+
+	return (uint16_t)top;
+}
+
+/* Header of a free record; the frec_*() helpers below unpack the packed
+ * fields.  The top TDB_OFF_UPPER_STEAL bits of the first two words carry the
+ * magic and the free table number respectively. */
+struct tdb_free_record {
+ uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
+ uint64_t ftable_and_len; /* Len not counting these two fields. */
+ /* This is why the minimum record size is 8 bytes. */
+ uint64_t next;
+};
+
+/* The prev value: magic_and_prev with the stolen upper (magic) bits removed. */
+static inline uint64_t frec_prev(const struct tdb_free_record *f)
+{
+	const uint64_t low_mask = (1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1;
+
+	return f->magic_and_prev & low_mask;
+}
+
+/* Free-record magic: the top TDB_OFF_UPPER_STEAL bits of magic_and_prev. */
+static inline uint64_t frec_magic(const struct tdb_free_record *f)
+{
+	const unsigned shift = 64 - TDB_OFF_UPPER_STEAL;
+
+	return f->magic_and_prev >> shift;
+}
+
+/* Length of the free record: ftable_and_len without the stolen upper bits. */
+static inline uint64_t frec_len(const struct tdb_free_record *f)
+{
+	const uint64_t low_mask = (1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1;
+
+	return f->ftable_and_len & low_mask;
+}
+
+/* Free table number: the top TDB_OFF_UPPER_STEAL bits of ftable_and_len. */
+static inline unsigned frec_ftable(const struct tdb_free_record *f)
+{
+	const unsigned shift = 64 - TDB_OFF_UPPER_STEAL;
+
+	return f->ftable_and_len >> shift;
+}
+
+/* Header of a recovery record.  NOTE(review): presumably this is what
+ * tdb_header.recovery ("Transaction recovery area") points at - confirm
+ * against transaction.c. */
+struct tdb_recovery_record {
+ /* TDB_RECOVERY_MAGIC or TDB_RECOVERY_INVALID_MAGIC (see summarize()). */
+ uint64_t magic;
+ /* Length of record (add this header to get total length). */
+ uint64_t max_len;
+ /* Length used. */
+ uint64_t len;
+ /* Old length of file before transaction. */
+ uint64_t eof;
+};
+
+/* If we bottom out of the subhashes, we chain. */
+struct tdb_chain {
+ /* One hash group's worth (1 << TDB_HASH_GROUP_BITS) of offsets. */
+ tdb_off_t rec[1 << TDB_HASH_GROUP_BITS];
+ /* NOTE(review): presumably the offset of the next chain - confirm in hash.c. */
+ tdb_off_t next;
+};
+
+/* this is stored at the front of every database */
+/* tdb_open() reads it with pread() at offset 0, checks magic_food against
+ * TDB_MAGIC_FOOD, and accepts a version equal to TDB_VERSION or to its
+ * byte-swapped value (in which case TDB_CONVERT is turned on). */
+struct tdb_header {
+ char magic_food[64]; /* for /etc/magic */
+ /* FIXME: Make me 32 bit? */
+ uint64_t version; /* version of the code */
+ uint64_t hash_test; /* result of hashing HASH_MAGIC. */
+ uint64_t hash_seed; /* "random" seed written at creation time. */
+ tdb_off_t free_table; /* (First) free table. */
+ tdb_off_t recovery; /* Transaction recovery area. */
+
+ uint64_t features_used; /* Features all writers understand */
+ uint64_t features_offered; /* Features offered */
+
+ uint64_t seqnum; /* Sequence number for TDB_SEQNUM */
+
+ tdb_off_t reserved[23];
+
+ /* Top level hash table. */
+ tdb_off_t hashtable[1ULL << TDB_TOPLEVEL_HASH_BITS];
+};
+
+/* A free table: a record header followed by TDB_FREE_BUCKETS bucket
+ * offsets (see size_to_bucket() and bucket_off()). */
+struct tdb_freetable {
+ struct tdb_used_record hdr;
+ /* NOTE(review): presumably the offset of the next free table, as walked
+ * by next_ftable() - confirm in free.c. */
+ tdb_off_t next;
+ tdb_off_t buckets[TDB_FREE_BUCKETS];
+};
+
+/* Information about a particular (locked) hash entry. */
+/* Filled in by find_and_lock() and then handed to replace_in_hash(),
+ * add_to_hash() or delete_from_hash(). */
+struct hash_info {
+ /* Full hash value of entry. */
+ uint64_t h;
+ /* Start and length of lock acquired. */
+ tdb_off_t hlock_start;
+ tdb_len_t hlock_range;
+ /* Start of hash group. */
+ tdb_off_t group_start;
+ /* Bucket we belong in. */
+ unsigned int home_bucket;
+ /* Bucket we (or an empty space) were found in. */
+ unsigned int found_bucket;
+ /* How many bits of the hash are already used. */
+ unsigned int hash_used;
+ /* Current working group. */
+ tdb_off_t group[1 << TDB_HASH_GROUP_BITS];
+};
+
+/* Traversal state passed to first_in_hash() and next_in_hash();
+ * find_and_lock() also takes one. */
+struct traverse_info {
+ /* One entry per hash level, plus one spare slot. */
+ struct traverse_level {
+ tdb_off_t hashtable;
+ /* We ignore groups here, and treat it as a big array. */
+ unsigned entry;
+ unsigned int total_buckets;
+ } levels[TDB_MAX_LEVELS + 1];
+ unsigned int num_levels;
+ unsigned int toplevel_group;
+ /* This makes delete-everything-inside-traverse work as expected. */
+ tdb_off_t prev;
+};
+
+/* Flags for the locking functions.  The values are distinct bits and may be
+ * OR'd together: tdb_open() passes TDB_LOCK_WAIT|TDB_LOCK_NOCHECK. */
+enum tdb_lock_flags {
+ /* WAIT == F_SETLKW, NOWAIT == F_SETLK */
+ TDB_LOCK_NOWAIT = 0,
+ TDB_LOCK_WAIT = 1,
+ /* If set, don't log an error on failure. */
+ TDB_LOCK_PROBE = 2,
+ /* If set, don't check for recovery (used by recovery code). */
+ TDB_LOCK_NOCHECK = 4,
+};
+
+/* One held lock.  tdb_file keeps an array of these (lockrecs) plus a single
+ * allrecord_lock.  NOTE(review): the field meanings below are inferred from
+ * the names only - confirm against lock.c. */
+struct tdb_lock {
+ /* presumably the context that took the lock */
+ struct tdb_context *owner;
+ /* presumably the lock offset within the file */
+ uint32_t off;
+ /* presumably a nesting count */
+ uint32_t count;
+ /* presumably F_RDLCK or F_WRLCK */
+ uint32_t ltype;
+};
+
+/* This is only needed for tdb_access_commit, but used everywhere to
+ * simplify. */
+/* tdb_context.access points at a list of these, linked through next.
+ * NOTE(review): off, len and convert presumably echo the arguments given to
+ * tdb_access_read()/tdb_access_write() - confirm in io.c. */
+struct tdb_access_hdr {
+ struct tdb_access_hdr *next;
+ tdb_off_t off;
+ tdb_len_t len;
+ bool convert;
+};
+
+/* Per-file state, shared by every tdb_context open on the same file:
+ * tdb_open() looks an existing one up by device and inode before opening
+ * the file again, and tdb_close() frees it when refcnt drops to zero. */
+struct tdb_file {
+ /* Single list of all TDBs, to detect multiple opens. */
+ struct tdb_file *next;
+
+ /* How many are sharing us? */
+ unsigned int refcnt;
+
+ /* Mmap (if any), or malloc (for TDB_INTERNAL). */
+ void *map_ptr;
+
+ /* How much space has been mapped (<= current file size) */
+ tdb_len_t map_size;
+
+ /* The file descriptor (-1 for TDB_INTERNAL). */
+ int fd;
+
+ /* Lock information */
+ pid_t locker;
+ struct tdb_lock allrecord_lock;
+ size_t num_lockrecs;
+ struct tdb_lock *lockrecs;
+
+ /* Identity of this file. */
+ dev_t device;
+ ino_t inode;
+};
+
+/* One open handle on a database, as returned by tdb_open() and freed by
+ * tdb_close(). */
+struct tdb_context {
+ /* Filename of the database. */
+ /* (NULL if none was given; otherwise tdb_open() stores the string in the
+ * same allocation, directly after this struct.) */
+ const char *name;
+
+ /* Are we accessing directly? (debugging check). */
+ int direct_access;
+
+ /* Operating read-only? (Opened O_RDONLY, or in traverse_read) */
+ bool read_only;
+
+ /* mmap read only? */
+ /* (PROT_READ, or PROT_READ | PROT_WRITE: set in tdb_open().) */
+ int mmap_flags;
+
+ /* the flags passed to tdb_open, for tdb_reopen. */
+ uint32_t flags;
+
+ /* Logging function */
+ void (*log_fn)(struct tdb_context *tdb,
+ enum tdb_log_level level,
+ const char *message,
+ void *data);
+ void *log_data;
+
+ /* Hash function. */
+ uint64_t (*hash_fn)(const void *key, size_t len, uint64_t seed, void *);
+ void *hash_data;
+ uint64_t hash_seed;
+
+ /* low level (fcntl) lock functions. */
+ int (*lock_fn)(int fd, int rw, off_t off, off_t len, bool w, void *);
+ int (*unlock_fn)(int fd, int rw, off_t off, off_t len, void *);
+ void *lock_data;
+
+ /* Set if we are in a transaction. */
+ struct tdb_transaction *transaction;
+
+ /* What free table are we using? */
+ tdb_off_t ftable_off;
+ unsigned int ftable;
+
+ /* IO methods: changes for transactions. */
+ const struct tdb_methods *methods;
+
+ /* Our statistics. */
+ struct tdb_attribute_stats stats;
+
+ /* Direct access information */
+ struct tdb_access_hdr *access;
+
+ /* Last error we returned. */
+ enum TDB_ERROR last_error;
+
+ /* The actual file information */
+ struct tdb_file *file;
+};
+
+/* Table of low-level IO operations, reached through tdb_context.methods
+ * (which "changes for transactions").  NOTE(review): only twrite and oob
+ * are exercised in the code visible here; the other descriptions are
+ * inferred from the signatures - confirm in io.c. */
+struct tdb_methods {
+ /* presumably: read len bytes at an offset into the buffer */
+ enum TDB_ERROR (*tread)(struct tdb_context *, tdb_off_t, void *,
+ tdb_len_t);
+ /* write len bytes from the buffer at an offset */
+ enum TDB_ERROR (*twrite)(struct tdb_context *, tdb_off_t, const void *,
+ tdb_len_t);
+ /* tdb_open() calls this with map_size + 1 to make sure map_size and
+ * the mmap are current */
+ enum TDB_ERROR (*oob)(struct tdb_context *, tdb_off_t, bool);
+ /* presumably: grow the file by the given length */
+ enum TDB_ERROR (*expand_file)(struct tdb_context *, tdb_len_t);
+ /* presumably: return a direct pointer into the map, if possible */
+ void *(*direct)(struct tdb_context *, tdb_off_t, size_t, bool);
+};
+
+/*
+ internal prototypes
+*/
+/* hash.c: */
+tdb_bool_err first_in_hash(struct tdb_context *tdb,
+ struct traverse_info *tinfo,
+ TDB_DATA *kbuf, size_t *dlen);
+
+tdb_bool_err next_in_hash(struct tdb_context *tdb,
+ struct traverse_info *tinfo,
+ TDB_DATA *kbuf, size_t *dlen);
+
+/* Hash random memory. */
+uint64_t tdb_hash(struct tdb_context *tdb, const void *ptr, size_t len);
+
+/* Hash on disk. */
+uint64_t hash_record(struct tdb_context *tdb, tdb_off_t off);
+
+/* Find and lock a hash entry (or where it would be). */
+tdb_off_t find_and_lock(struct tdb_context *tdb,
+ struct tdb_data key,
+ int ltype,
+ struct hash_info *h,
+ struct tdb_used_record *rec,
+ struct traverse_info *tinfo);
+
+enum TDB_ERROR replace_in_hash(struct tdb_context *tdb,
+ struct hash_info *h,
+ tdb_off_t new_off);
+
+enum TDB_ERROR add_to_hash(struct tdb_context *tdb, struct hash_info *h,
+ tdb_off_t new_off);
+
+enum TDB_ERROR delete_from_hash(struct tdb_context *tdb, struct hash_info *h);
+
+/* For tdb_check */
+bool is_subhash(tdb_off_t val);
+
+/* free.c: */
+enum TDB_ERROR tdb_ftable_init(struct tdb_context *tdb);
+
+/* check.c needs these to iterate through free lists. */
+tdb_off_t first_ftable(struct tdb_context *tdb);
+tdb_off_t next_ftable(struct tdb_context *tdb, tdb_off_t ftable);
+
+/* This returns space or -ve error number. */
+tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
+ uint64_t hash, unsigned magic, bool growing);
+
+/* Put this record in a free list. */
+enum TDB_ERROR add_free_record(struct tdb_context *tdb,
+ tdb_off_t off, tdb_len_t len_with_header,
+ enum tdb_lock_flags waitflag,
+ bool coalesce_ok);
+
+/* Set up header for a used/ftable/htable/chain record. */
+enum TDB_ERROR set_header(struct tdb_context *tdb,
+ struct tdb_used_record *rec,
+ unsigned magic, uint64_t keylen, uint64_t datalen,
+ uint64_t actuallen, unsigned hashlow);
+
+/* Used by tdb_check to verify. */
+unsigned int size_to_bucket(tdb_len_t data_len);
+tdb_off_t bucket_off(tdb_off_t ftable_off, unsigned bucket);
+
+/* Used by tdb_summary */
+tdb_off_t dead_space(struct tdb_context *tdb, tdb_off_t off);
+
+/* io.c: */
+/* Initialize tdb->methods. */
+void tdb_io_init(struct tdb_context *tdb);
+
+/* Convert endian of the buffer if required. */
+void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size);
+
+/* Unmap and try to map the tdb. */
+void tdb_munmap(struct tdb_file *file);
+void tdb_mmap(struct tdb_context *tdb);
+
+/* Either alloc a copy, or give direct access. Release frees or noop. */
+const void *tdb_access_read(struct tdb_context *tdb,
+ tdb_off_t off, tdb_len_t len, bool convert);
+void *tdb_access_write(struct tdb_context *tdb,
+ tdb_off_t off, tdb_len_t len, bool convert);
+
+/* Release result of tdb_access_read/write. */
+void tdb_access_release(struct tdb_context *tdb, const void *p);
+/* Commit result of tdb_access_write. */
+enum TDB_ERROR tdb_access_commit(struct tdb_context *tdb, void *p);
+
+/* Convenience routine to get an offset. */
+tdb_off_t tdb_read_off(struct tdb_context *tdb, tdb_off_t off);
+
+/* Write an offset at an offset. */
+enum TDB_ERROR tdb_write_off(struct tdb_context *tdb, tdb_off_t off,
+ tdb_off_t val);
+
+/* Clear an ondisk area. */
+enum TDB_ERROR zero_out(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len);
+
+/* Return a non-zero offset between >= start < end in this array (or end). */
+tdb_off_t tdb_find_nonzero_off(struct tdb_context *tdb,
+ tdb_off_t base,
+ uint64_t start,
+ uint64_t end);
+
+/* Return a zero offset in this array, or num. */
+tdb_off_t tdb_find_zero_off(struct tdb_context *tdb, tdb_off_t off,
+ uint64_t num);
+
+/* Allocate and make a copy of some offset. */
+void *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
+
+/* Writes a converted copy of a record. */
+enum TDB_ERROR tdb_write_convert(struct tdb_context *tdb, tdb_off_t off,
+ const void *rec, size_t len);
+
+/* Reads record and converts it */
+enum TDB_ERROR tdb_read_convert(struct tdb_context *tdb, tdb_off_t off,
+ void *rec, size_t len);
+
+/* Bump the seqnum (caller checks for tdb->flags & TDB_SEQNUM) */
+void tdb_inc_seqnum(struct tdb_context *tdb);
+
+/* lock.c: */
+/* Lock/unlock a range of hashes. */
+enum TDB_ERROR tdb_lock_hashes(struct tdb_context *tdb,
+ tdb_off_t hash_lock, tdb_len_t hash_range,
+ int ltype, enum tdb_lock_flags waitflag);
+enum TDB_ERROR tdb_unlock_hashes(struct tdb_context *tdb,
+ tdb_off_t hash_lock,
+ tdb_len_t hash_range, int ltype);
+
+/* For closing the file. */
+void tdb_lock_cleanup(struct tdb_context *tdb);
+
+/* Lock/unlock a particular free bucket. */
+enum TDB_ERROR tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
+ enum tdb_lock_flags waitflag);
+void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off);
+
+/* Serialize transaction start. */
+enum TDB_ERROR tdb_transaction_lock(struct tdb_context *tdb, int ltype);
+void tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
+
+/* Do we have any hash locks (ie. via tdb_chainlock) ? */
+bool tdb_has_hash_locks(struct tdb_context *tdb);
+
+/* Lock entire database. */
+enum TDB_ERROR tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+ enum tdb_lock_flags flags, bool upgradable);
+void tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
+enum TDB_ERROR tdb_allrecord_upgrade(struct tdb_context *tdb);
+
+/* Serialize db open. */
+enum TDB_ERROR tdb_lock_open(struct tdb_context *tdb,
+ int ltype, enum tdb_lock_flags flags);
+void tdb_unlock_open(struct tdb_context *tdb, int ltype);
+bool tdb_has_open_lock(struct tdb_context *tdb);
+
+/* Serialize db expand. */
+enum TDB_ERROR tdb_lock_expand(struct tdb_context *tdb, int ltype);
+void tdb_unlock_expand(struct tdb_context *tdb, int ltype);
+bool tdb_has_expansion_lock(struct tdb_context *tdb);
+
+/* If it needs recovery, grab all the locks and do it. */
+enum TDB_ERROR tdb_lock_and_recover(struct tdb_context *tdb);
+
+/* Default lock and unlock functions. */
+int tdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag, void *);
+int tdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *);
+
+/* transaction.c: */
+enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb);
+tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb);
+
+/* tdb.c: */
+enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
+ enum TDB_ERROR ecode,
+ enum tdb_log_level level,
+ const char *fmt, ...);
+
+#ifdef TDB_TRACE
+void tdb_trace(struct tdb_context *tdb, const char *op);
+void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
+void tdb_trace_open(struct tdb_context *tdb, const char *op,
+ unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
+void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
+void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
+void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
+ TDB_DATA rec);
+void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
+ TDB_DATA rec, int ret);
+void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
+ TDB_DATA rec, TDB_DATA ret);
+void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
+ TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
+ int ret);
+void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
+ TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
+#else
+#define tdb_trace(tdb, op)
+#define tdb_trace_seqnum(tdb, seqnum, op)
+#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
+#define tdb_trace_ret(tdb, op, ret)
+#define tdb_trace_retrec(tdb, op, ret)
+#define tdb_trace_1rec(tdb, op, rec)
+#define tdb_trace_1rec_ret(tdb, op, rec, ret)
+#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
+#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
+#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
+#endif /* !TDB_TRACE */
+
+#endif
diff --git a/lib/tdb2/summary.c b/lib/tdb2/summary.c
new file mode 100644
index 00000000000..26cdd3e4fe2
--- /dev/null
+++ b/lib/tdb2/summary.c
@@ -0,0 +1,282 @@
+ /*
+ Trivial Database 2: human-readable summary code
+ Copyright (C) Rusty Russell 2010
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <assert.h>
+#include <ccan/tally/tally.h>
+
+/*
+ * Count the non-zero entries of the (1 << bits)-entry offset array stored at
+ * hash_off.  Returns that count, or the error from tdb_access_read() packed
+ * into a tdb_off_t (test with TDB_OFF_IS_ERR()).
+ */
+static tdb_off_t count_hash(struct tdb_context *tdb,
+			    tdb_off_t hash_off, unsigned bits)
+{
+	const tdb_off_t *table;
+	tdb_off_t used = 0;
+	unsigned int idx;
+
+	table = tdb_access_read(tdb, hash_off, sizeof(*table) << bits, true);
+	if (TDB_PTR_IS_ERR(table)) {
+		return TDB_PTR_ERR(table);
+	}
+
+	for (idx = 0; idx < (1 << bits); idx++) {
+		if (table[idx] != 0) {
+			used++;
+		}
+	}
+
+	tdb_access_release(tdb, table);
+	return used;
+}
+
+/*
+ * Walk every record from the end of the header to map_size and feed the
+ * tallies:
+ *   hashes  - used entries per sub-level hash table
+ *   ftables - data length of each free table
+ *   fr      - length of each free record
+ *   keys, data, extra - key length, data length and padding of records
+ *   uncoal  - length of each run of adjacent free records
+ *   chains  - one sample per chain record
+ *
+ * Returns TDB_SUCCESS, or the first error met while reading.
+ */
+static enum TDB_ERROR summarize(struct tdb_context *tdb,
+				struct tally *hashes,
+				struct tally *ftables,
+				struct tally *fr,
+				struct tally *keys,
+				struct tally *data,
+				struct tally *extra,
+				struct tally *uncoal,
+				struct tally *chains)
+{
+	tdb_off_t off;
+	tdb_len_t len;
+	tdb_len_t unc = 0;
+
+	for (off = sizeof(struct tdb_header);
+	     off < tdb->file->map_size;
+	     off += len) {
+		const union {
+			struct tdb_used_record u;
+			struct tdb_free_record f;
+			struct tdb_recovery_record r;
+		} *p;
+		/* We might not be able to get the whole thing. */
+		p = tdb_access_read(tdb, off, sizeof(p->f), true);
+		if (TDB_PTR_IS_ERR(p)) {
+			return TDB_PTR_ERR(p);
+		}
+		if (frec_magic(&p->f) != TDB_FREE_MAGIC) {
+			/* A run of free records ends here; only runs of two
+			 * or more count as uncoalesced. */
+			if (unc > 1) {
+				tally_add(uncoal, unc);
+			}
+			/* Always restart the count, so that a lone free
+			 * record is not merged into a later, separate run. */
+			unc = 0;
+		}
+
+		if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
+		    || p->r.magic == TDB_RECOVERY_MAGIC) {
+			len = sizeof(p->r) + p->r.max_len;
+		} else if (frec_magic(&p->f) == TDB_FREE_MAGIC) {
+			len = frec_len(&p->f);
+			tally_add(fr, len);
+			len += sizeof(p->u);
+			unc++;
+		} else if (rec_magic(&p->u) == TDB_USED_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_key_length(&p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+
+			tally_add(keys, rec_key_length(&p->u));
+			tally_add(data, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_HTABLE_MAGIC) {
+			tdb_off_t count = count_hash(tdb,
+						     off + sizeof(p->u),
+						     TDB_SUBLEVEL_HASH_BITS);
+			if (TDB_OFF_IS_ERR(count)) {
+				/* Don't leak the access on the error path. */
+				tdb_access_release(tdb, p);
+				return count;
+			}
+			tally_add(hashes, count);
+			tally_add(extra, rec_extra_padding(&p->u));
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+		} else if (rec_magic(&p->u) == TDB_FTABLE_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(ftables, rec_data_length(&p->u));
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else if (rec_magic(&p->u) == TDB_CHAIN_MAGIC) {
+			len = sizeof(p->u)
+				+ rec_data_length(&p->u)
+				+ rec_extra_padding(&p->u);
+			tally_add(chains, 1);
+			tally_add(extra, rec_extra_padding(&p->u));
+		} else {
+			len = dead_space(tdb, off);
+			if (TDB_OFF_IS_ERR(len)) {
+				/* Don't leak the access on the error path. */
+				tdb_access_release(tdb, p);
+				return len;
+			}
+		}
+		tdb_access_release(tdb, p);
+	}
+	if (unc)
+		tally_add(uncoal, unc);
+	return TDB_SUCCESS;
+}
+
+#define SUMMARY_FORMAT \
+ "Size of file/data: %zu/%zu\n" \
+ "Number of records: %zu\n" \
+ "Smallest/average/largest keys: %zu/%zu/%zu\n%s" \
+ "Smallest/average/largest data: %zu/%zu/%zu\n%s" \
+ "Smallest/average/largest padding: %zu/%zu/%zu\n%s" \
+ "Number of free records: %zu\n" \
+ "Smallest/average/largest free records: %zu/%zu/%zu\n%s" \
+ "Number of uncoalesced records: %zu\n" \
+ "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n%s" \
+ "Toplevel hash used: %u of %u\n" \
+ "Number of chains: %zu\n" \
+ "Number of subhashes: %zu\n" \
+ "Smallest/average/largest subhash entries: %zu/%zu/%zu\n%s" \
+ "Percentage keys/data/padding/free/rechdrs/freehdrs/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
+
+#define BUCKET_SUMMARY_FORMAT_A \
+ "Free bucket %zu: total entries %zu.\n" \
+ "Smallest/average/largest length: %zu/%zu/%zu\n%s"
+#define BUCKET_SUMMARY_FORMAT_B \
+ "Free bucket %zu-%zu: total entries %zu.\n" \
+ "Smallest/average/largest length: %zu/%zu/%zu\n%s"
+
+#define HISTO_WIDTH 70
+#define HISTO_HEIGHT 20
+
+/*
+ * Build a human-readable summary of the database.
+ *
+ * flags:   TDB_SUMMARY_HISTOGRAMS adds a histogram after each
+ *          smallest/average/largest line.
+ * summary: on success, *summary is a malloc()ed string that the caller must
+ *          free().
+ *
+ * The whole database and the expansion lock are read-locked for the
+ * duration.  Returns TDB_SUCCESS or an error, which is also recorded in
+ * tdb->last_error.
+ */
+enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
+			   enum tdb_summary_flags flags,
+			   char **summary)
+{
+	tdb_len_t len;
+	tdb_off_t toplevel;
+	struct tally *ftables, *hashes, *freet, *keys, *data, *extra, *uncoal,
+		*chains;
+	char *hashesg, *freeg, *keysg, *datag, *extrag, *uncoalg;
+	enum TDB_ERROR ecode;
+
+	hashesg = freeg = keysg = datag = extrag = uncoalg = NULL;
+
+	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+	if (ecode != TDB_SUCCESS) {
+		return tdb->last_error = ecode;
+	}
+
+	ecode = tdb_lock_expand(tdb, F_RDLCK);
+	if (ecode != TDB_SUCCESS) {
+		tdb_allrecord_unlock(tdb, F_RDLCK);
+		return tdb->last_error = ecode;
+	}
+
+	/* Start stats off empty. */
+	ftables = tally_new(HISTO_HEIGHT);
+	hashes = tally_new(HISTO_HEIGHT);
+	freet = tally_new(HISTO_HEIGHT);
+	keys = tally_new(HISTO_HEIGHT);
+	data = tally_new(HISTO_HEIGHT);
+	extra = tally_new(HISTO_HEIGHT);
+	uncoal = tally_new(HISTO_HEIGHT);
+	chains = tally_new(HISTO_HEIGHT);
+	if (!ftables || !hashes || !freet || !keys || !data || !extra
+	    || !uncoal || !chains) {
+		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				   "tdb_summary: failed to allocate"
+				   " tally structures");
+		goto unlock;
+	}
+
+	ecode = summarize(tdb, hashes, ftables, freet, keys, data, extra,
+			  uncoal, chains);
+	if (ecode != TDB_SUCCESS) {
+		goto unlock;
+	}
+
+	/* Count the top-level hash up front, so that a read failure is
+	 * reported as an error instead of being printed as a number. */
+	toplevel = count_hash(tdb, offsetof(struct tdb_header, hashtable),
+			      TDB_TOPLEVEL_HASH_BITS);
+	if (TDB_OFF_IS_ERR(toplevel)) {
+		ecode = toplevel;
+		goto unlock;
+	}
+
+	if (flags & TDB_SUMMARY_HISTOGRAMS) {
+		hashesg = tally_histogram(hashes, HISTO_WIDTH, HISTO_HEIGHT);
+		freeg = tally_histogram(freet, HISTO_WIDTH, HISTO_HEIGHT);
+		keysg = tally_histogram(keys, HISTO_WIDTH, HISTO_HEIGHT);
+		datag = tally_histogram(data, HISTO_WIDTH, HISTO_HEIGHT);
+		extrag = tally_histogram(extra, HISTO_WIDTH, HISTO_HEIGHT);
+		uncoalg = tally_histogram(uncoal, HISTO_WIDTH, HISTO_HEIGHT);
+	}
+
+	/* 20 is max length of a %llu. */
+	len = strlen(SUMMARY_FORMAT) + 33*20 + 1
+		+ (hashesg ? strlen(hashesg) : 0)
+		+ (freeg ? strlen(freeg) : 0)
+		+ (keysg ? strlen(keysg) : 0)
+		+ (datag ? strlen(datag) : 0)
+		+ (extrag ? strlen(extrag) : 0)
+		+ (uncoalg ? strlen(uncoalg) : 0);
+
+	*summary = malloc(len);
+	if (!*summary) {
+		ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				   "tdb_summary: failed to allocate string");
+		goto unlock;
+	}
+
+	/* Bounded by len, so a mis-estimate above truncates rather than
+	 * overflowing the buffer. */
+	snprintf(*summary, len, SUMMARY_FORMAT,
+		 (size_t)tdb->file->map_size,
+		 tally_total(keys, NULL) + tally_total(data, NULL),
+		 tally_num(keys),
+		 tally_min(keys), tally_mean(keys), tally_max(keys),
+		 keysg ? keysg : "",
+		 tally_min(data), tally_mean(data), tally_max(data),
+		 datag ? datag : "",
+		 tally_min(extra), tally_mean(extra), tally_max(extra),
+		 extrag ? extrag : "",
+		 tally_num(freet),
+		 tally_min(freet), tally_mean(freet), tally_max(freet),
+		 freeg ? freeg : "",
+		 tally_total(uncoal, NULL),
+		 tally_min(uncoal), tally_mean(uncoal), tally_max(uncoal),
+		 uncoalg ? uncoalg : "",
+		 (unsigned)toplevel,
+		 1 << TDB_TOPLEVEL_HASH_BITS,
+		 tally_num(chains),
+		 tally_num(hashes),
+		 tally_min(hashes), tally_mean(hashes), tally_max(hashes),
+		 hashesg ? hashesg : "",
+		 tally_total(keys, NULL) * 100.0 / tdb->file->map_size,
+		 tally_total(data, NULL) * 100.0 / tdb->file->map_size,
+		 tally_total(extra, NULL) * 100.0 / tdb->file->map_size,
+		 tally_total(freet, NULL) * 100.0 / tdb->file->map_size,
+		 (tally_num(keys) + tally_num(freet) + tally_num(hashes))
+		 * sizeof(struct tdb_used_record) * 100.0 / tdb->file->map_size,
+		 tally_num(ftables) * sizeof(struct tdb_freetable)
+		 * 100.0 / tdb->file->map_size,
+		 (tally_num(hashes)
+		  * (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
+		  + (sizeof(tdb_off_t) << TDB_TOPLEVEL_HASH_BITS)
+		  + sizeof(struct tdb_chain) * tally_num(chains))
+		 * 100.0 / tdb->file->map_size);
+
+unlock:
+	free(hashesg);
+	free(freeg);
+	free(keysg);
+	free(datag);
+	free(extrag);
+	free(uncoalg);
+	free(hashes);
+	free(freet);
+	free(keys);
+	free(data);
+	free(extra);
+	free(uncoal);
+	free(ftables);
+	free(chains);
+
+	tdb_allrecord_unlock(tdb, F_RDLCK);
+	tdb_unlock_expand(tdb, F_RDLCK);
+	return tdb->last_error = ecode;
+}
diff --git a/lib/tdb2/tdb.c b/lib/tdb2/tdb.c
new file mode 100644
index 00000000000..b8b5aac1288
--- /dev/null
+++ b/lib/tdb2/tdb.c
@@ -0,0 +1,484 @@
+#include "private.h"
+#include <ccan/asprintf/asprintf.h>
+#include <stdarg.h>
+
+/*
+ * Rebuild the header of the used record at @off for the given key and data
+ * lengths and write it back.  The room the record currently has after its
+ * key (data length plus extra padding) is kept: keylen plus that room is
+ * what gets passed to set_header().
+ */
+static enum TDB_ERROR update_rec_hdr(struct tdb_context *tdb,
+				     tdb_off_t off,
+				     tdb_len_t keylen,
+				     tdb_len_t datalen,
+				     struct tdb_used_record *rec,
+				     uint64_t h)
+{
+	enum TDB_ERROR err;
+	/* Sample the old room first: rec is handed to set_header() below. */
+	uint64_t room = rec_data_length(rec) + rec_extra_padding(rec);
+
+	err = set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
+			 keylen + room, h);
+	if (err != TDB_SUCCESS) {
+		return err;
+	}
+
+	/* Header was built: push it out to the record's offset. */
+	return tdb_write_convert(tdb, off, rec, sizeof(*rec));
+}
+
+/*
+ * Put key/dbuf into a newly allocated record and make the hash entry
+ * described by @h refer to it.
+ *
+ * If @old_off is non-zero, the record there (header, key.dsize bytes of key
+ * and @old_room bytes after it) is added to the free list and the hash entry
+ * is replaced; otherwise a new hash entry is added.  @growing is passed
+ * straight through to alloc().
+ *
+ * Returns TDB_SUCCESS, or the first error encountered.
+ */
+static enum TDB_ERROR replace_data(struct tdb_context *tdb,
+				   struct hash_info *h,
+				   struct tdb_data key, struct tdb_data dbuf,
+				   tdb_off_t old_off, tdb_len_t old_room,
+				   bool growing)
+{
+	enum TDB_ERROR ecode;
+	tdb_off_t rec_off, write_off;
+
+	/* Get space for the new record first. */
+	rec_off = alloc(tdb, key.dsize, dbuf.dsize, h->h, TDB_USED_MAGIC,
+			growing);
+	if (TDB_OFF_IS_ERR(rec_off)) {
+		return rec_off;
+	}
+
+	if (!old_off) {
+		/* No previous record for this key. */
+		ecode = add_to_hash(tdb, h, rec_off);
+	} else {
+		/* The existing record was not usable: give it back. */
+		tdb->stats.frees++;
+		ecode = add_free_record(tdb, old_off,
+					sizeof(struct tdb_used_record)
+					+ key.dsize + old_room,
+					TDB_LOCK_WAIT, true);
+		if (ecode == TDB_SUCCESS) {
+			ecode = replace_in_hash(tdb, h, rec_off);
+		}
+	}
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* The key is written immediately after the record header... */
+	write_off = rec_off + sizeof(struct tdb_used_record);
+	ecode = tdb->methods->twrite(tdb, write_off, key.dptr, key.dsize);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	/* ...and the data immediately after the key. */
+	write_off += key.dsize;
+	ecode = tdb->methods->twrite(tdb, write_off, dbuf.dptr, dbuf.dsize);
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+
+	if (tdb->flags & TDB_SEQNUM) {
+		tdb_inc_seqnum(tdb);
+	}
+	return TDB_SUCCESS;
+}
+
+/*
+ * Overwrite record data in place: write @dbuf at @off and, if @extra is
+ * non-zero, a single zero byte right after it.  The sequence number is
+ * bumped (when TDB_SEQNUM is set) whether or not the writes succeeded.
+ */
+static enum TDB_ERROR update_data(struct tdb_context *tdb,
+				  tdb_off_t off,
+				  struct tdb_data dbuf,
+				  tdb_len_t extra)
+{
+	enum TDB_ERROR ecode = tdb->methods->twrite(tdb, off,
+						    dbuf.dptr, dbuf.dsize);
+
+	if (extra && ecode == TDB_SUCCESS) {
+		/* Put a zero in; future versions may append other data. */
+		ecode = tdb->methods->twrite(tdb, off + dbuf.dsize, "", 1);
+	}
+
+	if (tdb->flags & TDB_SEQNUM) {
+		tdb_inc_seqnum(tdb);
+	}
+	return ecode;
+}
+
+/*
+ * Store @dbuf under @key.
+ *
+ * TDB_INSERT fails with TDB_ERR_EXISTS if the key is present; TDB_MODIFY
+ * fails with TDB_ERR_NOEXIST if it is absent.  An existing record is
+ * rewritten in place when its data room (data length plus padding) can hold
+ * @dbuf; otherwise a new record is allocated via replace_data().
+ *
+ * The result is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_store(struct tdb_context *tdb,
+			 struct tdb_data key, struct tdb_data dbuf, int flag)
+{
+	struct hash_info h;
+	struct tdb_used_record rec;
+	tdb_len_t old_room = 0;
+	tdb_off_t off;
+	enum TDB_ERROR ecode;
+
+	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	/* Now we have lock on this hash bucket. */
+	if (off) {
+		if (flag == TDB_INSERT) {
+			ecode = TDB_ERR_EXISTS;
+			goto out;
+		}
+
+		old_room = rec_data_length(&rec) + rec_extra_padding(&rec);
+		if (old_room >= dbuf.dsize) {
+			/* Can modify in-place.  Easy! */
+			ecode = update_rec_hdr(tdb, off,
+					       key.dsize, dbuf.dsize,
+					       &rec, h.h);
+			if (ecode == TDB_SUCCESS) {
+				ecode = update_data(tdb,
+						    off + sizeof(rec)
+						    + key.dsize, dbuf,
+						    old_room - dbuf.dsize);
+			}
+			goto out;
+		}
+	} else if (flag == TDB_MODIFY) {
+		/* The record doesn't exist and we are in TDB_MODIFY mode,
+		   so the store must fail. */
+		ecode = TDB_ERR_NOEXIST;
+		goto out;
+	}
+
+	/* If we didn't use the old record, this implies we're growing. */
+	ecode = replace_data(tdb, &h, key, dbuf, off, old_room, off != 0);
+out:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return tdb->last_error = ecode;
+}
+
+/*
+ * Append @dbuf to the data stored under @key.
+ *
+ * If the key does not exist this stores @dbuf as a new record.  If the
+ * existing record has at least dbuf.dsize bytes of padding, the new bytes
+ * are written in place after the old data.  Otherwise the old data is read
+ * into a temporary buffer, @dbuf is copied after it, and the combined data
+ * is written to a new record via replace_data().
+ *
+ * The result is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_append(struct tdb_context *tdb,
+			  struct tdb_data key, struct tdb_data dbuf)
+{
+	struct hash_info h;
+	tdb_off_t off;
+	struct tdb_used_record rec;
+	tdb_len_t old_room = 0, old_dlen;
+	unsigned char *newdata;
+	struct tdb_data new_dbuf;
+	enum TDB_ERROR ecode;
+
+	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (off) {
+		size_t newlen;
+
+		old_dlen = rec_data_length(&rec);
+		old_room = old_dlen + rec_extra_padding(&rec);
+
+		/* Fast path: can append in place. */
+		if (rec_extra_padding(&rec) >= dbuf.dsize) {
+			ecode = update_rec_hdr(tdb, off, key.dsize,
+					       old_dlen + dbuf.dsize, &rec,
+					       h.h);
+			if (ecode != TDB_SUCCESS) {
+				goto out;
+			}
+
+			off += sizeof(rec) + key.dsize + old_dlen;
+			ecode = update_data(tdb, off, dbuf,
+					    rec_extra_padding(&rec));
+			/* newdata was never allocated: skip the free. */
+			goto out;
+		}
+
+		/*
+		 * Slow path.  The buffer only holds the old data followed by
+		 * the appended data; the key is written separately from
+		 * key.dptr, so it needs no room here.
+		 */
+		newlen = old_dlen + dbuf.dsize;
+		newdata = malloc(newlen);
+		if (!newdata) {
+			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					   "tdb_append:"
+					   " failed to allocate %zu bytes",
+					   newlen);
+			goto out;
+		}
+		ecode = tdb->methods->tread(tdb, off + sizeof(rec) + key.dsize,
+					    newdata, old_dlen);
+		if (ecode != TDB_SUCCESS) {
+			goto out_free_newdata;
+		}
+		memcpy(newdata + old_dlen, dbuf.dptr, dbuf.dsize);
+		new_dbuf.dptr = newdata;
+		new_dbuf.dsize = newlen;
+	} else {
+		/* No existing record: store the caller's buffer as-is. */
+		newdata = NULL;
+		new_dbuf = dbuf;
+	}
+
+	/* If they're using tdb_append(), it implies they're growing record. */
+	ecode = replace_data(tdb, &h, key, new_dbuf, off, old_room, true);
+
+out_free_newdata:
+	free(newdata);
+out:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return tdb->last_error = ecode;
+}
+
+/*
+ * Look up @key and, if found, read its data into a buffer returned through
+ * @data (dsize is the record's data length, dptr comes from
+ * tdb_alloc_read()).  Returns TDB_ERR_NOEXIST if the key is not found.
+ *
+ * The result is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
+			 struct tdb_data *data)
+{
+	struct hash_info h;
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+	tdb_off_t off;
+
+	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (off == 0) {
+		ecode = TDB_ERR_NOEXIST;
+		goto unlock;
+	}
+
+	/* Data sits after the record header and the key. */
+	data->dsize = rec_data_length(&rec);
+	data->dptr = tdb_alloc_read(tdb, off + sizeof(rec) + key.dsize,
+				    data->dsize);
+	if (TDB_PTR_IS_ERR(data->dptr)) {
+		ecode = TDB_PTR_ERR(data->dptr);
+	} else {
+		ecode = TDB_SUCCESS;
+	}
+
+unlock:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	return tdb->last_error = ecode;
+}
+
+/*
+ * Report whether @key has a record.  A failed lookup returns false with the
+ * error left in tdb->last_error; otherwise last_error is TDB_SUCCESS.
+ */
+bool tdb_exists(struct tdb_context *tdb, TDB_DATA key)
+{
+	struct hash_info h;
+	struct tdb_used_record rec;
+	tdb_off_t off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+
+	if (TDB_OFF_IS_ERR(off)) {
+		tdb->last_error = off;
+		return false;
+	}
+
+	/* Only the offset is needed, so the lock can go right away. */
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	tdb->last_error = TDB_SUCCESS;
+
+	return off != 0;
+}
+
+/*
+ * Remove @key: drop its hash entry, then add the whole record (header, key,
+ * data and padding) to the free list.  Returns TDB_ERR_NOEXIST if the key
+ * is not found.
+ *
+ * The result is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key)
+{
+	struct hash_info h;
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+	tdb_off_t off;
+
+	off = find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (off == 0) {
+		ecode = TDB_ERR_NOEXIST;
+	} else {
+		ecode = delete_from_hash(tdb, &h);
+		if (ecode == TDB_SUCCESS) {
+			/* Free the deleted entry. */
+			tdb->stats.frees++;
+			ecode = add_free_record(tdb, off,
+						sizeof(struct tdb_used_record)
+						+ rec_key_length(&rec)
+						+ rec_data_length(&rec)
+						+ rec_extra_padding(&rec),
+						TDB_LOCK_WAIT, true);
+
+			/* Bumped even if freeing the record failed. */
+			if (tdb->flags & TDB_SEQNUM) {
+				tdb_inc_seqnum(tdb);
+			}
+		}
+	}
+
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_WRLCK);
+	return tdb->last_error = ecode;
+}
+
+/* Return the flags currently recorded in @tdb. */
+unsigned int tdb_get_flags(struct tdb_context *tdb)
+{
+	unsigned int current = tdb->flags;
+
+	return current;
+}
+
+/*
+ * Set one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or
+ * TDB_ALLOW_NESTING on @tdb.  Setting TDB_NOMMAP also unmaps the file.
+ * A TDB_INTERNAL tdb or any other flag value logs a usage error and sets
+ * tdb->last_error to TDB_ERR_EINVAL.
+ */
+void tdb_add_flag(struct tdb_context *tdb, unsigned flag)
+{
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_add_flag: internal db");
+		return;
+	}
+
+	/* Accept only the flags which may be changed on an open tdb. */
+	switch (flag) {
+	case TDB_NOLOCK:
+	case TDB_NOMMAP:
+	case TDB_NOSYNC:
+	case TDB_SEQNUM:
+	case TDB_ALLOW_NESTING:
+		tdb->flags |= flag;
+		break;
+	default:
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_add_flag: Unknown flag %u",
+					     flag);
+		return;
+	}
+
+	/* The flag is already set by the time the mapping is dropped. */
+	if (flag == TDB_NOMMAP) {
+		tdb_munmap(tdb->file);
+	}
+}
+
+/*
+ * Clear one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or
+ * TDB_ALLOW_NESTING on @tdb.  Clearing TDB_NOMMAP also calls tdb_mmap().
+ * A TDB_INTERNAL tdb or any other flag value logs a usage error and sets
+ * tdb->last_error to TDB_ERR_EINVAL.
+ */
+void tdb_remove_flag(struct tdb_context *tdb, unsigned flag)
+{
+	if (tdb->flags & TDB_INTERNAL) {
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_remove_flag: internal db");
+		return;
+	}
+
+	/* Accept only the flags which may be changed on an open tdb. */
+	switch (flag) {
+	case TDB_NOLOCK:
+	case TDB_NOMMAP:
+	case TDB_NOSYNC:
+	case TDB_SEQNUM:
+	case TDB_ALLOW_NESTING:
+		/* Complement as int, exactly like ~TDB_xxx would be. */
+		tdb->flags &= ~(int)flag;
+		break;
+	default:
+		tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_remove_flag: Unknown flag %u",
+					     flag);
+		return;
+	}
+
+	/* The flag is already cleared by the time we map again. */
+	if (flag == TDB_NOMMAP) {
+		tdb_mmap(tdb);
+	}
+}
+
+/*
+ * Map @ecode to a constant, human-readable string.  Values which are not a
+ * known enum TDB_ERROR yield "Invalid error code".
+ */
+const char *tdb_errorstr(enum TDB_ERROR ecode)
+{
+	const char *text = "Invalid error code";
+
+	/* No default label: gcc warns if a case is missed, so use that. */
+	switch (ecode) {
+	case TDB_SUCCESS:
+		text = "Success";
+		break;
+	case TDB_ERR_CORRUPT:
+		text = "Corrupt database";
+		break;
+	case TDB_ERR_IO:
+		text = "IO Error";
+		break;
+	case TDB_ERR_LOCK:
+		text = "Locking error";
+		break;
+	case TDB_ERR_OOM:
+		text = "Out of memory";
+		break;
+	case TDB_ERR_EXISTS:
+		text = "Record exists";
+		break;
+	case TDB_ERR_EINVAL:
+		text = "Invalid parameter";
+		break;
+	case TDB_ERR_NOEXIST:
+		text = "Record does not exist";
+		break;
+	case TDB_ERR_RDONLY:
+		text = "write not permitted";
+		break;
+	}
+	return text;
+}
+
+enum TDB_ERROR tdb_error(struct tdb_context *tdb)
+{
+ return tdb->last_error;
+}
+
+/*
+ * Format a printf-style message and pass it to the tdb's log function, if
+ * one is set.  Always returns @ecode unchanged, so callers can log and
+ * propagate an error in one expression.  errno is restored before
+ * returning.
+ *
+ * If the message cannot be formatted, an out-of-memory notice is logged at
+ * TDB_LOG_ERROR followed by the raw, unexpanded @fmt at @level.
+ */
+enum TDB_ERROR COLD tdb_logerr(struct tdb_context *tdb,
+			       enum TDB_ERROR ecode,
+			       enum tdb_log_level level,
+			       const char *fmt, ...)
+{
+	char *message;
+	va_list ap;
+	/* Must be signed: vasprintf() reports failure with a negative
+	 * return, which an unsigned length could never compare below 0. */
+	int len;
+	/* tdb_open paths care about errno, so save it. */
+	int saved_errno = errno;
+
+	if (!tdb->log_fn)
+		return ecode;
+
+	va_start(ap, fmt);
+	len = vasprintf(&message, fmt, ap);
+	va_end(ap);
+
+	if (len < 0) {
+		/* message is not valid here: don't log it or free it. */
+		tdb->log_fn(tdb, TDB_LOG_ERROR,
+			    "out of memory formatting message:", tdb->log_data);
+		tdb->log_fn(tdb, level, fmt, tdb->log_data);
+	} else {
+		tdb->log_fn(tdb, level, message, tdb->log_data);
+		free(message);
+	}
+	errno = saved_errno;
+	return ecode;
+}
+
+/*
+ * Look up @key and call @parse on its data without copying it out first:
+ * the data pointer comes from tdb_access_read() and is released again after
+ * @parse returns.  Returns TDB_ERR_NOEXIST if the key is not found,
+ * otherwise the access error or whatever @parse returned.
+ *
+ * The result is also stored in tdb->last_error.
+ */
+enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
+				 TDB_DATA key,
+				 enum TDB_ERROR (*parse)(TDB_DATA k,
+							 TDB_DATA d,
+							 void *data),
+				 void *data)
+{
+	struct hash_info h;
+	struct tdb_used_record rec;
+	enum TDB_ERROR ecode;
+	const void *dptr;
+	tdb_off_t off;
+
+	off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+	if (TDB_OFF_IS_ERR(off)) {
+		return tdb->last_error = off;
+	}
+
+	if (off == 0) {
+		ecode = TDB_ERR_NOEXIST;
+		goto unlock;
+	}
+
+	/* Data sits after the record header and the key. */
+	dptr = tdb_access_read(tdb, off + sizeof(rec) + key.dsize,
+			       rec_data_length(&rec), false);
+	if (TDB_PTR_IS_ERR(dptr)) {
+		ecode = TDB_PTR_ERR(dptr);
+		goto unlock;
+	}
+
+	/* The callback runs while the hash lock is still held. */
+	ecode = parse(key, tdb_mkdata(dptr, rec_data_length(&rec)), data);
+	tdb_access_release(tdb, dptr);
+
+unlock:
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+	return tdb->last_error = ecode;
+}
+
+/* Return the name string held in @tdb. */
+const char *tdb_name(const struct tdb_context *tdb)
+{
+	const char *name = tdb->name;
+
+	return name;
+}
+
+/*
+ * Read the seqnum field of the header.  The value read is returned as-is;
+ * if it is an error offset it is also stored in tdb->last_error, otherwise
+ * last_error is set to TDB_SUCCESS.
+ */
+int64_t tdb_get_seqnum(struct tdb_context *tdb)
+{
+	tdb_off_t seq;
+
+	seq = tdb_read_off(tdb, offsetof(struct tdb_header, seqnum));
+	if (!TDB_OFF_IS_ERR(seq)) {
+		tdb->last_error = TDB_SUCCESS;
+	} else {
+		tdb->last_error = seq;
+	}
+	return seq;
+}
+
+
+/* Return the file descriptor recorded in tdb->file. */
+int tdb_fd(const struct tdb_context *tdb)
+{
+	int fd = tdb->file->fd;
+
+	return fd;
+}
diff --git a/lib/tdb2/tdb2.h b/lib/tdb2/tdb2.h
new file mode 100644
index 00000000000..c6e09e9f16e
--- /dev/null
+++ b/lib/tdb2/tdb2.h
@@ -0,0 +1,846 @@
+#ifndef CCAN_TDB2_H
+#define CCAN_TDB2_H
+
+/*
+ TDB version 2: trivial database library
+
+ Copyright (C) Andrew Tridgell 1999-2004
+ Copyright (C) Rusty Russell 2010-2011
+
+ ** NOTE! The following LGPL license applies to the tdb
+ ** library. This does NOT imply that all of Samba is released
+ ** under the LGPL
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _SAMBA_BUILD_
+/* For mode_t */
+#include <sys/types.h>
+/* For O_* flags. */
+#include <sys/stat.h>
+/* For sig_atomic_t. */
+#include <signal.h>
+/* For uint64_t */
+#include <stdint.h>
+/* For bool */
+#include <stdbool.h>
+/* For memcmp */
+#include <string.h>
+#endif
+#include <ccan/compiler/compiler.h>
+#include <ccan/typesafe_cb/typesafe_cb.h>
+#include <ccan/cast/cast.h>
+
+union tdb_attribute;
+struct tdb_context;
+
+/**
+ * tdb_open - open a database file
+ * @name: the file name (can be NULL if flags contains TDB_INTERNAL)
+ * @tdb_flags: options for this database
+ * @open_flags: flags argument for tdb's open() call.
+ * @mode: mode argument for tdb's open() call.
+ * @attributes: linked list of extra attributes for this tdb.
+ *
+ * This call opens (and potentially creates) a database file.
+ * Multiple processes can have the TDB file open at once.
+ *
+ * On failure it will return NULL, and set errno: it may also call
+ * any log attribute found in @attributes.
+ *
+ * See also:
+ * union tdb_attribute
+ */
+struct tdb_context *tdb_open(const char *name, int tdb_flags,
+ int open_flags, mode_t mode,
+ union tdb_attribute *attributes);
+
+
+/* flags for tdb_open() */
+#define TDB_DEFAULT 0 /* just a readability place holder */
+#define TDB_INTERNAL 2 /* don't store on disk */
+#define TDB_NOLOCK 4 /* don't do any locking */
+#define TDB_NOMMAP 8 /* don't use mmap */
+#define TDB_CONVERT 16 /* convert endian */
+#define TDB_NOSYNC 64 /* don't use synchronous transactions */
+#define TDB_SEQNUM 128 /* maintain a sequence number */
+#define TDB_ALLOW_NESTING 256 /* fake nested transactions */
+
+/**
+ * tdb_close - close and free a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This always succeeds, in that @tdb is unusable after this call. But if
+ * some unexpected error occurred while closing, it will return non-zero
+ * (the only clue as to cause will be via the log attribute).
+ */
+int tdb_close(struct tdb_context *tdb);
+
+/**
+ * struct tdb_data - representation of keys or values.
+ * @dptr: the data pointer
+ * @dsize: the size of the data pointed to by dptr.
+ *
+ * This is the "blob" representation of keys and data used by TDB.
+ */
+typedef struct tdb_data {
+ unsigned char *dptr;
+ size_t dsize;
+} TDB_DATA;
+
+/**
+ * enum TDB_ERROR - error returns for TDB
+ *
+ * See Also:
+ * tdb_errorstr()
+ */
+enum TDB_ERROR {
+ TDB_SUCCESS = 0, /* No error. */
+ TDB_ERR_CORRUPT = -1, /* We read the db, and it was bogus. */
+ TDB_ERR_IO = -2, /* We couldn't read/write the db. */
+ TDB_ERR_LOCK = -3, /* Locking failed. */
+ TDB_ERR_OOM = -4, /* Out of Memory. */
+ TDB_ERR_EXISTS = -5, /* The key already exists. */
+ TDB_ERR_NOEXIST = -6, /* The key does not exist. */
+ TDB_ERR_EINVAL = -7, /* You're using it wrong. */
+ TDB_ERR_RDONLY = -8, /* The database is read-only. */
+ TDB_ERR_LAST = TDB_ERR_RDONLY
+};
+
+/**
+ * tdb_store - store a key/value pair in a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key
+ * @dbuf: the data to associate with the key.
+ * @flag: TDB_REPLACE, TDB_INSERT or TDB_MODIFY.
+ *
+ * This inserts (or overwrites) a key/value pair in the TDB. If flag
+ * is TDB_REPLACE, it doesn't matter whether the key exists or not;
+ * TDB_INSERT means it must not exist (returns TDB_ERR_EXISTS otherwise),
+ * and TDB_MODIFY means it must exist (returns TDB_ERR_NOEXIST otherwise).
+ *
+ * On success, this returns TDB_SUCCESS.
+ *
+ * See also:
+ * tdb_fetch, tdb_transaction_start, tdb_append, tdb_delete.
+ */
+enum TDB_ERROR tdb_store(struct tdb_context *tdb,
+ struct tdb_data key,
+ struct tdb_data dbuf,
+ int flag);
+
+/* flags to tdb_store() */
+#define TDB_REPLACE 1 /* A readability place holder */
+#define TDB_INSERT 2 /* Don't overwrite an existing entry */
+#define TDB_MODIFY 3 /* Don't create a new entry */
+
+/**
+ * tdb_fetch - fetch a value from a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key
+ * @data: pointer to data.
+ *
+ * This looks up a key in the database and sets it in @data.
+ *
+ * If it returns TDB_SUCCESS, the key was found: it is your
+ * responsibility to call free() on @data->dptr.
+ *
+ * Otherwise, it returns an error (usually, TDB_ERR_NOEXIST) and @data is
+ * undefined.
+ */
+enum TDB_ERROR tdb_fetch(struct tdb_context *tdb, struct tdb_data key,
+ struct tdb_data *data);
+
+/**
+ * tdb_errorstr - map the tdb error onto a constant readable string
+ * @ecode: the enum TDB_ERROR to map.
+ *
+ * This is useful for displaying errors to users.
+ */
+const char *tdb_errorstr(enum TDB_ERROR ecode);
+
+/**
+ * tdb_append - append a value to a key/value pair in a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key
+ * @dbuf: the data to append.
+ *
+ * This is equivalent to fetching a record, reallocating .dptr to add the
+ * data, and writing it back, only it's much more efficient. If the key
+ * doesn't exist, it's equivalent to tdb_store (with an additional hint that
+ * you expect to expand the record in future).
+ *
+ * See Also:
+ * tdb_fetch(), tdb_store()
+ */
+enum TDB_ERROR tdb_append(struct tdb_context *tdb,
+ struct tdb_data key, struct tdb_data dbuf);
+
+/**
+ * tdb_delete - delete a key from a tdb.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to delete.
+ *
+ * Returns TDB_SUCCESS on success, or an error (usually TDB_ERR_NOEXIST).
+ *
+ * See Also:
+ * tdb_fetch(), tdb_store()
+ */
+enum TDB_ERROR tdb_delete(struct tdb_context *tdb, struct tdb_data key);
+
+/**
+ * tdb_exists - does a key exist in the database?
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to search for.
+ *
+ * Returns true if it exists, or false if it doesn't or any other error.
+ */
+bool tdb_exists(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_deq - are struct tdb_data equal?
+ * @a: one struct tdb_data
+ * @b: another struct tdb_data
+ */
+static inline bool tdb_deq(struct tdb_data a, struct tdb_data b)
+{
+	if (a.dsize != b.dsize) {
+		return false;
+	}
+	/*
+	 * Two empty blobs are equal whatever their pointers are.  Returning
+	 * here keeps a NULL dptr away from memcmp(), whose pointer arguments
+	 * must be valid even when the length is zero.
+	 */
+	if (a.dsize == 0) {
+		return true;
+	}
+	return memcmp(a.dptr, b.dptr, a.dsize) == 0;
+}
+
+/**
+ * tdb_mkdata - make a struct tdb_data from const data
+ * @p: the constant pointer
+ * @len: the length
+ *
+ * As the dptr member of struct tdb_data is not constant, you need to
+ * cast it. This function keeps those casts in one place, as well as
+ * suppressing the warning some compilers give when casting away a
+ * qualifier (eg. gcc with -Wcast-qual)
+ */
+static inline struct tdb_data tdb_mkdata(const void *p, size_t len)
+{
+	struct tdb_data blob;
+
+	blob.dsize = len;
+	/* The one place where the const qualifier is cast away. */
+	blob.dptr = cast_const(void *, p);
+	return blob;
+}
+
+/**
+ * tdb_transaction_start - start a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This begins a series of atomic operations. Other processes will be able
+ * to read the tdb, but not alter it (they will block), nor will they see
+ * any changes until tdb_transaction_commit() is called.
+ *
+ * Note that if the TDB_ALLOW_NESTING flag is set, a tdb_transaction_start()
+ * within a transaction will succeed, but it's not a real transaction:
+ * (1) An inner transaction which is committed is not actually committed until
+ * the outer transaction is; if the outer transaction is cancelled, the
+ * inner ones are discarded.
+ * (2) tdb_transaction_cancel() marks the outer transaction as having an error,
+ * so the final tdb_transaction_commit() will fail.
+ * (3) the outer transaction will see the results of the inner transaction.
+ *
+ * See Also:
+ * tdb_transaction_cancel, tdb_transaction_commit.
+ */
+enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb);
+
+/**
+ * tdb_transaction_cancel - abandon a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This aborts a transaction, discarding any changes which were made.
+ * tdb_close() does this implicitly.
+ */
+void tdb_transaction_cancel(struct tdb_context *tdb);
+
+/**
+ * tdb_transaction_commit - commit a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This completes a transaction, writing any changes which were made.
+ *
+ * fsync() is used to commit the transaction (unless TDB_NOSYNC is set),
+ * making it robust against machine crashes, but very slow compared to
+ * other TDB operations.
+ *
+ * A failure can only be caused by unexpected errors (eg. I/O or
+ * memory); there is no point looping on transaction failure.
+ *
+ * See Also:
+ * tdb_transaction_prepare_commit()
+ */
+enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb);
+
+/**
+ * tdb_transaction_prepare_commit - prepare to commit a transaction
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This ensures we have the resources to commit a transaction (using
+ * tdb_transaction_commit): if this succeeds then a transaction will only
+ * fail if the write() or fsync() calls fail.
+ *
+ * If this fails you must still call tdb_transaction_cancel() to cancel
+ * the transaction.
+ *
+ * See Also:
+ * tdb_transaction_commit()
+ */
+enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb);
+
+/**
+ * tdb_traverse - traverse a TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @fn: the function to call for every key/value pair (or NULL)
+ * @p: the pointer to hand to @fn
+ *
+ * This walks the TDB until all the keys have been traversed, or @fn
+ * returns non-zero. If the traverse function or other processes are
+ * changing data or adding or deleting keys, the traverse may be
+ * unreliable: keys may be skipped or (rarely) visited twice.
+ *
+ * There is one specific exception: the special case of deleting the
+ * current key does not undermine the reliability of the traversal.
+ *
+ * On success, returns the number of keys iterated. On error returns
+ * a negative enum TDB_ERROR value.
+ */
+#define tdb_traverse(tdb, fn, p) \
+ tdb_traverse_(tdb, typesafe_cb_preargs(int, void *, (fn), (p), \
+ struct tdb_context *, \
+ TDB_DATA, TDB_DATA), (p))
+
+int64_t tdb_traverse_(struct tdb_context *tdb,
+ int (*fn)(struct tdb_context *,
+ TDB_DATA, TDB_DATA, void *), void *p);
+
+/**
+ * tdb_parse_record - operate directly on data in the database.
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key whose record we should hand to @parse
+ * @parse: the function to call for the data
+ * @data: the private pointer to hand to @parse (types must match).
+ *
+ * This avoids a copy for many cases, by handing you a pointer into
+ * the memory-mapped database. It also locks the record to prevent
+ * other accesses at the same time.
+ *
+ * Do not alter the data handed to parse()!
+ */
+#define tdb_parse_record(tdb, key, parse, data) \
+ tdb_parse_record_((tdb), (key), \
+ typesafe_cb_preargs(enum TDB_ERROR, void *, \
+ (parse), (data), \
+ TDB_DATA, TDB_DATA), (data))
+
+enum TDB_ERROR tdb_parse_record_(struct tdb_context *tdb,
+ TDB_DATA key,
+ enum TDB_ERROR (*parse)(TDB_DATA k,
+ TDB_DATA d,
+ void *data),
+ void *data);
+
+/**
+ * tdb_get_seqnum - get a database sequence number
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns a sequence number: any change to the database from a
+ * tdb context opened with the TDB_SEQNUM flag will cause that number
+ * to increment. Note that the incrementing is unreliable (it is done
+ * without locking), so this is only useful as an optimization.
+ *
+ * For example, you may have a regular database backup routine which
+ * does not operate if the sequence number is unchanged. In the
+ * unlikely event of a failed increment, it will be backed up next
+ * time any way.
+ *
+ * Returns an enum TDB_ERROR (ie. negative) on error.
+ */
+int64_t tdb_get_seqnum(struct tdb_context *tdb);
+
+/**
+ * tdb_firstkey - get the "first" key in a TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: pointer to key.
+ *
+ * This returns an arbitrary key in the database; with tdb_nextkey() it allows
+ * open-coded traversal of the database, though it is slightly less efficient
+ * than tdb_traverse.
+ *
+ * It is your responsibility to free @key->dptr on success.
+ *
+ * Returns TDB_ERR_NOEXIST if the database is empty.
+ */
+enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key);
+
+/**
+ * tdb_nextkey - get the "next" key in a TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: a key returned by tdb_firstkey() or tdb_nextkey().
+ *
+ * This returns another key in the database; it will free @key.dptr for
+ * your convenience.
+ *
+ * Returns TDB_ERR_NOEXIST if there are no more keys.
+ */
+enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key);
+
+/**
+ * tdb_chainlock - lock a record in the TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to lock.
+ *
+ * This prevents any access occurring to a group of keys including @key,
+ * even if @key does not exist. This allows primitive atomic updates of
+ * records without using transactions.
+ *
+ * You cannot begin a transaction while holding a tdb_chainlock(), nor can
+ * you do any operations on any other keys in the database. This also means
+ * that you cannot hold more than one tdb_chainlock() at a time.
+ *
+ * See Also:
+ * tdb_chainunlock()
+ */
+enum TDB_ERROR tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_chainunlock - unlock a record in the TDB
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to unlock.
+ *
+ * The key must have previously been locked by tdb_chainlock().
+ */
+void tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_chainlock_read - lock a record in the TDB, for reading
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to lock.
+ *
+ * This prevents any changes from occurring to a group of keys including @key,
+ * even if @key does not exist. This allows primitive atomic updates of
+ * records without using transactions.
+ *
+ * You cannot begin a transaction while holding a tdb_chainlock_read(), nor can
+ * you do any operations on any other keys in the database. This also means
+ * that you cannot hold more than one tdb_chainlock()/read() at a time.
+ *
+ * See Also:
+ * tdb_chainlock()
+ */
+enum TDB_ERROR tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_chainunlock_read - unlock a record in the TDB for reading
+ * @tdb: the tdb context returned from tdb_open()
+ * @key: the key to unlock.
+ *
+ * The key must have previously been locked by tdb_chainlock_read().
+ */
+void tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * tdb_lockall - lock the entire TDB
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * You cannot hold a tdb_chainlock while calling this. It nests, so you
+ * must call tdb_unlockall as many times as you call tdb_lockall.
+ */
+enum TDB_ERROR tdb_lockall(struct tdb_context *tdb);
+
+/**
+ * tdb_unlockall - unlock the entire TDB
+ * @tdb: the tdb context returned from tdb_open()
+ */
+void tdb_unlockall(struct tdb_context *tdb);
+
+/**
+ * tdb_lockall_read - lock the entire TDB for reading
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This prevents others writing to the database, eg. tdb_delete, tdb_store,
+ * tdb_append, but not tdb_fetch.
+ *
+ * You cannot hold a tdb_chainlock while calling this. It nests, so you
+ * must call tdb_unlockall_read as many times as you call tdb_lockall_read.
+ */
+enum TDB_ERROR tdb_lockall_read(struct tdb_context *tdb);
+
+/**
+ * tdb_unlockall_read - unlock the entire TDB for reading
+ * @tdb: the tdb context returned from tdb_open()
+ */
+void tdb_unlockall_read(struct tdb_context *tdb);
+
+/**
+ * tdb_wipe_all - wipe the database clean
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * Completely erase the database. This is faster than iterating through
+ * each key and doing tdb_delete.
+ */
+enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb);
+
+/**
+ * tdb_check - check a TDB for consistency
+ * @tdb: the tdb context returned from tdb_open()
+ * @check: function to check each key/data pair (or NULL)
+ * @data: argument for @check, must match type.
+ *
+ * This performs a consistency check of the open database, optionally calling
+ * a check() function on each record so you can do your own data consistency
+ * checks as well. If check() returns an error, that is returned from
+ * tdb_check().
+ *
+ * Returns TDB_SUCCESS or an error.
+ */
+#define tdb_check(tdb, check, data) \
+ tdb_check_((tdb), typesafe_cb_preargs(enum TDB_ERROR, void *, \
+ (check), (data), \
+ struct tdb_data, \
+ struct tdb_data), \
+ (data))
+
+enum TDB_ERROR tdb_check_(struct tdb_context *tdb,
+ enum TDB_ERROR (*check)(struct tdb_data k,
+ struct tdb_data d,
+ void *data),
+ void *data);
+
+/**
+ * tdb_error - get the last error (not threadsafe)
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * Returns the last error returned by a TDB function.
+ *
+ * This makes porting from TDB1 easier, but note that the last error is not
+ * reliable in threaded programs.
+ */
+enum TDB_ERROR tdb_error(struct tdb_context *tdb);
+
+/**
+ * enum tdb_summary_flags - flags for tdb_summary.
+ */
+enum tdb_summary_flags {
+ TDB_SUMMARY_HISTOGRAMS = 1 /* Draw graphs in the summary. */
+};
+
+/**
+ * tdb_summary - return a string describing the TDB state
+ * @tdb: the tdb context returned from tdb_open()
+ * @flags: flags to control the summary output.
+ * @summary: pointer to string to allocate.
+ *
+ * This returns a developer-readable string describing the overall
+ * state of the tdb, such as the percentage used and sizes of records.
+ * It is designed to provide information about the tdb at a glance
+ * without displaying any keys or data in the database.
+ *
+ * On success, sets @summary to point to a malloc()'ed nul-terminated
+ * multi-line string. It is your responsibility to free() it.
+ */
+enum TDB_ERROR tdb_summary(struct tdb_context *tdb,
+ enum tdb_summary_flags flags,
+ char **summary);
+
+
+/**
+ * tdb_get_flags - return the flags for a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns the flags on the current tdb. Some of these are caused by
+ * the flags argument to tdb_open(), others (such as TDB_CONVERT) are
+ * intuited.
+ */
+unsigned int tdb_get_flags(struct tdb_context *tdb);
+
+/**
+ * tdb_add_flag - set a flag for a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @flag: one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or
+ *	TDB_ALLOW_NESTING.
+ *
+ * You can use this to set a flag on the TDB. You cannot set these flags
+ * on a TDB_INTERNAL tdb.
+ */
+void tdb_add_flag(struct tdb_context *tdb, unsigned flag);
+
+/**
+ * tdb_remove_flag - unset a flag for a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @flag: one of TDB_NOLOCK, TDB_NOMMAP, TDB_NOSYNC, TDB_SEQNUM or
+ *	TDB_ALLOW_NESTING.
+ *
+ * You can use this to clear a flag on the TDB. You cannot clear flags
+ * on a TDB_INTERNAL tdb.
+ */
+void tdb_remove_flag(struct tdb_context *tdb, unsigned flag);
+
+/**
+ * enum tdb_attribute_type - discriminator for union tdb_attribute.
+ */
+enum tdb_attribute_type {
+ TDB_ATTRIBUTE_LOG = 0,
+ TDB_ATTRIBUTE_HASH = 1,
+ TDB_ATTRIBUTE_SEED = 2,
+ TDB_ATTRIBUTE_STATS = 3,
+ TDB_ATTRIBUTE_OPENHOOK = 4,
+ TDB_ATTRIBUTE_FLOCK = 5
+};
+
+/**
+ * tdb_get_attribute - get an attribute for an existing tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @attr: the union tdb_attribute to get.
+ *
+ * This gets an attribute from a TDB which has previously been set (or
+ * may return the default values). Set @attr.base.attr to the
+ * attribute type you want to get.
+ *
+ * Currently this does not work for TDB_ATTRIBUTE_OPENHOOK.
+ */
+enum TDB_ERROR tdb_get_attribute(struct tdb_context *tdb,
+ union tdb_attribute *attr);
+
+/**
+ * tdb_set_attribute - set an attribute for an existing tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @attr: the union tdb_attribute to set.
+ *
+ * This sets an attribute on a TDB, overriding any previous attribute
+ * of the same type. It returns TDB_ERR_EINVAL if the attribute is
+ * unknown or invalid.
+ *
+ * Note that TDB_ATTRIBUTE_HASH, TDB_ATTRIBUTE_SEED and
+ * TDB_ATTRIBUTE_OPENHOOK cannot currently be set after tdb_open.
+ */
+enum TDB_ERROR tdb_set_attribute(struct tdb_context *tdb,
+ const union tdb_attribute *attr);
+
+/**
+ * tdb_unset_attribute - reset an attribute for an existing tdb
+ * @tdb: the tdb context returned from tdb_open()
+ * @type: the attribute type to unset.
+ *
+ * This unsets an attribute on a TDB, returning it to the defaults
+ * (where applicable).
+ *
+ * Note that it only makes sense for TDB_ATTRIBUTE_LOG and TDB_ATTRIBUTE_FLOCK
+ * to be unset.
+ */
+void tdb_unset_attribute(struct tdb_context *tdb,
+ enum tdb_attribute_type type);
+
+/**
+ * tdb_name - get the name of a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns a copy of the name string, made at tdb_open() time. If that
+ * argument was NULL (possible for a TDB_INTERNAL db) this will return NULL.
+ *
+ * This is mostly useful for logging.
+ */
+const char *tdb_name(const struct tdb_context *tdb);
+
+/**
+ * tdb_fd - get the file descriptor of a tdb
+ * @tdb: the tdb context returned from tdb_open()
+ *
+ * This returns the file descriptor for the underlying database file, or -1
+ * for TDB_INTERNAL.
+ */
+int tdb_fd(const struct tdb_context *tdb);
+
+/**
+ * struct tdb_attribute_base - common fields for all tdb attributes.
+ * @attr: says which member of union tdb_attribute this attribute is.
+ * @next: links to a further attribute, so several can be handed over
+ * as one chain (presumably NULL ends the chain; confirm against
+ * tdb_open()).
+ */
+struct tdb_attribute_base {
+ enum tdb_attribute_type attr;
+ union tdb_attribute *next;
+};
+
+/**
+ * enum tdb_log_level - log levels for tdb_attribute_log
+ * @TDB_LOG_ERROR: used to log unrecoverable errors such as I/O errors
+ * or internal consistency failures.
+ * @TDB_LOG_USE_ERROR: used to log usage errors such as invalid parameters
+ * or writing to a read-only database.
+ * @TDB_LOG_WARNING: used for informational messages on issues which
+ * are unusual but handled by TDB internally, such
+ * as a failure to mmap or failure to open /dev/urandom.
+ */
+enum tdb_log_level {
+ TDB_LOG_ERROR,
+ TDB_LOG_USE_ERROR,
+ TDB_LOG_WARNING
+};
+
+/**
+ * struct tdb_attribute_log - log function attribute
+ * @base: common header; base.attr must be TDB_ATTRIBUTE_LOG.
+ * @fn: callback taking the tdb, an enum tdb_log_level and the message.
+ * @data: opaque pointer (presumably handed back as the last argument
+ * of @fn; confirm against the caller of the hook).
+ *
+ * This attribute provides a hook for you to log errors.
+ */
+struct tdb_attribute_log {
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
+ void (*fn)(struct tdb_context *tdb,
+ enum tdb_log_level level,
+ const char *message,
+ void *data);
+ void *data;
+};
+
+/**
+ * struct tdb_attribute_hash - hash function attribute
+ *
+ * This attribute allows you to provide an alternative hash function.
+ * This hash function will be handed keys from the database; it will also
+ * be handed the 8-byte TDB_HASH_MAGIC value for checking the header (the
+ * tdb_open() will fail if the hash value doesn't match the header).
+ *
+ * Note that if your hash function gives different results on
+ * different machine endians, your tdb will no longer work across
+ * different architectures!
+ */
+struct tdb_attribute_hash {
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
+ uint64_t (*fn)(const void *key, size_t len, uint64_t seed,
+ void *data);
+ void *data;
+};
+
+/**
+ * struct tdb_attribute_seed - hash function seed attribute
+ *
+ * The hash function seed is normally taken from /dev/urandom (or equivalent)
+ * but can be set manually here. This is mainly for testing purposes.
+ */
+struct tdb_attribute_seed {
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_SEED */
+ uint64_t seed;
+};
+
+/**
+ * struct tdb_attribute_stats - tdb operational statistics
+ *
+ * This attribute records statistics of various low-level TDB operations.
+ * This can be used to assist performance evaluation. This is only
+ * useful for tdb_get_attribute().
+ *
+ * New fields will be added at the end, hence the "size" argument which
+ * indicates how large your structure is: it must be filled in before
+ * calling tdb_get_attribute(), which will overwrite it with the size
+ * tdb knows about.
+ */
+struct tdb_attribute_stats {
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_STATS */
+ size_t size; /* = sizeof(struct tdb_attribute_stats) */
+ uint64_t allocs;
+ uint64_t alloc_subhash;
+ uint64_t alloc_chain;
+ uint64_t alloc_bucket_exact;
+ uint64_t alloc_bucket_max;
+ uint64_t alloc_leftover;
+ uint64_t alloc_coalesce_tried;
+ uint64_t alloc_coalesce_iterate_clash;
+ uint64_t alloc_coalesce_lockfail;
+ uint64_t alloc_coalesce_race;
+ uint64_t alloc_coalesce_succeeded;
+ uint64_t alloc_coalesce_num_merged;
+ uint64_t compares;
+ uint64_t compare_wrong_bucket;
+ uint64_t compare_wrong_offsetbits;
+ uint64_t compare_wrong_keylen;
+ uint64_t compare_wrong_rechash;
+ uint64_t compare_wrong_keycmp;
+ uint64_t transactions;
+ uint64_t transaction_cancel;
+ uint64_t transaction_nest;
+ uint64_t transaction_expand_file;
+ uint64_t transaction_read_direct;
+ uint64_t transaction_read_direct_fail;
+ uint64_t transaction_write_direct;
+ uint64_t transaction_write_direct_fail;
+ uint64_t expands;
+ uint64_t frees;
+ uint64_t locks;
+ uint64_t lock_lowlevel;
+ uint64_t lock_nonblock;
+ uint64_t lock_nonblock_fail;
+};
+
+/**
+ * struct tdb_attribute_openhook - tdb special effects hook for open
+ *
+ * This attribute contains a function to call once we have the OPEN_LOCK
+ * for the tdb, but before we've examined its contents. If this succeeds,
+ * the tdb will be populated if it's then zero-length.
+ *
+ * This is a hack to allow support for TDB1-style TDB_CLEAR_IF_FIRST
+ * behaviour.
+ */
+struct tdb_attribute_openhook {
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_OPENHOOK */
+ enum TDB_ERROR (*fn)(int fd, void *data);
+ void *data;
+};
+
+/**
+ * struct tdb_attribute_flock - tdb special effects hook for file locking
+ * @base: common header; base.attr must be TDB_ATTRIBUTE_FLOCK.
+ * @lock: function called to place a lock on the file.
+ * @unlock: function called to remove a lock from the file.
+ * @data: opaque pointer (presumably passed as the final void * argument
+ * of both functions; confirm against the locking code).
+ *
+ * This attribute contains functions to call to place locks on a file; it can
+ * be used to support non-blocking operations or lock proxying.
+ *
+ * They should return 0 on success, -1 on failure and set errno.
+ *
+ * If they fail, an error is logged unless errno is EAGAIN or EINTR
+ * (normally it would only return EAGAIN if waitflag is false, and
+ * loop internally on EINTR).
+ *
+ * NOTE(review): rw looks like the fcntl lock type (F_RDLCK/F_WRLCK) and
+ * off/len like the byte range to lock; confirm against the locking code.
+ */
+struct tdb_attribute_flock {
+ struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_FLOCK */
+ int (*lock)(int fd,int rw, off_t off, off_t len, bool waitflag, void *);
+ int (*unlock)(int fd, int rw, off_t off, off_t len, void *);
+ void *data;
+};
+
+/**
+ * union tdb_attribute - tdb attributes.
+ *
+ * This represents all the known attributes.
+ *
+ * See also:
+ * struct tdb_attribute_log, struct tdb_attribute_hash,
+ * struct tdb_attribute_seed, struct tdb_attribute_stats,
+ * struct tdb_attribute_openhook, struct tdb_attribute_flock.
+ */
+union tdb_attribute {
+ struct tdb_attribute_base base;
+ struct tdb_attribute_log log;
+ struct tdb_attribute_hash hash;
+ struct tdb_attribute_seed seed;
+ struct tdb_attribute_stats stats;
+ struct tdb_attribute_openhook openhook;
+ struct tdb_attribute_flock flock;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* tdb2.h */
diff --git a/lib/tdb2/test/external-agent.c b/lib/tdb2/test/external-agent.c
new file mode 100644
index 00000000000..055b5de736f
--- /dev/null
+++ b/lib/tdb2/test/external-agent.c
@@ -0,0 +1,250 @@
+#include "external-agent.h"
+#include "logging.h"
+#include "lock-tracking.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+#include <ccan/tdb2/private.h>
+#include <ccan/tap/tap.h>
+#include <stdio.h>
+#include <stdarg.h>
+
+static struct tdb_context *tdb;
+
+/*
+ * Open hook used for OPEN_WITH_HOOK: truncates the file if nobody else
+ * holds the marker byte, then keeps a read lock on that byte.
+ * @fd: descriptor of the database file.
+ * @arg: unused.
+ *
+ * Returns TDB_SUCCESS, or TDB_ERR_IO if truncation or locking fails.
+ */
+static enum TDB_ERROR clear_if_first(int fd, void *arg)
+{
+	/* We hold a lock offset 63 always, so we can tell if anyone is holding it. */
+	struct flock marker;
+
+	marker.l_whence = SEEK_SET;
+	marker.l_start = 63;
+	marker.l_len = 1;
+	marker.l_type = F_WRLCK;
+
+	/* Winning the write lock without waiting means we are the first opener. */
+	if (fcntl(fd, F_SETLK, &marker) == 0) {
+		diag("agent truncating file!");
+		if (ftruncate(fd, 0) != 0)
+			return TDB_ERR_IO;
+	}
+
+	/* Either way, end up holding a read lock on the marker byte. */
+	marker.l_type = F_RDLCK;
+	return fcntl(fd, F_SETLKW, &marker) == 0 ? TDB_SUCCESS : TDB_ERR_IO;
+}
+
+/*
+ * Carry out one request inside the agent process.
+ * @op: the operation requested by the parent.
+ * @name: tdb filename for OPEN/OPEN_WITH_HOOK; for FETCH/STORE it is used
+ * as both the record key and the record value.
+ *
+ * Returns OTHER_FAILURE straight away if an operation other than an open
+ * arrives while no tdb is open, or if an open arrives while one already is.
+ * Otherwise returns WOULD_HAVE_BLOCKED if locking_would_block was raised
+ * during the operation, else the operation's own result.
+ */
+static enum agent_return do_operation(enum operation op, const char *name)
+{
+	TDB_DATA k;
+	enum agent_return ret;
+	TDB_DATA data;
+	enum TDB_ERROR ecode;
+	union tdb_attribute cif;
+
+	if (op != OPEN && op != OPEN_WITH_HOOK && !tdb) {
+		diag("external: No tdb open!");
+		return OTHER_FAILURE;
+	}
+
+	diag("external: %s", operation_name(op));
+
+	k = tdb_mkdata(name, strlen(name));
+
+	locking_would_block = 0;
+	switch (op) {
+	case OPEN:
+		if (tdb) {
+			diag("Already have tdb %s open", tdb->name);
+			return OTHER_FAILURE;
+		}
+		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &tap_log_attr);
+		if (!tdb) {
+			if (!locking_would_block)
+				diag("Opening tdb gave %s", strerror(errno));
+			forget_locking();
+			ret = OTHER_FAILURE;
+		} else
+			ret = SUCCESS;
+		break;
+	case OPEN_WITH_HOOK:
+		if (tdb) {
+			diag("Already have tdb %s open", tdb->name);
+			return OTHER_FAILURE;
+		}
+		cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
+		cif.openhook.base.next = &tap_log_attr;
+		cif.openhook.fn = clear_if_first;
+		/* The hook ignores its argument, but never hand over garbage. */
+		cif.openhook.data = NULL;
+		tdb = tdb_open(name, TDB_DEFAULT, O_RDWR, 0, &cif);
+		if (!tdb) {
+			if (!locking_would_block)
+				diag("Opening tdb gave %s", strerror(errno));
+			forget_locking();
+			ret = OTHER_FAILURE;
+		} else
+			ret = SUCCESS;
+		break;
+	case FETCH:
+		ecode = tdb_fetch(tdb, k, &data);
+		if (ecode == TDB_ERR_NOEXIST) {
+			ret = FAILED;
+		} else if (ecode < 0) {
+			ret = OTHER_FAILURE;
+		} else if (!tdb_deq(data, k)) {
+			/* STORE writes the key as the value, so they must match. */
+			ret = OTHER_FAILURE;
+			free(data.dptr);
+		} else {
+			ret = SUCCESS;
+			free(data.dptr);
+		}
+		break;
+	case STORE:
+		ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case TRANSACTION_START:
+		ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case TRANSACTION_COMMIT:
+		ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case NEEDS_RECOVERY:
+		ret = tdb_needs_recovery(tdb) ? SUCCESS : FAILED;
+		break;
+	case CHECK:
+		ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
+		break;
+	case CLOSE:
+		ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
+		tdb = NULL;
+		break;
+	case SEND_SIGNAL:
+		/* We do this async */
+		ret = SUCCESS;
+		break;
+	default:
+		ret = OTHER_FAILURE;
+	}
+
+	/* A lock we refused to wait for overrides whatever the operation said. */
+	if (locking_would_block)
+		ret = WOULD_HAVE_BLOCKED;
+
+	return ret;
+}
+
+/*
+ * Parent-side handle on the forked agent: cmdfd is the write end of the
+ * command pipe, responsefd the read end of the response pipe.
+ */
+struct agent {
+ int cmdfd, responsefd;
+};
+
+/*
+ * Do this before doing any tdb stuff. Return handle, or NULL.
+ *
+ * Forks a child that serves requests arriving on a command pipe: each
+ * request is one byte of enum operation followed by a NUL-terminated name,
+ * and each reply is one enum agent_return. The child never returns from
+ * this function; it exits when the command pipe is closed.
+ *
+ * On failure every pipe descriptor opened here is closed again.
+ */
+struct agent *prepare_external_agent(void)
+{
+	int pid, ret;
+	int command[2], response[2];
+	/* One spare byte so a terminator can always be appended. */
+	char name[1+PATH_MAX+1];
+
+	if (pipe(command) != 0)
+		return NULL;
+	if (pipe(response) != 0) {
+		close(command[0]);
+		close(command[1]);
+		return NULL;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		close(command[0]);
+		close(command[1]);
+		close(response[0]);
+		close(response[1]);
+		return NULL;
+	}
+
+	if (pid != 0) {
+		struct agent *agent = malloc(sizeof(*agent));
+
+		close(command[0]);
+		close(response[1]);
+		if (!agent) {
+			/* Closing the command pipe makes the child see EOF and exit. */
+			close(command[1]);
+			close(response[0]);
+			return NULL;
+		}
+		agent->cmdfd = command[1];
+		agent->responsefd = response[0];
+		return agent;
+	}
+
+	close(command[1]);
+	close(response[0]);
+
+	/* We want to fail, not block. */
+	nonblocking_locks = true;
+	log_prefix = "external: ";
+	while ((ret = read(command[0], name, sizeof(name) - 1)) > 0) {
+		enum agent_return result;
+
+		/* do_operation() runs strlen() on the name: make sure it ends. */
+		name[ret] = '\0';
+
+		result = do_operation(name[0], name+1);
+		if (write(response[1], &result, sizeof(result))
+		    != sizeof(result))
+			err(1, "Writing response");
+		if (name[0] == SEND_SIGNAL) {
+			/* Signal the parent only after it has had time to move on. */
+			struct timeval ten_ms;
+			ten_ms.tv_sec = 0;
+			ten_ms.tv_usec = 10000;
+			select(0, NULL, NULL, NULL, &ten_ms);
+			kill(getppid(), SIGUSR1);
+		}
+	}
+	exit(0);
+}
+
+/*
+ * Ask the external agent to try to do an operation.
+ * @agent: handle from prepare_external_agent().
+ * @op: the operation to request.
+ * @name: tdb or record name; NULL is sent as the empty string.
+ *
+ * The request is sent with a single write (the operation byte followed by
+ * the NUL-terminated name). Returns the agent's reply, or AGENT_DIED if
+ * the request could not be written or the reply could not be read in full.
+ * Exits via err() if the request buffer cannot be allocated.
+ */
+enum agent_return external_agent_operation(struct agent *agent,
+					   enum operation op,
+					   const char *name)
+{
+	enum agent_return res;
+	size_t len;
+	char *string;
+
+	if (!name)
+		name = "";
+	len = 1 + strlen(name) + 1;
+	string = malloc(len);
+	if (!string)
+		err(1, "Allocating %zu bytes for agent command", len);
+
+	string[0] = op;
+	strcpy(string+1, name);
+
+	if (write(agent->cmdfd, string, len) != (ssize_t)len
+	    || read(agent->responsefd, &res, sizeof(res))
+	    != (ssize_t)sizeof(res))
+		res = AGENT_DIED;
+
+	free(string);
+	return res;
+}
+
+/* Map an enum agent_return to its name; unknown values give "**INVALID**". */
+const char *agent_return_name(enum agent_return ret)
+{
+	switch (ret) {
+	case SUCCESS:
+		return "SUCCESS";
+	case WOULD_HAVE_BLOCKED:
+		return "WOULD_HAVE_BLOCKED";
+	case AGENT_DIED:
+		return "AGENT_DIED";
+	case FAILED:
+		return "FAILED";
+	case OTHER_FAILURE:
+		return "OTHER_FAILURE";
+	default:
+		return "**INVALID**";
+	}
+}
+
+/* Map an enum operation to its name; unknown values give "**INVALID**". */
+const char *operation_name(enum operation op)
+{
+	static const char *const names[] = {
+		[OPEN] = "OPEN",
+		[OPEN_WITH_HOOK] = "OPEN_WITH_HOOK",
+		[FETCH] = "FETCH",
+		[STORE] = "STORE",
+		[CHECK] = "CHECK",
+		[TRANSACTION_START] = "TRANSACTION_START",
+		[TRANSACTION_COMMIT] = "TRANSACTION_COMMIT",
+		[NEEDS_RECOVERY] = "NEEDS_RECOVERY",
+		[SEND_SIGNAL] = "SEND_SIGNAL",
+		[CLOSE] = "CLOSE",
+	};
+	unsigned int idx = (unsigned int)op;
+
+	/* Negative values wrap to large indices and are rejected here too. */
+	if (idx < sizeof(names) / sizeof(names[0]) && names[idx])
+		return names[idx];
+	return "**INVALID**";
+}
+
+/*
+ * Close both pipe ends held by the parent and free the handle.
+ * A NULL @agent (as returned by a failed prepare_external_agent()) is
+ * ignored.
+ */
+void free_external_agent(struct agent *agent)
+{
+	if (!agent)
+		return;
+	close(agent->cmdfd);
+	close(agent->responsefd);
+	free(agent);
+}
diff --git a/lib/tdb2/test/external-agent.h b/lib/tdb2/test/external-agent.h
new file mode 100644
index 00000000000..9eada107508
--- /dev/null
+++ b/lib/tdb2/test/external-agent.h
@@ -0,0 +1,43 @@
+#ifndef TDB2_TEST_EXTERNAL_AGENT_H
+#define TDB2_TEST_EXTERNAL_AGENT_H
+
+/* For locking tests, we need a different process to try things at
+ * various times. */
+enum operation {
+ OPEN,
+ OPEN_WITH_HOOK,
+ FETCH,
+ STORE,
+ TRANSACTION_START,
+ TRANSACTION_COMMIT,
+ NEEDS_RECOVERY,
+ CHECK,
+ SEND_SIGNAL,
+ CLOSE,
+};
+
+/* Do this before doing any tdb stuff. Return handle, or NULL. */
+struct agent *prepare_external_agent(void);
+
+enum agent_return {
+ SUCCESS,
+ WOULD_HAVE_BLOCKED,
+ AGENT_DIED,
+ FAILED, /* For fetch, or NEEDS_RECOVERY */
+ OTHER_FAILURE,
+};
+
+/* Ask the external agent to try to do an operation.
+ * name == tdb name for OPEN/OPEN_WITH_HOOK,
+ * record name for FETCH/STORE (store stores name as data too)
+ */
+enum agent_return external_agent_operation(struct agent *handle,
+ enum operation op,
+ const char *name);
+
+/* Mapping enum -> string. */
+const char *agent_return_name(enum agent_return ret);
+const char *operation_name(enum operation op);
+
+void free_external_agent(struct agent *agent);
+#endif /* TDB2_TEST_EXTERNAL_AGENT_H */
diff --git a/lib/tdb2/test/failtest_helper.c b/lib/tdb2/test/failtest_helper.c
new file mode 100644
index 00000000000..1358a6c6b26
--- /dev/null
+++ b/lib/tdb2/test/failtest_helper.c
@@ -0,0 +1,117 @@
+#include "failtest_helper.h"
+#include "logging.h"
+#include <string.h>
+#include <ccan/tap/tap.h>
+
+/* FIXME: From ccan/str */
+/* True if @str ends with @postfix (every string ends with ""). */
+static inline bool strends(const char *str, const char *postfix)
+{
+	size_t str_len = strlen(str);
+	size_t post_len = strlen(postfix);
+
+	if (post_len > str_len)
+		return false;
+
+	return strcmp(str + (str_len - post_len), postfix) == 0;
+}
+
+/*
+ * Does @call come from the given call site?
+ * The type and line must be equal; the file matches if it equals @file
+ * or ends with "/" followed by @file.
+ */
+bool failmatch(const struct failtest_call *call,
+	       const char *file, int line, enum failtest_call_type type)
+{
+	size_t dir_len;
+
+	if (call->type != type || call->line != line)
+		return false;
+	if (strcmp(call->file, file) == 0)
+		return true;
+	if (!strends(call->file, file))
+		return false;
+
+	/* A proper suffix: call->file is longer, so dir_len is at least 1. */
+	dir_len = strlen(call->file) - strlen(file);
+	return call->file[dir_len - 1] == '/';
+}
+
+/*
+ * Return the earliest entry in [@start, @end) made from the same call
+ * site as @call, or NULL if there is none.
+ */
+static const struct failtest_call *
+find_repeat(const struct failtest_call *start, const struct failtest_call *end,
+	    const struct failtest_call *call)
+{
+	const struct failtest_call *cur = start;
+
+	while (cur < end) {
+		if (failmatch(cur, call->file, call->line, call->type))
+			return cur;
+		cur++;
+	}
+	return NULL;
+}
+
+/* True if @call is an fcntl() whose command is F_SETLK. */
+static bool is_nonblocking_lock(const struct failtest_call *call)
+{
+	if (call->type != FAILTEST_FCNTL)
+		return false;
+	return call->u.fcntl.cmd == F_SETLK;
+}
+
+/* True if @call is an fcntl() whose flock argument has type F_UNLCK. */
+static bool is_unlock(const struct failtest_call *call)
+{
+	if (call->type != FAILTEST_FCNTL)
+		return false;
+	return call->u.fcntl.arg.fl.l_type == F_UNLCK;
+}
+
+/*
+ * Check that an injected failure was logged.
+ * Walks @history to the first failed call that is expected to log and
+ * returns whether any message was logged (tap_log_messages != 0), printing
+ * a diagnostic if not. Returns true if there is no such call.
+ */
+bool exit_check_log(struct failtest_call *history, unsigned num)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < num; idx++) {
+		const struct failtest_call *call = &history[idx];
+
+		if (!call->fail)
+			continue;
+
+		/*
+		 * These failures do not have to log: the /dev/urandom open and
+		 * read (we fall back), the initial allocation of the tdb, and
+		 * "failures" of non-blocking locks.
+		 */
+		if (failmatch(call, URANDOM_OPEN)
+		    || failmatch(call, URANDOM_READ)
+		    || failmatch(call, INITIAL_TDB_MALLOC)
+		    || is_nonblocking_lock(call))
+			continue;
+
+		if (tap_log_messages != 0)
+			return true;
+
+		diag("We didn't log for %u (%s:%u)",
+		     idx, call->file, call->line);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Did any call before @last, made from the same call site as @last,
+ * satisfy @pred?
+ */
+static bool repeated_before(const struct failtest_call *history,
+			    const struct failtest_call *last,
+			    bool (*pred)(const struct failtest_call *))
+{
+	const struct failtest_call *i;
+
+	for (i = history; i < last; i++) {
+		if (failmatch(i, last->file, last->line, last->type)
+		    && pred(i))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Some places we soldier on despite errors: only fail them once.
+ *
+ * Decides about the newest call, history[num-1] (so @num must be at
+ * least 1):
+ * - the initial tdb malloc and the /dev/urandom open/read give
+ *   FAIL_DONT_FAIL if the same call site appears earlier in @history,
+ *   else FAIL_PROBE;
+ * - an unlock (or a non-blocking lock) gives FAIL_DONT_FAIL if ANY earlier
+ *   call from the same site was also an unlock (or non-blocking lock),
+ *   else FAIL_PROBE. Every earlier call is examined, not only the first
+ *   one from that site;
+ * - everything else gives FAIL_OK.
+ */
+enum failtest_result
+block_repeat_failures(struct failtest_call *history, unsigned num)
+{
+	const struct failtest_call *last = &history[num-1];
+
+	if (failmatch(last, INITIAL_TDB_MALLOC)
+	    || failmatch(last, URANDOM_OPEN)
+	    || failmatch(last, URANDOM_READ)) {
+		if (find_repeat(history, last, last))
+			return FAIL_DONT_FAIL;
+		return FAIL_PROBE;
+	}
+
+	/* Unlock or non-blocking lock is fail-once. */
+	if (is_unlock(last)) {
+		if (repeated_before(history, last, is_unlock))
+			return FAIL_DONT_FAIL;
+		return FAIL_PROBE;
+	} else if (is_nonblocking_lock(last)) {
+		if (repeated_before(history, last, is_nonblocking_lock))
+			return FAIL_DONT_FAIL;
+		return FAIL_PROBE;
+	}
+
+	return FAIL_OK;
+}
diff --git a/lib/tdb2/test/failtest_helper.h b/lib/tdb2/test/failtest_helper.h
new file mode 100644
index 00000000000..a62efbad58e
--- /dev/null
+++ b/lib/tdb2/test/failtest_helper.h
@@ -0,0 +1,17 @@
+#ifndef TDB2_TEST_FAILTEST_HELPER_H
+#define TDB2_TEST_FAILTEST_HELPER_H
+#include <ccan/failtest/failtest.h>
+#include <stdbool.h>
+
+/* FIXME: Check these! */
+#define INITIAL_TDB_MALLOC "open.c", 338, FAILTEST_MALLOC
+#define URANDOM_OPEN "open.c", 45, FAILTEST_OPEN
+#define URANDOM_READ "open.c", 25, FAILTEST_READ
+
+bool exit_check_log(struct failtest_call *history, unsigned num);
+bool failmatch(const struct failtest_call *call,
+ const char *file, int line, enum failtest_call_type type);
+enum failtest_result
+block_repeat_failures(struct failtest_call *history, unsigned num);
+
+#endif /* TDB2_TEST_FAILTEST_HELPER_H */
diff --git a/lib/tdb2/test/layout.c b/lib/tdb2/test/layout.c
new file mode 100644
index 00000000000..31889ad080c
--- /dev/null
+++ b/lib/tdb2/test/layout.c
@@ -0,0 +1,348 @@
+/* TDB tools to create various canned database layouts. */
+#include "layout.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <err.h>
+#include "logging.h"
+
+/*
+ * Start an empty layout description.
+ * @filename: stored as given (not copied); tdb_layout_get() writes the
+ * image to this file when it is non-NULL.
+ *
+ * Exits via err() if the allocation fails. Release with tdb_layout_free().
+ */
+struct tdb_layout *new_tdb_layout(const char *filename)
+{
+	struct tdb_layout *layout = malloc(sizeof(*layout));
+
+	if (!layout)
+		err(1, "allocating tdb_layout");
+	layout->filename = filename;
+	layout->num_elems = 0;
+	layout->elem = NULL;
+	return layout;
+}
+
+/*
+ * Append @elem to the layout, growing the element array by one.
+ * Exits via err() if the array cannot be grown; the old array is never
+ * lost through a failed realloc().
+ */
+static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
+{
+	union tdb_layout_elem *grown;
+
+	grown = realloc(layout->elem,
+			sizeof(layout->elem[0])
+			* (layout->num_elems+1));
+	if (!grown)
+		err(1, "growing layout to %u elements",
+		    layout->num_elems + 1);
+	layout->elem = grown;
+	layout->elem[layout->num_elems++] = elem;
+}
+
+/* Append a free table element to the layout. */
+void tdb_layout_add_freetable(struct tdb_layout *layout)
+{
+	union tdb_layout_elem ftable_elem;
+
+	ftable_elem.base.type = FREETABLE;
+	add(layout, ftable_elem);
+}
+
+/*
+ * Append a free record of @len bytes (excluding its header) which will be
+ * entered into free table number @ftable.
+ */
+void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
+			 unsigned ftable)
+{
+	union tdb_layout_elem free_elem;
+
+	free_elem.base.type = FREE;
+	free_elem.free.ftable_num = ftable;
+	free_elem.free.len = len;
+	add(layout, free_elem);
+}
+
+/*
+ * Return a heap copy of @key's bytes (used for both keys and data).
+ * An empty buffer is copied without touching memory, so a NULL result
+ * from malloc(0) is harmless. Exits via err() if a non-empty copy cannot
+ * be allocated. The caller frees ret.dptr.
+ */
+static struct tdb_data dup_key(struct tdb_data key)
+{
+	struct tdb_data ret;
+
+	ret.dsize = key.dsize;
+	ret.dptr = malloc(ret.dsize);
+	if (ret.dsize != 0) {
+		if (!ret.dptr)
+			err(1, "duplicating %zu bytes", (size_t)ret.dsize);
+		memcpy(ret.dptr, key.dptr, ret.dsize);
+	}
+	return ret;
+}
+
+/*
+ * Append a used record holding copies of @key and @data, followed by
+ * @extra bytes of padding.
+ */
+void tdb_layout_add_used(struct tdb_layout *layout,
+			 TDB_DATA key, TDB_DATA data,
+			 tdb_len_t extra)
+{
+	union tdb_layout_elem used_elem;
+
+	used_elem.base.type = DATA;
+	used_elem.used.extra = extra;
+	used_elem.used.key = dup_key(key);
+	used_elem.used.data = dup_key(data);
+	add(layout, used_elem);
+}
+
+/* Bytes taken by a free record with @len bytes after its header. */
+static tdb_len_t free_record_len(tdb_len_t len)
+{
+	const tdb_len_t header = sizeof(struct tdb_used_record);
+
+	return header + len;
+}
+
+/*
+ * Bytes taken by a used record: header, key, data and padding.
+ * Asserts that the record is at least as large as a free record.
+ */
+static tdb_len_t data_record_len(struct tle_used *used)
+{
+	tdb_len_t total = sizeof(struct tdb_used_record)
+		+ used->key.dsize + used->data.dsize + used->extra;
+
+	assert(total >= sizeof(struct tdb_free_record));
+	return total;
+}
+
+/* Bytes taken by a sub-level hash table: header, buckets and padding. */
+static tdb_len_t hashtable_len(struct tle_hashtable *htable)
+{
+	tdb_len_t total = sizeof(struct tdb_used_record)
+		+ (sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS)
+		+ htable->extra;
+
+	return total;
+}
+
+/* Bytes taken by a free table; the same for every table. */
+static tdb_len_t freetable_len(struct tle_freetable *ftable)
+{
+	(void)ftable;
+	return sizeof(struct tdb_freetable);
+}
+
+/* Placeholder: free records are written later, by add_to_freetable(). */
+static void set_free_record(void *mem, tdb_len_t len)
+{
+	(void)mem;
+	(void)len;
+}
+
+/*
+ * If the record has padding (@extra != 0), write a single NUL byte
+ * @len bytes past the record header.
+ */
+static void add_zero_pad(struct tdb_used_record *u, size_t len, size_t extra)
+{
+	char *payload;
+
+	if (extra == 0)
+		return;
+
+	payload = (char *)(u + 1);
+	payload[len] = '\0';
+}
+
+/*
+ * Write a used record at @mem: header, then the key bytes, then the data
+ * bytes, then the padding marker.
+ */
+static void set_data_record(void *mem, struct tdb_context *tdb,
+			    struct tle_used *used)
+{
+	struct tdb_used_record *rec = mem;
+	char *payload = (char *)(rec + 1);
+	size_t keylen = used->key.dsize;
+	size_t datalen = used->data.dsize;
+
+	set_header(tdb, rec, TDB_USED_MAGIC, keylen, datalen,
+		   keylen + datalen + used->extra,
+		   tdb_hash(tdb, used->key.dptr, keylen));
+	memcpy(payload, used->key.dptr, keylen);
+	memcpy(payload + keylen, used->data.dptr, datalen);
+	add_zero_pad(rec, keylen + datalen, used->extra);
+}
+
+/* Write an empty (all-zero) sub-level hash table record at @mem. */
+static void set_hashtable(void *mem, struct tdb_context *tdb,
+			  struct tle_hashtable *htable)
+{
+	struct tdb_used_record *rec = mem;
+	const tdb_len_t table_bytes = sizeof(tdb_off_t) << TDB_SUBLEVEL_HASH_BITS;
+
+	set_header(tdb, rec, TDB_HTABLE_MAGIC, 0, table_bytes,
+		   table_bytes + htable->extra, 0);
+	memset(rec + 1, 0, table_bytes);
+	add_zero_pad(rec, table_bytes, htable->extra);
+}
+
+/*
+ * Write a zeroed free table at @mem and link it in: after the previous
+ * table if @last_ftable is non-zero, otherwise from the header's
+ * free_table field.
+ */
+static void set_freetable(void *mem, struct tdb_context *tdb,
+			  struct tle_freetable *freetable, struct tdb_header *hdr,
+			  tdb_off_t last_ftable)
+{
+	struct tdb_freetable *table = mem;
+	struct tdb_freetable *prev;
+
+	memset(table, 0, sizeof(*table));
+	set_header(tdb, &table->hdr, TDB_FTABLE_MAGIC, 0,
+		   sizeof(*table) - sizeof(table->hdr),
+		   sizeof(*table) - sizeof(table->hdr), 0);
+
+	if (!last_ftable) {
+		hdr->free_table = freetable->base.off;
+		return;
+	}
+
+	/* Offsets are relative to the start of the image, where hdr lives. */
+	prev = (struct tdb_freetable *)((char *)hdr + last_ftable);
+	prev->next = freetable->base.off;
+}
+
+/*
+ * Point the tdb at free table @ftable and add the free record at @eoff
+ * (payload length @elen) to it with add_free_record().
+ */
+static void add_to_freetable(struct tdb_context *tdb,
+			     tdb_off_t eoff,
+			     tdb_off_t elen,
+			     unsigned ftable,
+			     struct tle_freetable *freetable)
+{
+	const tdb_len_t total_len = sizeof(struct tdb_used_record) + elen;
+
+	tdb->ftable = ftable;
+	tdb->ftable_off = freetable->base.off;
+	add_free_record(tdb, eoff, total_len, TDB_LOCK_WAIT, false);
+}
+
+/* Offset of bucket @ingroup (taken modulo the group size) in a group. */
+static tdb_off_t hbucket_off(tdb_off_t group_start, unsigned ingroup)
+{
+	unsigned slot = ingroup % (1 << TDB_HASH_GROUP_BITS);
+
+	return group_start + slot * sizeof(tdb_off_t);
+}
+
+/*
+ * Get bits from a value: returns the @num bits of @val starting at bit
+ * @start (bit 0 is the least significant). @num may be 0..32.
+ */
+static uint32_t bits(uint64_t val, unsigned start, unsigned num)
+{
+	assert(num <= 32);
+	/* Build the mask in 64 bits: a 32-bit shift by 32 is undefined. */
+	return (val >> start) & (((uint64_t)1 << num) - 1);
+}
+
+/*
+ * We take bits from the top: that way we can lock whole sections of the hash
+ * by using lock ranges.
+ *
+ * Consumes the next @num bits of @h below the @*used already taken, and
+ * advances @*used by @num.
+ */
+static uint32_t use_bits(uint64_t h, unsigned num, unsigned *used)
+{
+	unsigned consumed = *used + num;
+
+	*used = consumed;
+	return bits(h, 64 - consumed, num);
+}
+
+/*
+ * Build a hash bucket entry: the record offset ORed with the bucket
+ * number and with the top TDB_OFF_UPPER_STEAL_EXTRA bits of the hash
+ * shifted up to TDB_OFF_HASH_EXTRA_BIT.
+ */
+static tdb_off_t encode_offset(tdb_off_t new_off, unsigned bucket,
+			       uint64_t h)
+{
+	uint64_t hash_extra = bits(h, 64 - TDB_OFF_UPPER_STEAL_EXTRA,
+				   TDB_OFF_UPPER_STEAL_EXTRA);
+
+	return bucket | new_off | (hash_extra << TDB_OFF_HASH_EXTRA_BIT);
+}
+
+/* FIXME: Our hash table handling here is primitive: we don't expand! */
+/*
+ * Enter the record at @eoff, whose key is @key, into the top-level hash
+ * table in the header. The first free bucket in the key's group is used,
+ * probing from the key's own bucket and wrapping around; abort()s if the
+ * whole group is occupied.
+ */
+static void add_to_hashtable(struct tdb_context *tdb,
+ tdb_off_t eoff,
+ struct tdb_data key)
+{
+ uint64_t h = tdb_hash(tdb, key.dptr, key.dsize);
+ tdb_off_t b_off, group_start;
+ unsigned i, group, in_group;
+ unsigned used = 0;
+
+ /* Order matters: the top hash bits pick the group, the next ones the
+ * bucket within it. */
+ group = use_bits(h, TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS, &used);
+ in_group = use_bits(h, TDB_HASH_GROUP_BITS, &used);
+
+ group_start = offsetof(struct tdb_header, hashtable)
+ + group * (sizeof(tdb_off_t) << TDB_HASH_GROUP_BITS);
+
+ for (i = 0; i < (1 << TDB_HASH_GROUP_BITS); i++) {
+ unsigned bucket = (in_group + i) % (1 << TDB_HASH_GROUP_BITS);
+
+ b_off = hbucket_off(group_start, bucket);
+ /* A zero entry is treated as an empty bucket. */
+ if (tdb_read_off(tdb, b_off) == 0) {
+ tdb_write_off(tdb, b_off,
+ encode_offset(eoff, bucket, h));
+ return;
+ }
+ }
+ /* Group is full and this helper cannot expand it. */
+ abort();
+}
+
+/*
+ * Return the @num'th (counting from 0) free table element of the layout;
+ * abort()s if there are not that many.
+ */
+static struct tle_freetable *find_ftable(struct tdb_layout *layout, unsigned num)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < layout->num_elems; idx++) {
+		union tdb_layout_elem *e = &layout->elem[idx];
+
+		/* Count down over free tables only. */
+		if (e->base.type == FREETABLE && num-- == 0)
+			return &e->ftable;
+	}
+	abort();
+}
+
+/* FIXME: Support TDB_CONVERT */
+/*
+ * Build a tdb matching @layout.
+ *
+ * The image is laid out in memory behind a TDB_INTERNAL tdb's header:
+ * offsets are assigned first, then every element is written, then free
+ * records and used records are entered into the free and hash tables.
+ * The layout must contain at least one free table (asserted).
+ *
+ * If layout->filename is set, the image is written to that file, the
+ * in-memory tdb is closed and the file is reopened with TDB_NOMMAP; the
+ * result of that tdb_open() is returned as is. Otherwise the in-memory
+ * tdb is returned.
+ *
+ * Exits via err() if the image cannot be allocated, the internal tdb
+ * cannot be opened, or the file cannot be written.
+ */
+struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
+{
+	unsigned int i;
+	tdb_off_t off, len, last_ftable;
+	char *mem;
+	struct tdb_context *tdb;
+
+	off = sizeof(struct tdb_header);
+
+	/* First pass of layout: calc lengths */
+	for (i = 0; i < layout->num_elems; i++) {
+		union tdb_layout_elem *e = &layout->elem[i];
+		e->base.off = off;
+		switch (e->base.type) {
+		case FREETABLE:
+			len = freetable_len(&e->ftable);
+			break;
+		case FREE:
+			len = free_record_len(e->free.len);
+			break;
+		case DATA:
+			len = data_record_len(&e->used);
+			break;
+		case HASHTABLE:
+			len = hashtable_len(&e->hashtable);
+			break;
+		default:
+			abort();
+		}
+		off += len;
+	}
+
+	mem = malloc(off);
+	if (!mem)
+		err(1, "allocating layout image");
+	/* Fill with some weird pattern. */
+	memset(mem, 0x99, off);
+	/* Now populate our header, cribbing from a real TDB header. */
+	tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, &tap_log_attr);
+	if (!tdb)
+		err(1, "opening internal tdb for layout");
+	memcpy(mem, tdb->file->map_ptr, sizeof(struct tdb_header));
+
+	/* Mug the tdb we have to make it use this. */
+	free(tdb->file->map_ptr);
+	tdb->file->map_ptr = mem;
+	tdb->file->map_size = off;
+
+	last_ftable = 0;
+	for (i = 0; i < layout->num_elems; i++) {
+		union tdb_layout_elem *e = &layout->elem[i];
+		switch (e->base.type) {
+		case FREETABLE:
+			set_freetable(mem + e->base.off, tdb, &e->ftable,
+				      (struct tdb_header *)mem, last_ftable);
+			last_ftable = e->base.off;
+			break;
+		case FREE:
+			set_free_record(mem + e->base.off, e->free.len);
+			break;
+		case DATA:
+			set_data_record(mem + e->base.off, tdb, &e->used);
+			break;
+		case HASHTABLE:
+			set_hashtable(mem + e->base.off, tdb, &e->hashtable);
+			break;
+		}
+	}
+	/* Must have a free table! */
+	assert(last_ftable);
+
+	/* Now fill the free and hash tables. */
+	for (i = 0; i < layout->num_elems; i++) {
+		union tdb_layout_elem *e = &layout->elem[i];
+		switch (e->base.type) {
+		case FREE:
+			add_to_freetable(tdb, e->base.off, e->free.len,
+					 e->free.ftable_num,
+					 find_ftable(layout, e->free.ftable_num));
+			break;
+		case DATA:
+			add_to_hashtable(tdb, e->base.off, e->used.key);
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* add_to_freetable() moved this around: go back to the first table. */
+	tdb->ftable_off = find_ftable(layout, 0)->base.off;
+
+	/* Get physical if they asked for it. */
+	if (layout->filename) {
+		int fd = open(layout->filename, O_WRONLY|O_TRUNC|O_CREAT,
+			      0600);
+		if (fd < 0)
+			err(1, "opening %s for writing", layout->filename);
+		if (write(fd, tdb->file->map_ptr, tdb->file->map_size)
+		    != tdb->file->map_size)
+			err(1, "writing %s", layout->filename);
+		close(fd);
+		tdb_close(tdb);
+		/* NOMMAP is for lockcheck. */
+		tdb = tdb_open(layout->filename, TDB_NOMMAP, O_RDWR, 0,
+			       &tap_log_attr);
+	}
+
+	return tdb;
+}
+
+/*
+ * Free a layout: the key/data copies held by its DATA elements, the
+ * element array and the layout itself.
+ */
+void tdb_layout_free(struct tdb_layout *layout)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < layout->num_elems; idx++) {
+		union tdb_layout_elem *e = &layout->elem[idx];
+
+		if (e->base.type != DATA)
+			continue;
+		free(e->used.key.dptr);
+		free(e->used.data.dptr);
+	}
+	free(layout->elem);
+	free(layout);
+}
diff --git a/lib/tdb2/test/layout.h b/lib/tdb2/test/layout.h
new file mode 100644
index 00000000000..6e2e6657a70
--- /dev/null
+++ b/lib/tdb2/test/layout.h
@@ -0,0 +1,68 @@
+#ifndef TDB2_TEST_LAYOUT_H
+#define TDB2_TEST_LAYOUT_H
+#include <ccan/tdb2/private.h>
+
+struct tdb_layout *new_tdb_layout(const char *filename);
+void tdb_layout_add_freetable(struct tdb_layout *layout);
+void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len,
+ unsigned ftable);
+void tdb_layout_add_used(struct tdb_layout *layout,
+ TDB_DATA key, TDB_DATA data,
+ tdb_len_t extra);
+#if 0 /* FIXME: Allow allocation of subtables */
+void tdb_layout_add_hashtable(struct tdb_layout *layout,
+ int htable_parent, /* -1 == toplevel */
+ unsigned int bucket,
+ tdb_len_t extra);
+#endif
+struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
+void tdb_layout_free(struct tdb_layout *layout);
+
+enum layout_type {
+ FREETABLE, FREE, DATA, HASHTABLE,
+};
+
+/* Shared by all union members. */
+struct tle_base {
+ enum layout_type type;
+ tdb_off_t off;
+};
+
+struct tle_freetable {
+ struct tle_base base;
+};
+
+struct tle_free {
+ struct tle_base base;
+ tdb_len_t len;
+ unsigned ftable_num;
+};
+
+struct tle_used {
+ struct tle_base base;
+ TDB_DATA key;
+ TDB_DATA data;
+ tdb_len_t extra;
+};
+
+struct tle_hashtable {
+ struct tle_base base;
+ int parent;
+ unsigned int bucket;
+ tdb_len_t extra;
+};
+
+union tdb_layout_elem {
+ struct tle_base base;
+ struct tle_freetable ftable;
+ struct tle_free free;
+ struct tle_used used;
+ struct tle_hashtable hashtable;
+};
+
+/*
+ * A layout under construction.
+ * filename: if non-NULL, tdb_layout_get() writes the image to this file.
+ * num_elems: number of entries in elem.
+ * elem: heap array of elements, in the order they were added.
+ */
+struct tdb_layout {
+ const char *filename;
+ unsigned int num_elems;
+ union tdb_layout_elem *elem;
+};
+#endif /* TDB2_TEST_LAYOUT_H */
diff --git a/lib/tdb2/test/lock-tracking.c b/lib/tdb2/test/lock-tracking.c
new file mode 100644
index 00000000000..05dba32fd3f
--- /dev/null
+++ b/lib/tdb2/test/lock-tracking.c
@@ -0,0 +1,147 @@
+/* We save the locks so we can reacquire them. */
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <ccan/tap/tap.h>
+#include <ccan/tdb2/private.h>
+#include "lock-tracking.h"
+
+/*
+ * One lock recorded by fcntl_with_lockcheck(), kept on a singly linked
+ * list. off, len and type are the l_start, l_len and l_type of the
+ * struct flock that took the lock; a len of 0 is treated as extending
+ * to the largest offset.
+ */
+struct lock {
+ struct lock *next;
+ unsigned int off;
+ unsigned int len;
+ int type;
+};
+static struct lock *locks;
+int locking_errors = 0;
+bool suppress_lockcheck = false;
+bool nonblocking_locks;
+int locking_would_block = 0;
+void (*unlock_callback)(int fd);
+
+/*
+ * Replacement fcntl() which records the locks taken and checks them.
+ *
+ * Commands other than F_SETLK/F_SETLKW are passed straight to fcntl().
+ * For lock commands the real fcntl() is called first (F_SETLKW becomes
+ * F_SETLK when nonblocking_locks is set), then the bookkeeping is
+ * updated:
+ * - an unlock removes the record with the same start and length, and
+ *   counts a locking error if no record was removed;
+ * - a lock which overlaps a recorded lock counts a locking error, except
+ *   for the allrecord extension and upgrade cases handled below.
+ * Errors are not reported while suppress_lockcheck is set.
+ *
+ * Returns whatever fcntl() returned.
+ */
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
+{
+ va_list ap;
+ int ret, arg3;
+ struct flock *fl;
+ bool may_block = false;
+
+ if (cmd != F_SETLK && cmd != F_SETLKW) {
+ /* This may be totally bogus, but we don't know in general. */
+ va_start(ap, cmd);
+ arg3 = va_arg(ap, int);
+ va_end(ap);
+
+ return fcntl(fd, cmd, arg3);
+ }
+
+ va_start(ap, cmd);
+ fl = va_arg(ap, struct flock *);
+ va_end(ap);
+
+ /* Turn a blocking request into a non-blocking one if asked to. */
+ if (cmd == F_SETLKW && nonblocking_locks) {
+ cmd = F_SETLK;
+ may_block = true;
+ }
+ ret = fcntl(fd, cmd, fl);
+
+ /* Detect when we failed, but might have been OK if we waited. */
+ if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) {
+ locking_would_block++;
+ }
+
+ if (fl->l_type == F_UNLCK) {
+ struct lock **l;
+ struct lock *old = NULL;
+
+ /* Look for a record with exactly this range. */
+ for (l = &locks; *l; l = &(*l)->next) {
+ if ((*l)->off == fl->l_start
+ && (*l)->len == fl->l_len) {
+ /* Only forget the lock if the unlock worked. */
+ if (ret == 0) {
+ old = *l;
+ *l = (*l)->next;
+ free(old);
+ }
+ break;
+ }
+ }
+ /* Also reached when the record exists but fcntl() failed. */
+ if (!old && !suppress_lockcheck) {
+ diag("Unknown unlock %u@%u - %i",
+ (int)fl->l_len, (int)fl->l_start, ret);
+ locking_errors++;
+ }
+ } else {
+ struct lock *new, *i;
+ unsigned int fl_end = fl->l_start + fl->l_len;
+ /* Length 0 is treated as reaching the largest offset. */
+ if (fl->l_len == 0)
+ fl_end = (unsigned int)-1;
+
+ /* Check for overlaps: we shouldn't do this. */
+ for (i = locks; i; i = i->next) {
+ unsigned int i_end = i->off + i->len;
+ if (i->len == 0)
+ i_end = (unsigned int)-1;
+
+ if (fl->l_start >= i->off && fl->l_start < i_end)
+ break;
+ if (fl_end > i->off && fl_end < i_end)
+ break;
+
+ /* tdb_allrecord_lock does this, handle adjacent: */
+ if (fl->l_start > TDB_HASH_LOCK_START
+ && fl->l_start == i_end && fl->l_type == i->type) {
+ /* Grow the existing record instead of adding one. */
+ if (ret == 0) {
+ i->len = fl->l_len
+ ? i->len + fl->l_len
+ : 0;
+ }
+ goto done;
+ }
+ }
+ /* i is non-NULL only if the loop stopped on an overlap. */
+ if (i) {
+ /* Special case: upgrade of allrecord lock. */
+ if (i->type == F_RDLCK && fl->l_type == F_WRLCK
+ && i->off == TDB_HASH_LOCK_START
+ && fl->l_start == TDB_HASH_LOCK_START
+ && i->len == 0
+ && fl->l_len == 0) {
+ if (ret == 0)
+ i->type = F_WRLCK;
+ goto done;
+ }
+ if (!suppress_lockcheck) {
+ diag("%s lock %u@%u overlaps %u@%u",
+ fl->l_type == F_WRLCK ? "write" : "read",
+ (int)fl->l_len, (int)fl->l_start,
+ i->len, (int)i->off);
+ locking_errors++;
+ }
+ }
+
+ /* Record the new lock (even an overlapping one) if it was granted. */
+ if (ret == 0) {
+ new = malloc(sizeof *new);
+ new->off = fl->l_start;
+ new->len = fl->l_len;
+ new->type = fl->l_type;
+ new->next = locks;
+ locks = new;
+ }
+ }
+done:
+ /* Tell the test about every successful unlock, if it asked. */
+ if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
+ unlock_callback(fd);
+ return ret;
+}
+
+/* Discard every recorded lock; returns how many were still recorded. */
+unsigned int forget_locking(void)
+{
+	unsigned int dropped;
+
+	for (dropped = 0; locks != NULL; dropped++) {
+		struct lock *head = locks;
+
+		locks = head->next;
+		free(head);
+	}
+	return dropped;
+}
diff --git a/lib/tdb2/test/lock-tracking.h b/lib/tdb2/test/lock-tracking.h
new file mode 100644
index 00000000000..f2c9c44653b
--- /dev/null
+++ b/lib/tdb2/test/lock-tracking.h
@@ -0,0 +1,25 @@
+#ifndef LOCK_TRACKING_H
+#define LOCK_TRACKING_H
+#include <stdbool.h>
+
+/* Set this if you want a callback after a successful fcntl unlock (F_UNLCK). */
+extern void (*unlock_callback)(int fd);
+
+/* Replacement fcntl. */
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
+
+/* Discard locking info (frees the records): returns number of locks outstanding. */
+unsigned int forget_locking(void);
+
+/* Number of errors in locking (unknown unlocks and overlapping locks). */
+extern int locking_errors;
+
+/* Suppress lock checking: no diagnostics and no locking_errors increments. */
+extern bool suppress_lockcheck;
+
+/* Make all locks non-blocking. */
+extern bool nonblocking_locks;
+
+/* Number of times we failed a lock because we made it non-blocking. */
+extern int locking_would_block;
+#endif /* LOCK_TRACKING_H */
diff --git a/lib/tdb2/test/logging.c b/lib/tdb2/test/logging.c
new file mode 100644
index 00000000000..d32cfa9b593
--- /dev/null
+++ b/lib/tdb2/test/logging.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+unsigned tap_log_messages; /* incremented by tap_log_fn for each message it reports */
+const char *log_prefix = ""; /* printed by tap_log_fn just before the message text */
+bool suppress_logging; /* when true, tap_log_fn neither prints nor counts */
+
+union tdb_attribute tap_log_attr = { /* log attribute whose callback is tap_log_fn */
+ .log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
+ .fn = tap_log_fn }
+};
+
+/* TAP log hook: print the message through diag() and count it, unless muted. */
+void tap_log_fn(struct tdb_context *tdb, enum tdb_log_level level,
+		const char *message, void *priv)
+{
+	if (!suppress_logging) {
+		diag("tdb log level %u: %s%s",
+		     level, log_prefix, message);
+		tap_log_messages++;
+	}
+}
diff --git a/lib/tdb2/test/logging.h b/lib/tdb2/test/logging.h
new file mode 100644
index 00000000000..d172f867fd2
--- /dev/null
+++ b/lib/tdb2/test/logging.h
@@ -0,0 +1,15 @@
+#ifndef TDB2_TEST_LOGGING_H
+#define TDB2_TEST_LOGGING_H
+#include <ccan/tdb2/tdb2.h>
+#include <stdbool.h>
+#include <string.h>
+
+extern bool suppress_logging; /* true: tap_log_fn returns without printing or counting */
+extern const char *log_prefix; /* text tap_log_fn prints in front of each message */
+extern unsigned tap_log_messages; /* number of messages tap_log_fn has reported */
+extern union tdb_attribute tap_log_attr; /* log attribute with tap_log_fn as its callback */
+
+void tap_log_fn(struct tdb_context *tdb,
+ enum tdb_log_level level,
+ const char *message, void *priv); /* diag()s the message and bumps tap_log_messages */
+#endif /* TDB2_TEST_LOGGING_H */
diff --git a/lib/tdb2/test/run-001-encode.c b/lib/tdb2/test/run-001-encode.c
new file mode 100644
index 00000000000..ffa4b93c02f
--- /dev/null
+++ b/lib/tdb2/test/run-001-encode.c
@@ -0,0 +1,48 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+ unsigned int i;
+ struct tdb_used_record rec;
+ struct tdb_context tdb = { .log_fn = tap_log_fn };
+
+ plan_tests(64 + 32 + 48*6 + 1); /* one term per loop below, plus the final log check */
+
+ /* We should be able to encode any data value. */
+ for (i = 0; i < 64; i++)
+ ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, 0, 1ULL << i,
+ 1ULL << i, 0) == 0);
+
+ /* And any key and data with < 64 bits between them. NOTE(review): 1ULL >> (63 - i) is 0 for all i < 32, so dlen is always 0 here -- confirm intent. */
+ for (i = 0; i < 32; i++) {
+ tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
+ ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+ klen + dlen, 0) == 0);
+ }
+
+ /* We should neatly encode all values: every field must read back as it was set. */
+ for (i = 0; i < 48; i++) {
+ uint64_t h = 1ULL << (i < 5 ? i : 4);
+ uint64_t klen = 1ULL << (i < 16 ? i : 15);
+ uint64_t dlen = 1ULL << i;
+ uint64_t xlen = 1ULL << (i < 32 ? i : 31);
+ ok1(set_header(&tdb, &rec, TDB_USED_MAGIC, klen, dlen,
+ klen+dlen+xlen, h) == 0);
+ ok1(rec_key_length(&rec) == klen);
+ ok1(rec_data_length(&rec) == dlen);
+ ok1(rec_extra_padding(&rec) == xlen);
+ ok1((uint64_t)rec_hash(&rec) == h);
+ ok1(rec_magic(&rec) == TDB_USED_MAGIC);
+ }
+ ok1(tap_log_messages == 0); /* nothing above should have logged */
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-001-fls.c b/lib/tdb2/test/run-001-fls.c
new file mode 100644
index 00000000000..d54cad1d1c4
--- /dev/null
+++ b/lib/tdb2/test/run-001-fls.c
@@ -0,0 +1,40 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+
+/* Reference fls: 1-based index of the highest set bit of num, 0 if none. */
+static unsigned int dumb_fls(uint64_t num)
+{
+	unsigned int pos = 64;
+
+	/* Walk down from bit 63 until we hit a set bit (or run out). */
+	while (pos > 0 && !(num & (1ULL << (pos - 1))))
+		pos--;
+	return pos;
+}
+
+/* Compare fls64() against dumb_fls() for 0 and every value with one or two
+ * bits set.  val is cast to unsigned long long to match the %llu format. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+
+	plan_tests(64 * 64 + 2);
+	ok1(fls64(0) == 0);
+	ok1(dumb_fls(0) == 0);
+	for (i = 0; i < 64; i++) {
+		for (j = 0; j < 64; j++) {
+			uint64_t val = (1ULL << i) | (1ULL << j);
+			ok(fls64(val) == dumb_fls(val),
+			   "%llu -> %u should be %u", (unsigned long long)val,
+			   fls64(val), dumb_fls(val));
+		}
+	}
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-01-new_database.c b/lib/tdb2/test/run-01-new_database.c
new file mode 100644
index 00000000000..32ebaf09c10
--- /dev/null
+++ b/lib/tdb2/test/run-01-new_database.c
@@ -0,0 +1,42 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+/* Create a fresh database under each flag combination and check it is sane. */
+int main(int argc, char *argv[])
+{
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	const unsigned int nflags = sizeof(flags) / sizeof(flags[0]);
+	unsigned int n;
+	failtest_init(argc, argv);
+	failtest_hook = block_repeat_failures;
+	failtest_exit_check = exit_check_log;
+	plan_tests(nflags * 3);
+	for (n = 0; n < nflags; n++) {
+		struct tdb_context *tdb = tdb_open("run-new_database.tdb",
+			flags[n], O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			failtest_exit(exit_status());
+		if (tdb) {
+			bool checked = ok1(tdb_check(tdb, NULL, NULL) == 0);
+			tdb_close(tdb);
+			if (!checked)
+				failtest_exit(exit_status());
+		}
+		if (!ok1(tap_log_messages == 0))
+			break;
+	}
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-02-expand.c b/lib/tdb2/test/run-02-expand.c
new file mode 100644
index 00000000000..6666ae167eb
--- /dev/null
+++ b/lib/tdb2/test/run-02-expand.c
@@ -0,0 +1,80 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* failtest hook: main() clears failtest_suppress only around tdb_expand(), so
+ * those are the only calls offered to block_repeat_failures() for injection. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i;
+ uint64_t val;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 11 + 1); /* 11 checks per flag set + final log check */
+
+ failtest_init(argc, argv);
+ failtest_hook = suppress_failure;
+ failtest_exit_check = exit_check_log;
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ failtest_suppress = true; /* no injected failures during open/lock/check */
+ tdb = tdb_open("run-expand.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ if (!ok1(tdb))
+ break;
+
+ val = tdb->file->map_size; /* size before the first expand */
+ /* Need some hash lock for expand. */
+ ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
+ failtest_suppress = false; /* let failtest inject failures into tdb_expand() */
+ if (!ok1(tdb_expand(tdb, 1) == 0)) {
+ failtest_suppress = true;
+ tdb_close(tdb);
+ break;
+ }
+ failtest_suppress = true;
+
+ ok1(tdb->file->map_size >= val + 1 * TDB_EXTENSION_FACTOR);
+ ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ val = tdb->file->map_size; /* size before the second, larger expand */
+ ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
+ failtest_suppress = false;
+ if (!ok1(tdb_expand(tdb, 1024) == 0)) {
+ failtest_suppress = true;
+ tdb_close(tdb);
+ break;
+ }
+ failtest_suppress = true;
+ ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
+ ok1(tdb->file->map_size >= val + 1024 * TDB_EXTENSION_FACTOR);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-03-coalesce.c b/lib/tdb2/test/run-03-coalesce.c
new file mode 100644
index 00000000000..3fdd11c0770
--- /dev/null
+++ b/lib/tdb2/test/run-03-coalesce.c
@@ -0,0 +1,170 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+#include "layout.h"
+
+/* Length of the free record at off, or a TDB_ERROR code on read/magic failure. */
+static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
+{
+	struct tdb_free_record frec;
+	enum TDB_ERROR err = tdb_read_convert(tdb, off, &frec, sizeof(frec));
+
+	if (err != TDB_SUCCESS)
+		return err;
+	if (frec_magic(&frec) == TDB_FREE_MAGIC)
+		return frec_len(&frec);
+	return TDB_ERR_CORRUPT;
+}
+
+int main(int argc, char *argv[])
+{
+ tdb_off_t b_off, test;
+ struct tdb_context *tdb;
+ struct tdb_layout *layout;
+ struct tdb_data data, key;
+ tdb_len_t len;
+
+ /* FIXME: Test TDB_CONVERT */
+ /* FIXME: Test lock order fail. */
+
+ plan_tests(42); /* 7 + 7 + 9 + 9 + 9 checks in the five cases below, plus the log check */
+ data = tdb_mkdata("world", 5);
+ key = tdb_mkdata("hello", 5);
+
+ /* No coalescing can be done due to EOF */
+ layout = new_tdb_layout("run-03-coalesce.tdb");
+ tdb_layout_add_freetable(layout);
+ len = 1024;
+ tdb_layout_add_free(layout, len, 0);
+ tdb = tdb_layout_get(layout);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
+
+ /* Figure out which bucket free entry is. */
+ b_off = bucket_off(tdb->ftable_off, size_to_bucket(len));
+ /* Lock and fail to coalesce. */
+ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+ test = layout->elem[1].base.off;
+ ok1(coalesce(tdb, layout->elem[1].base.off, b_off, len, &test)
+ == 0);
+ tdb_unlock_free_bucket(tdb, b_off);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == len);
+ ok1(test == layout->elem[1].base.off);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ tdb_layout_free(layout);
+
+ /* No coalescing can be done due to used record */
+ layout = new_tdb_layout("run-03-coalesce.tdb");
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_used(layout, key, data, 6);
+ tdb = tdb_layout_get(layout);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Figure out which bucket free entry is. */
+ b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+ /* Lock and fail to coalesce. */
+ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+ test = layout->elem[1].base.off;
+ ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+ == 0);
+ tdb_unlock_free_bucket(tdb, b_off);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+ ok1(test == layout->elem[1].base.off);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ tdb_layout_free(layout);
+
+ /* Coalescing can be done due to two free records, then EOF */
+ layout = new_tdb_layout("run-03-coalesce.tdb");
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_free(layout, 2048, 0);
+ tdb = tdb_layout_get(layout);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+ ok1(free_record_length(tdb, layout->elem[2].base.off) == 2048);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Figure out which bucket (first) free entry is. */
+ b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+ /* Lock and coalesce. */
+ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+ test = layout->elem[2].base.off;
+ ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+ == 1024 + sizeof(struct tdb_used_record) + 2048);
+ /* Should tell us it's erased this one... */
+ ok1(test == TDB_ERR_NOEXIST);
+ ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0); /* no explicit unlock here: expect no locks left held */
+ ok1(free_record_length(tdb, layout->elem[1].base.off)
+ == 1024 + sizeof(struct tdb_used_record) + 2048);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ tdb_layout_free(layout);
+
+ /* Coalescing can be done due to two free records, then data */
+ layout = new_tdb_layout("run-03-coalesce.tdb");
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_free(layout, 512, 0);
+ tdb_layout_add_used(layout, key, data, 6);
+ tdb = tdb_layout_get(layout);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+ ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Figure out which bucket free entry is. */
+ b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+ /* Lock and coalesce. */
+ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+ test = layout->elem[2].base.off;
+ ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+ == 1024 + sizeof(struct tdb_used_record) + 512);
+ ok1(tdb->file->allrecord_lock.count == 0 && tdb->file->num_lockrecs == 0);
+ ok1(free_record_length(tdb, layout->elem[1].base.off)
+ == 1024 + sizeof(struct tdb_used_record) + 512);
+ ok1(test == TDB_ERR_NOEXIST);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ tdb_layout_free(layout);
+
+ /* Coalescing can be done due to three free records, then EOF */
+ layout = new_tdb_layout("run-03-coalesce.tdb");
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_free(layout, 1024, 0);
+ tdb_layout_add_free(layout, 512, 0);
+ tdb_layout_add_free(layout, 256, 0);
+ tdb = tdb_layout_get(layout);
+ ok1(free_record_length(tdb, layout->elem[1].base.off) == 1024);
+ ok1(free_record_length(tdb, layout->elem[2].base.off) == 512);
+ ok1(free_record_length(tdb, layout->elem[3].base.off) == 256);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Figure out which bucket free entry is. */
+ b_off = bucket_off(tdb->ftable_off, size_to_bucket(1024));
+ /* Lock and coalesce. NOTE(review): unlike the two cases above, test is set but not checked afterwards. */
+ ok1(tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) == 0);
+ test = layout->elem[2].base.off;
+ ok1(coalesce(tdb, layout->elem[1].base.off, b_off, 1024, &test)
+ == 1024 + sizeof(struct tdb_used_record) + 512
+ + sizeof(struct tdb_used_record) + 256);
+ ok1(tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0);
+ ok1(free_record_length(tdb, layout->elem[1].base.off)
+ == 1024 + sizeof(struct tdb_used_record) + 512
+ + sizeof(struct tdb_used_record) + 256);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ tdb_layout_free(layout);
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-04-basichash.c b/lib/tdb2/test/run-04-basichash.c
new file mode 100644
index 00000000000..62031bdb40a
--- /dev/null
+++ b/lib/tdb2/test/run-04-basichash.c
@@ -0,0 +1,267 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* Rigged hash: shift the key's unsigned int up so adjacent-numbered records always clash. */
+static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+	const unsigned int *num = key;
+	return (uint64_t)*num << (64 - TDB_TOPLEVEL_HASH_BITS - 1);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ unsigned int v;
+ struct tdb_used_record rec;
+ struct tdb_data key = { (unsigned char *)&v, sizeof(v) }; /* key bytes are v itself */
+ struct tdb_data dbuf = { (unsigned char *)&v, sizeof(v) }; /* so are the data bytes */
+ union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = clash } };
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT,
+ };
+
+ hattr.base.next = &tap_log_attr; /* chain the log attribute after the hash attribute */
+
+ plan_tests(sizeof(flags) / sizeof(flags[0])
+ * (91 + (2 * ((1 << TDB_HASH_GROUP_BITS) - 1))) + 1);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ struct hash_info h;
+ tdb_off_t new_off, off, subhash;
+
+ tdb = tdb_open("run-04-basichash.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ v = 0;
+ /* Should not find it. */
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in group 0, bucket 0. */
+ ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 0);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+
+ /* Should have lock on bucket 0 */
+ ok1(h.hlock_start == 0);
+ ok1(h.hlock_range ==
+ 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+ ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+ ok1((tdb->flags & TDB_NOLOCK)
+ || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+ /* FIXME: Check lock length */
+
+ /* Allocate a new record. */
+ new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+ TDB_USED_MAGIC, false);
+ ok1(!TDB_OFF_IS_ERR(new_off));
+
+ /* We should be able to add it now. */
+ ok1(add_to_hash(tdb, &h, new_off) == 0);
+
+ /* Make sure we fill it in for later finding. */
+ off = new_off + sizeof(struct tdb_used_record);
+ ok1(!tdb->methods->twrite(tdb, off, key.dptr, key.dsize));
+ off += key.dsize;
+ ok1(!tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
+
+ /* We should be able to unlock that OK. */
+ ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+ F_WRLCK) == 0);
+
+ /* Database should be consistent. */
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Now, this should give a successful lookup. */
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
+ == new_off);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in group 0, bucket 0. */
+ ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 0);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+
+ /* Should have lock on bucket 0 */
+ ok1(h.hlock_start == 0);
+ ok1(h.hlock_range ==
+ 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+ ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+ ok1((tdb->flags & TDB_NOLOCK)
+ || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+ /* FIXME: Check lock length */
+
+ ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+ F_WRLCK) == 0);
+
+ /* Database should be consistent. */
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Test expansion. */
+ v = 1;
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in group 0, bucket 1. */
+ ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 1);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+
+ /* Should have lock on bucket 0 */
+ ok1(h.hlock_start == 0);
+ ok1(h.hlock_range ==
+ 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+ ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+ ok1((tdb->flags & TDB_NOLOCK)
+ || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+ /* FIXME: Check lock length */
+
+ /* Make it expand 0'th bucket. */
+ ok1(expand_group(tdb, &h) == 0);
+ /* First one should be subhash, next should be empty. */
+ ok1(is_subhash(h.group[0]));
+ subhash = (h.group[0] & TDB_OFF_MASK);
+ for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
+ ok1(h.group[j] == 0);
+
+ ok1(tdb_write_convert(tdb, h.group_start,
+ h.group, sizeof(h.group)) == 0);
+ ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+ F_WRLCK) == 0);
+
+ /* Should be happy with expansion. */
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Should be able to find it. */
+ v = 0;
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
+ == new_off);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in expanded group 0, bucket 0. */
+ ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 0);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+ + TDB_SUBLEVEL_HASH_BITS);
+
+ /* Should have lock on bucket 0 */
+ ok1(h.hlock_start == 0);
+ ok1(h.hlock_range ==
+ 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+ ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+ ok1((tdb->flags & TDB_NOLOCK)
+ || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+ /* FIXME: Check lock length */
+
+ /* Simple delete should work. */
+ ok1(delete_from_hash(tdb, &h) == 0);
+ ok1(add_free_record(tdb, new_off,
+ sizeof(struct tdb_used_record)
+ + rec_key_length(&rec)
+ + rec_data_length(&rec)
+ + rec_extra_padding(&rec),
+ TDB_LOCK_NOWAIT, false) == 0);
+ ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+ F_WRLCK) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Test second-level expansion: should expand 0th bucket. */
+ v = 0;
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in the expanded (subhash) group, bucket 0. */
+ ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 0);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS+TDB_SUBLEVEL_HASH_BITS);
+
+ /* Should have lock on bucket 0 */
+ ok1(h.hlock_start == 0);
+ ok1(h.hlock_range ==
+ 1ULL << (64-(TDB_TOPLEVEL_HASH_BITS-TDB_HASH_GROUP_BITS)));
+ ok1((tdb->flags & TDB_NOLOCK) || tdb->file->num_lockrecs == 1);
+ ok1((tdb->flags & TDB_NOLOCK)
+ || tdb->file->lockrecs[0].off == TDB_HASH_LOCK_START);
+ /* FIXME: Check lock length */
+
+ ok1(expand_group(tdb, &h) == 0);
+ /* First one should be subhash, next should be empty. */
+ ok1(is_subhash(h.group[0]));
+ subhash = (h.group[0] & TDB_OFF_MASK);
+ for (j = 1; j < (1 << TDB_HASH_GROUP_BITS); j++)
+ ok1(h.group[j] == 0);
+ ok1(tdb_write_convert(tdb, h.group_start,
+ h.group, sizeof(h.group)) == 0);
+ ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+ F_WRLCK) == 0);
+
+ /* Should be happy with expansion. */
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL) == 0);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in the newly expanded (second-level) group, bucket 0. */
+ ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 0);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+ + TDB_SUBLEVEL_HASH_BITS * 2);
+
+ /* We should be able to add it now. */
+ /* Allocate a new record. */
+ new_off = alloc(tdb, key.dsize, dbuf.dsize, h.h,
+ TDB_USED_MAGIC, false);
+ ok1(!TDB_OFF_IS_ERR(new_off));
+ ok1(add_to_hash(tdb, &h, new_off) == 0);
+
+ /* Make sure we fill it in for later finding. */
+ off = new_off + sizeof(struct tdb_used_record);
+ ok1(!tdb->methods->twrite(tdb, off, key.dptr, key.dsize));
+ off += key.dsize;
+ ok1(!tdb->methods->twrite(tdb, off, dbuf.dptr, dbuf.dsize));
+
+ /* We should be able to unlock that OK. */
+ ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+ F_WRLCK) == 0);
+
+ /* Database should be consistent. */
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Should be able to find it. NOTE(review): no tdb_unlock_hashes() follows this find_and_lock() before tdb_close() -- confirm that is intended. */
+ v = 0;
+ ok1(find_and_lock(tdb, key, F_WRLCK, &h, &rec, NULL)
+ == new_off);
+ /* Should have created correct hash. */
+ ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+ /* Should have located space in expanded group 0, bucket 0. */
+ ok1(h.group_start == subhash + sizeof(struct tdb_used_record));
+ ok1(h.home_bucket == 0);
+ ok1(h.found_bucket == 0);
+ ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+ + TDB_SUBLEVEL_HASH_BITS * 2);
+
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-05-readonly-open.c b/lib/tdb2/test/run-05-readonly-open.c
new file mode 100644
index 00000000000..0f1a4343d82
--- /dev/null
+++ b/lib/tdb2/test/run-05-readonly-open.c
@@ -0,0 +1,88 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* failtest hook: failures may be injected only while failtest_suppress is
+ * clear, i.e. during the read-only open, the fetch and the stores in main(). */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+/* Reopen O_RDONLY: fetch must work, stores must give TDB_ERR_RDONLY. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, msgs = 0;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4), d;
+	union tdb_attribute seed_attr;
+
+	failtest_init(argc, argv);
+	failtest_hook = suppress_failure;
+	failtest_exit_check = exit_check_log;
+	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+	seed_attr.base.next = &tap_log_attr;
+	seed_attr.seed.seed = 0;
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 12);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr);
+		if (!ok1(tdb)) /* never store into or close a failed open */
+			break;
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		tdb_close(tdb);
+
+		failtest_suppress = false;
+		tdb = tdb_open("run-05-readonly-open.tdb", flags[i],
+			       O_RDONLY, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			break;
+		ok1(tap_log_messages == msgs);
+		/* Fetch should succeed, stores should fail. */
+		if (!ok1(tdb_fetch(tdb, key, &d) == 0))
+			goto fail;
+		ok1(tdb_deq(d, data));
+		free(d.dptr);
+		if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
+			 == TDB_ERR_RDONLY))
+			goto fail;
+		ok1(tap_log_messages == ++msgs);
+		if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
+			 == TDB_ERR_RDONLY))
+			goto fail;
+		ok1(tap_log_messages == ++msgs);
+		failtest_suppress = true;
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		tdb_close(tdb);
+		ok1(tap_log_messages == msgs);
+		/* SIGH: failtest bug, it doesn't save the tdb file because
+		 * we have it read-only.  If we go around again, it gets
+		 * changed underneath us and things get screwy. */
+		if (failtest_has_failed())
+			break;
+	}
+	failtest_exit(exit_status());
+
+fail:
+	failtest_suppress = true;
+	tdb_close(tdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-10-simple-store.c b/lib/tdb2/test/run-10-simple-store.c
new file mode 100644
index 00000000000..35c387a3be5
--- /dev/null
+++ b/lib/tdb2/test/run-10-simple-store.c
@@ -0,0 +1,76 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* failtest hook: main() clears failtest_suppress only around tdb_store(), so
+ * those are the only calls offered to block_repeat_failures() for injection. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data = tdb_mkdata("data", 4);
+
+ failtest_init(argc, argv);
+ failtest_hook = suppress_failure;
+ failtest_exit_check = exit_check_log;
+
+ failtest_suppress = true; /* no injected failures until a tdb_store() below */
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); /* 7 checks per flag set + final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-10-simple-store.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ if (!ok1(tdb))
+ break;
+ /* Modify should fail: the key has not been stored yet. */
+ failtest_suppress = false;
+ if (!ok1(tdb_store(tdb, key, data, TDB_MODIFY)
+ == TDB_ERR_NOEXIST))
+ goto fail;
+ failtest_suppress = true;
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ /* Insert should succeed. */
+ failtest_suppress = false;
+ if (!ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0))
+ goto fail;
+ failtest_suppress = true;
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ /* Second insert should fail: the key now exists. */
+ failtest_suppress = false;
+ if (!ok1(tdb_store(tdb, key, data, TDB_INSERT)
+ == TDB_ERR_EXISTS))
+ goto fail;
+ failtest_suppress = true;
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ }
+ ok1(tap_log_messages == 0);
+ failtest_exit(exit_status());
+
+fail:
+ failtest_suppress = true; /* close the database without injected failures */
+ tdb_close(tdb);
+ failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-11-simple-fetch.c b/lib/tdb2/test/run-11-simple-fetch.c
new file mode 100644
index 00000000000..29b6bf08727
--- /dev/null
+++ b/lib/tdb2/test/run-11-simple-fetch.c
@@ -0,0 +1,76 @@
+#include <ccan/failtest/failtest_override.h>
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/failtest/failtest.h>
+#include "logging.h"
+#include "failtest_helper.h"
+
+static bool failtest_suppress = false;
+
+/* failtest hook: main() clears failtest_suppress only around tdb_fetch(), so
+ * those are the only calls offered to block_repeat_failures() for injection. */
+static enum failtest_result
+suppress_failure(struct failtest_call *history, unsigned num)
+{
+	return failtest_suppress ? FAIL_DONT_FAIL
+				 : block_repeat_failures(history, num);
+}
+
+/* Fetch of a missing key must fail; after an insert the same fetch must work. */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	struct tdb_data key = tdb_mkdata("key", 3);
+	struct tdb_data data = tdb_mkdata("data", 4);
+
+	failtest_init(argc, argv);
+	failtest_hook = suppress_failure;
+	failtest_exit_check = exit_check_log;
+	failtest_suppress = true;
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-11-simple-fetch.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (tdb) {
+			struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+
+			/* fetch should fail. */
+			failtest_suppress = false;
+			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST))
+				goto fail;
+			failtest_suppress = true;
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			/* Insert should succeed. */
+			ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			/* Fetch should now work. */
+			failtest_suppress = false;
+			if (!ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
+				goto fail;
+			failtest_suppress = true;
+			ok1(tdb_deq(d, data));
+			free(d.dptr);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+			tdb_close(tdb);
+		}
+	}
+	ok1(tap_log_messages == 0);
+	failtest_exit(exit_status()); /* leave through failtest, as fail: does */
+
+fail:
+	failtest_suppress = true;
+	tdb_close(tdb);
+	failtest_exit(exit_status());
+}
diff --git a/lib/tdb2/test/run-12-store.c b/lib/tdb2/test/run-12-store.c
new file mode 100644
index 00000000000..ba2e4f89717
--- /dev/null
+++ b/lib/tdb2/test/run-12-store.c
@@ -0,0 +1,58 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* We use the same seed which we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+ return hash64_stable((const unsigned char *)key, len,
+ *(uint64_t *)p);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ uint64_t seed = 16014841315512641303ULL;
+ union tdb_attribute fixed_hattr
+ = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = fixedhash,
+ .data = &seed } };
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+ struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
+
+ fixed_hattr.base.next = &tap_log_attr;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 500 * 3) + 1);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-12-store.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* We seemed to lose some keys.
+ * Insert and check they're in there! */
+ for (j = 0; j < 500; j++) {
+ struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(tdb_deq(d, data));
+ free(d.dptr);
+ }
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-13-delete.c b/lib/tdb2/test/run-13-delete.c
new file mode 100644
index 00000000000..3b464d927ef
--- /dev/null
+++ b/lib/tdb2/test/run-13-delete.c
@@ -0,0 +1,207 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* We rig the hash so adjacent-numbered records always clash. */
+static uint64_t clash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+ return ((uint64_t)*(const unsigned int *)key)
+ << (64 - TDB_TOPLEVEL_HASH_BITS - 1);
+}
+
+/* We use the same seed which we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+ return hash64_stable((const unsigned char *)key, len,
+ *(uint64_t *)p);
+}
+
+static bool store_records(struct tdb_context *tdb)
+{
+	int i;
+	struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
+	struct tdb_data d, data = { (unsigned char *)&i, sizeof(i) };
+	/* Store 1000 records keyed by i; each must fetch back identically. */
+	for (i = 0; i < 1000; i++) {
+		if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+			return false;
+		/* Check the fetch too: if it fails, d may never be filled in. */
+		if (tdb_fetch(tdb, key, &d) != TDB_SUCCESS || !tdb_deq(d, data))
+			return false;
+		free(d.dptr);
+	}
+	return true;
+}
+
+static void test_val(struct tdb_context *tdb, uint64_t val)
+{
+ uint64_t v;
+ struct tdb_data key = { (unsigned char *)&v, sizeof(v) };
+ struct tdb_data d, data = { (unsigned char *)&v, sizeof(v) };
+
+ /* Insert an entry, then delete it. */
+ v = val;
+ /* Delete should fail. */
+ ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Insert should succeed. */
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Delete should succeed. */
+ ok1(tdb_delete(tdb, key) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Re-add it, then add collision. */
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ v = val + 1;
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Can find both? */
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+ v = val;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+
+ /* Delete second one. */
+ v = val + 1;
+ ok1(tdb_delete(tdb, key) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Re-add */
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Now, try deleting first one. */
+ v = val;
+ ok1(tdb_delete(tdb, key) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Can still find second? */
+ v = val + 1;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+
+ /* Now, this will be ideally placed. */
+ v = val + 2;
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* This will collide with both. */
+ v = val;
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+
+ /* We can still find them all, right? */
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+ v = val + 1;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+ v = val + 2;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+
+ /* And if we delete val + 1, that val + 2 should not move! */
+ v = val + 1;
+ ok1(tdb_delete(tdb, key) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ v = val;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+ v = val + 2;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == data.dsize);
+ free(d.dptr);
+
+ /* Delete those two, so we are empty. */
+ ok1(tdb_delete(tdb, key) == 0);
+ v = val;
+ ok1(tdb_delete(tdb, key) == 0);
+
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ uint64_t seed = 16014841315512641303ULL;
+ union tdb_attribute clash_hattr
+ = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = clash } };
+ union tdb_attribute fixed_hattr
+ = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = fixedhash,
+ .data = &seed } };
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ /* These two values gave trouble before. */
+ int vals[] = { 755, 837 };
+
+ clash_hattr.base.next = &tap_log_attr;
+ fixed_hattr.base.next = &tap_log_attr;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0])
+ * (39 * 3 + 5 + sizeof(vals)/sizeof(vals[0])*2) + 1);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-13-delete.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &clash_hattr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* Check start of hash table. */
+ test_val(tdb, 0);
+
+ /* Check end of hash table. */
+ test_val(tdb, -1ULL);
+
+ /* Check mixed bitpattern. */
+ test_val(tdb, 0x123456789ABCDEF0ULL);
+
+ ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0));
+ tdb_close(tdb);
+
+ /* Deleting these entries in the db gave problems. */
+ tdb = tdb_open("run-13-delete.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &fixed_hattr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ ok1(store_records(tdb));
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ for (j = 0; j < sizeof(vals)/sizeof(vals[0]); j++) {
+ struct tdb_data key;
+
+ key.dptr = (unsigned char *)&vals[j];
+ key.dsize = sizeof(vals[j]);
+ ok1(tdb_delete(tdb, key) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ }
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-14-exists.c b/lib/tdb2/test/run-14-exists.c
new file mode 100644
index 00000000000..f264a6f2c98
--- /dev/null
+++ b/lib/tdb2/test/run-14-exists.c
@@ -0,0 +1,57 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static bool test_records(struct tdb_context *tdb)
+{
+ int i;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
+
+ for (i = 0; i < 1000; i++) {
+ if (tdb_exists(tdb, key))
+ return false;
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ return false;
+ if (!tdb_exists(tdb, key))
+ return false;
+ }
+
+ for (i = 0; i < 1000; i++) {
+ if (!tdb_exists(tdb, key))
+ return false;
+ if (tdb_delete(tdb, key) != 0)
+ return false;
+ if (tdb_exists(tdb, key))
+ return false;
+ }
+ return true;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Two checks per flag combination, plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-14-exists.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue; /* open failed: nothing to test or close */
+		ok1(test_records(tdb));
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-15-append.c b/lib/tdb2/test/run-15-append.c
new file mode 100644
index 00000000000..d2f9ec65989
--- /dev/null
+++ b/lib/tdb2/test/run-15-append.c
@@ -0,0 +1,135 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <ccan/ilog/ilog.h>
+#include "logging.h"
+
+#define MAX_SIZE 13100
+#define SIZE_STEP 131
+
+static tdb_off_t tdb_offset(struct tdb_context *tdb, struct tdb_data key)
+{
+ tdb_off_t off;
+ struct tdb_used_record rec;
+ struct hash_info h;
+ /* Returns what find_and_lock() reports for key, or 0 if that is an error. */
+ off = find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL);
+ if (TDB_OFF_IS_ERR(off))
+ return 0;
+ tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+ return off;
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i, j, moves;
+ struct tdb_context *tdb;
+ unsigned char *buffer;
+ tdb_off_t oldoff = 0, newoff;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data;
+
+ buffer = malloc(MAX_SIZE);
+ for (i = 0; i < MAX_SIZE; i++)
+ buffer[i] = i;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0])
+ * ((3 + MAX_SIZE/SIZE_STEP * 5) * 2 + 7)
+ + 1);
+
+ /* Using tdb_store. */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-append.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ moves = 0;
+ for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
+ data.dptr = buffer;
+ data.dsize = j;
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+ ok1(data.dsize == j);
+ ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+ free(data.dptr);
+ newoff = tdb_offset(tdb, key);
+ if (newoff != oldoff)
+ moves++;
+ oldoff = newoff;
+ }
+ ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0));
+ /* We should increase by 50% each time... */
+ ok(moves <= ilog64(j / SIZE_STEP)*2, "Moved %u times", moves);
+ tdb_close(tdb);
+ }
+
+ /* Using tdb_append. */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ size_t prev_len = 0;
+ tdb = tdb_open("run-append.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ moves = 0;
+ for (j = 0; j < MAX_SIZE; j += SIZE_STEP) {
+ data.dptr = buffer + prev_len;
+ data.dsize = j - prev_len;
+ ok1(tdb_append(tdb, key, data) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+ ok1(data.dsize == j);
+ ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+ free(data.dptr);
+ prev_len = data.dsize;
+ newoff = tdb_offset(tdb, key);
+ if (newoff != oldoff)
+ moves++;
+ oldoff = newoff;
+ }
+ ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0));
+ /* We should increase by 50% each time... */
+ ok(moves <= ilog64(j / SIZE_STEP)*2, "Moved %u times", moves);
+ tdb_close(tdb);
+ }
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-append.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* Huge initial store. */
+ data.dptr = buffer;
+ data.dsize = MAX_SIZE;
+ ok1(tdb_append(tdb, key, data) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+ ok1(data.dsize == MAX_SIZE);
+ ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+ free(data.dptr);
+ ok1(!tdb->file || (tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0));
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ free(buffer);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-16-wipe_all.c b/lib/tdb2/test/run-16-wipe_all.c
new file mode 100644
index 00000000000..d9c5128e0bb
--- /dev/null
+++ b/lib/tdb2/test/run-16-wipe_all.c
@@ -0,0 +1,50 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static bool add_records(struct tdb_context *tdb)
+{
+ int i;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
+
+ for (i = 0; i < 1000; i++) {
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ return false;
+ }
+ return true;
+}
+
+
+int main(int argc, char *argv[])
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-16-wipe_all.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ if (ok1(tdb)) {
+ struct tdb_data key;
+ ok1(add_records(tdb));
+ ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
+ ok1(tdb_firstkey(tdb, &key) == TDB_ERR_NOEXIST);
+ tdb_close(tdb);
+ }
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-20-growhash.c b/lib/tdb2/test/run-20-growhash.c
new file mode 100644
index 00000000000..22a88c45043
--- /dev/null
+++ b/lib/tdb2/test/run-20-growhash.c
@@ -0,0 +1,144 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static uint64_t myhash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+ return *(const uint64_t *)key;
+}
+
+static void add_bits(uint64_t *val, unsigned new, unsigned new_bits,
+		     unsigned *done)
+{
+	unsigned shift = 64 - (*done += new_bits); /* low bit of this field */
+	*val = *val | ((uint64_t)new << shift);
+}
+
+static uint64_t make_key(unsigned topgroup, unsigned topbucket,
+ unsigned subgroup1, unsigned subbucket1,
+ unsigned subgroup2, unsigned subbucket2)
+{
+ uint64_t key = 0;
+ unsigned done = 0;
+
+ add_bits(&key, topgroup, TDB_TOPLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
+ &done);
+ add_bits(&key, topbucket, TDB_HASH_GROUP_BITS, &done);
+ add_bits(&key, subgroup1, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
+ &done);
+ add_bits(&key, subbucket1, TDB_HASH_GROUP_BITS, &done);
+ add_bits(&key, subgroup2, TDB_SUBLEVEL_HASH_BITS - TDB_HASH_GROUP_BITS,
+ &done);
+ add_bits(&key, subbucket2, TDB_HASH_GROUP_BITS, &done);
+ return key;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	uint64_t kdata;
+	struct tdb_used_record rec;
+	struct tdb_data key = { (unsigned char *)&kdata, sizeof(kdata) };
+	struct tdb_data dbuf = { (unsigned char *)&kdata, sizeof(kdata) };
+	union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+						.fn = myhash } };
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT,
+	};
+	/* myhash returns the key's own 64 bits, so make_key() sets the hash. */
+	hattr.base.next = &tap_log_attr;
+
+	plan_tests(sizeof(flags) / sizeof(flags[0])
+		   * (9 + (20 + 2 * ((1 << TDB_HASH_GROUP_BITS) - 2))
+		      * (1 << TDB_HASH_GROUP_BITS)) + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		struct hash_info h;
+		/* Own file name: don't truncate run-04-basichash's database. */
+		tdb = tdb_open("run-20-growhash.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		/* Fill a group. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			kdata = make_key(0, j, 0, 0, 0, 0);
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+		}
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+		/* Check first still exists. */
+		kdata = make_key(0, 0, 0, 0, 0, 0);
+		ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL) != 0);
+		/* Should have created correct hash. */
+		ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+		/* Should have located space in group 0, bucket 0. */
+		ok1(h.group_start == offsetof(struct tdb_header, hashtable));
+		ok1(h.home_bucket == 0);
+		ok1(h.found_bucket == 0);
+		ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS);
+		/* Entire group should be full! */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
+			ok1(h.group[j] != 0);
+
+		ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+				      F_RDLCK) == 0);
+
+		/* Now, add one more to each should expand (that) bucket. */
+		for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+			unsigned int k;
+			kdata = make_key(0, j, 0, 1, 0, 0);
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
+			/* Should have created correct hash. */
+			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+			/* Should have moved to subhash */
+			ok1(h.group_start >= sizeof(struct tdb_header));
+			ok1(h.home_bucket == 1);
+			ok1(h.found_bucket == 1);
+			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+			    + TDB_SUBLEVEL_HASH_BITS);
+			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+					      F_RDLCK) == 0);
+
+			/* Keep adding, make it expand again. */
+			for (k = 2; k < (1 << TDB_HASH_GROUP_BITS); k++) {
+				kdata = make_key(0, j, 0, k, 0, 0);
+				ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+				ok1(tdb_check(tdb, NULL, NULL) == 0);
+			}
+
+			/* This should tip it over to sub-sub-hash. */
+			kdata = make_key(0, j, 0, 0, 0, 1);
+			ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+			ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+			ok1(find_and_lock(tdb, key, F_RDLCK, &h, &rec, NULL));
+			/* Should have created correct hash. */
+			ok1(h.h == tdb_hash(tdb, key.dptr, key.dsize));
+			/* Should have moved to sub-subhash */
+			ok1(h.group_start >= sizeof(struct tdb_header));
+			ok1(h.home_bucket == 1);
+			ok1(h.found_bucket == 1);
+			ok1(h.hash_used == TDB_TOPLEVEL_HASH_BITS
+			    + TDB_SUBLEVEL_HASH_BITS + TDB_SUBLEVEL_HASH_BITS);
+			ok1(tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range,
+					      F_RDLCK) == 0);
+		}
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-21-parse_record.c b/lib/tdb2/test/run-21-parse_record.c
new file mode 100644
index 00000000000..773cdff4e0c
--- /dev/null
+++ b/lib/tdb2/test/run-21-parse_record.c
@@ -0,0 +1,70 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static enum TDB_ERROR parse(TDB_DATA key, TDB_DATA data, TDB_DATA *expected)
+{
+ if (!tdb_deq(data, *expected))
+ return TDB_ERR_EINVAL;
+ return TDB_SUCCESS;
+}
+
+static enum TDB_ERROR parse_err(TDB_DATA key, TDB_DATA data, void *unused)
+{
+ return 100;
+}
+
+static bool test_records(struct tdb_context *tdb)
+{
+ int i;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) };
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
+ /* key and data both point at i, so changing i changes both. */
+ for (i = 0; i < 1000; i++) {
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ return false;
+ }
+
+ for (i = 0; i < 1000; i++) {
+ if (tdb_parse_record(tdb, key, parse, &data) != TDB_SUCCESS)
+ return false;
+ }
+ /* i is 1000 here: a key the store loop above never wrote. */
+ if (tdb_parse_record(tdb, key, parse, &data) != TDB_ERR_NOEXIST)
+ return false;
+
+ /* Test error return from parse function. */
+ i = 0;
+ if (tdb_parse_record(tdb, key, parse_err, NULL) != 100)
+ return false;
+
+ return true;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Two checks per flag combination, plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		tdb = tdb_open("run-21-parse_record.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue; /* open failed: nothing to test or close */
+		ok1(test_records(tdb));
+		tdb_close(tdb);
+	}
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-25-hashoverload.c b/lib/tdb2/test/run-25-hashoverload.c
new file mode 100644
index 00000000000..83f549d6b26
--- /dev/null
+++ b/lib/tdb2/test/run-25-hashoverload.c
@@ -0,0 +1,121 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static uint64_t badhash(const void *key, size_t len, uint64_t seed, void *priv)
+{
+ return 0;
+}
+
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
+{
+ if (p)
+ return tdb_delete(tdb, key);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ struct tdb_data key = { (unsigned char *)&j, sizeof(j) };
+ struct tdb_data dbuf = { (unsigned char *)&j, sizeof(j) };
+ union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = badhash } };
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT,
+ };
+
+ hattr.base.next = &tap_log_attr;
+
+ plan_tests(6883);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+
+ tdb = tdb_open("run-25-hashoverload.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* Fill a group. */
+ for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+ ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+ }
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Now store one last value: should form chain. */
+ ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Check we can find them all. */
+ for (j = 0; j < (1 << TDB_HASH_GROUP_BITS) + 1; j++) {
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == sizeof(j));
+ ok1(d.dptr != NULL);
+ ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+ free(d.dptr);
+ }
+
+ /* Now add a *lot* more. */
+ for (j = (1 << TDB_HASH_GROUP_BITS) + 1;
+ j < (16 << TDB_HASH_GROUP_BITS);
+ j++) {
+ ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == sizeof(j));
+ ok1(d.dptr != NULL);
+ ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+ free(d.dptr);
+ }
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Traverse through them. */
+ ok1(tdb_traverse(tdb, trav, NULL) == j);
+
+ /* Empty the first chain-worth. */
+ for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++)
+ ok1(tdb_delete(tdb, key) == 0);
+
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ for (j = (1 << TDB_HASH_GROUP_BITS);
+ j < (16 << TDB_HASH_GROUP_BITS);
+ j++) {
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(d.dsize == sizeof(j));
+ ok1(d.dptr != NULL);
+ ok1(d.dptr && memcmp(d.dptr, &j, d.dsize) == 0);
+ free(d.dptr);
+ }
+
+ /* Traverse through them. */
+ ok1(tdb_traverse(tdb, trav, NULL)
+ == (15 << TDB_HASH_GROUP_BITS));
+
+ /* Re-add */
+ for (j = 0; j < (1 << TDB_HASH_GROUP_BITS); j++) {
+ ok1(tdb_store(tdb, key, dbuf, TDB_INSERT) == 0);
+ }
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Now try deleting as we go. */
+ ok1(tdb_traverse(tdb, trav, trav)
+ == (16 << TDB_HASH_GROUP_BITS));
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_traverse(tdb, trav, NULL) == 0);
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-30-exhaust-before-expand.c b/lib/tdb2/test/run-30-exhaust-before-expand.c
new file mode 100644
index 00000000000..2386f85f26f
--- /dev/null
+++ b/lib/tdb2/test/run-30-exhaust-before-expand.c
@@ -0,0 +1,79 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <err.h>
+#include "logging.h"
+
+static bool empty_freetable(struct tdb_context *tdb)
+{
+ struct tdb_freetable ftab;
+ unsigned int i;
+
+ /* Now, free table should be completely exhausted in zone 0 */
+ if (tdb_read_convert(tdb, tdb->ftable_off, &ftab, sizeof(ftab)) != 0)
+ abort();
+
+ for (i = 0; i < sizeof(ftab.buckets)/sizeof(ftab.buckets[0]); i++) {
+ if (ftab.buckets[i])
+ return false;
+ }
+ return true;
+}
+
+
+int main(int argc, char *argv[])
+{
+	unsigned int i, j;
+	struct tdb_context *tdb;
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	/* Nine checks per flag combination, plus the final log check. */
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 9 + 1);
+
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		TDB_DATA k;
+		uint64_t size;
+		bool was_empty = false;
+
+		k.dptr = (void *)&j;
+		k.dsize = sizeof(j);
+
+		tdb = tdb_open("run-30-exhaust-before-expand.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		if (!tdb)
+			continue;
+
+		ok1(empty_freetable(tdb));
+		/* Need some hash lock for expand. */
+		ok1(tdb_lock_hashes(tdb, 0, 1, F_WRLCK, TDB_LOCK_WAIT) == 0);
+		/* Create some free space. */
+		ok1(tdb_expand(tdb, 1) == 0);
+		ok1(tdb_unlock_hashes(tdb, 0, 1, F_WRLCK) == 0);
+		ok1(tdb_check(tdb, NULL, NULL) == 0);
+		ok1(!empty_freetable(tdb));
+
+		size = tdb->file->map_size;
+		/* Insert minimal-length records until we expand. */
+		for (j = 0; tdb->file->map_size == size; j++) {
+			was_empty = empty_freetable(tdb);
+			if (tdb_store(tdb, k, k, TDB_INSERT) != 0)
+				err(1, "Failed to store record %u", j);
+		}
+
+		/* Would have been empty before expansion, but no longer. */
+		ok1(was_empty);
+		ok1(!empty_freetable(tdb));
+		tdb_close(tdb);
+	}
+
+	ok1(tap_log_messages == 0);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-50-multiple-freelists.c b/lib/tdb2/test/run-50-multiple-freelists.c
new file mode 100644
index 00000000000..7a48c3e0eee
--- /dev/null
+++ b/lib/tdb2/test/run-50-multiple-freelists.c
@@ -0,0 +1,71 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include <ccan/tdb2/transaction.c>
+#include "logging.h"
+#include "layout.h"
+
+int main(int argc, char *argv[])
+{
+ tdb_off_t off;
+ struct tdb_context *tdb;
+ struct tdb_layout *layout;
+ TDB_DATA key, data;
+
+ plan_tests(11);
+ key = tdb_mkdata("Hello", 5);
+ data = tdb_mkdata("world", 5);
+
+ /* Create a TDB with three free tables. */
+ layout = new_tdb_layout(NULL);
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_freetable(layout);
+ tdb_layout_add_free(layout, 80, 0);
+ /* Used record prevent coalescing. */
+ tdb_layout_add_used(layout, key, data, 6);
+ tdb_layout_add_free(layout, 160, 1);
+ key.dsize--;
+ tdb_layout_add_used(layout, key, data, 7);
+ tdb_layout_add_free(layout, 320, 2);
+ key.dsize--;
+ tdb_layout_add_used(layout, key, data, 8);
+ tdb_layout_add_free(layout, 40, 0);
+ tdb = tdb_layout_get(layout);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ off = get_free(tdb, 0, 80 - sizeof(struct tdb_used_record), 0,
+ TDB_USED_MAGIC, 0);
+ ok1(off == layout->elem[3].base.off);
+ ok1(tdb->ftable_off == layout->elem[0].base.off);
+
+ off = get_free(tdb, 0, 160 - sizeof(struct tdb_used_record), 0,
+ TDB_USED_MAGIC, 0);
+ ok1(off == layout->elem[5].base.off);
+ ok1(tdb->ftable_off == layout->elem[1].base.off);
+
+ off = get_free(tdb, 0, 320 - sizeof(struct tdb_used_record), 0,
+ TDB_USED_MAGIC, 0);
+ ok1(off == layout->elem[7].base.off);
+ ok1(tdb->ftable_off == layout->elem[2].base.off);
+
+ off = get_free(tdb, 0, 40 - sizeof(struct tdb_used_record), 0,
+ TDB_USED_MAGIC, 0);
+ ok1(off == layout->elem[9].base.off);
+ ok1(tdb->ftable_off == layout->elem[0].base.off);
+
+ /* Now we fail. */
+ off = get_free(tdb, 0, 0, 1, TDB_USED_MAGIC, 0);
+ ok1(off == 0);
+
+ tdb_close(tdb);
+ tdb_layout_free(layout);
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-55-transaction.c b/lib/tdb2/test/run-55-transaction.c
new file mode 100644
index 00000000000..1650e40e1f3
--- /dev/null
+++ b/lib/tdb2/test/run-55-transaction.c
@@ -0,0 +1,75 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ unsigned char *buffer;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data;
+
+ buffer = malloc(1000);
+ for (i = 0; i < 1000; i++)
+ buffer[i] = i;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 20 + 1);
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-55-transaction.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ ok1(tdb_transaction_start(tdb) == 0);
+ data.dptr = buffer;
+ data.dsize = 1000;
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+ ok1(data.dsize == 1000);
+ ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+ free(data.dptr);
+
+ /* Cancelling a transaction means no store */
+ tdb_transaction_cancel(tdb);
+ ok1(tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_ERR_NOEXIST);
+
+ /* Commit the transaction. */
+ ok1(tdb_transaction_start(tdb) == 0);
+ data.dptr = buffer;
+ data.dsize = 1000;
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+ ok1(data.dsize == 1000);
+ ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+ free(data.dptr);
+ ok1(tdb_transaction_commit(tdb) == 0);
+ ok1(tdb->file->allrecord_lock.count == 0
+ && tdb->file->num_lockrecs == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_fetch(tdb, key, &data) == TDB_SUCCESS);
+ ok1(data.dsize == 1000);
+ ok1(memcmp(data.dptr, buffer, data.dsize) == 0);
+ free(data.dptr);
+
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ free(buffer);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-56-open-during-transaction.c b/lib/tdb2/test/run-56-open-during-transaction.c
new file mode 100644
index 00000000000..96107d637e4
--- /dev/null
+++ b/lib/tdb2/test/run-56-open-during-transaction.c
@@ -0,0 +1,175 @@
+#include "config.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static struct agent *agent;
+static bool opened;
+static int errors = 0;
+#define TEST_DBNAME "run-56-open-during-transaction.tdb"
+
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+/*
+ * Return true when the first @len bytes of @snapshot and @latest are
+ * identical.  A zero (or negative) length compares equal, as before.
+ *
+ * Uses memcmp() rather than a hand-rolled loop: the previous 'unsigned'
+ * counter could never reach an off_t length above UINT_MAX.
+ */
+static bool is_same(const char *snapshot, const char *latest, off_t len)
+{
+	if (len <= 0)
+		return true;
+
+	return memcmp(snapshot, latest, (size_t)len) == 0;
+}
+
+/*
+ * Compare what is currently in @fd with an in-memory snapshot.
+ * Returns true only when the file holds exactly @snapshot_len bytes and
+ * they all match @snapshot.
+ */
+static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
+{
+	bool matches = false;
+	/* Request one byte more than expected: if the file has grown, the
+	 * read count no longer equals snapshot_len, so the length is
+	 * checked for free. */
+	char *latest = malloc(snapshot_len+1);
+
+	if (pread(fd, latest, snapshot_len+1, 0) == snapshot_len)
+		matches = is_same(snapshot, latest, snapshot_len);
+
+	free(latest);
+	return matches;
+}
+
+/*
+ * Snapshot the file behind @fd, ask the external agent to open the
+ * database, and verify the file is byte-for-byte unchanged afterwards.
+ *
+ * The agent may either succeed in opening (it is then asked to close
+ * again) or report WOULD_HAVE_BLOCKED; anything else, or any change to
+ * the file, bumps the global 'errors' counter and emits a diag() line.
+ */
+static void check_file_intact(int fd)
+{
+	enum agent_return ret;
+	struct stat st;
+	char *contents;
+
+	/* Without a valid size there is nothing sensible to snapshot. */
+	if (fstat(fd, &st) != 0) {
+		diag("Stat fail");
+		errors++;
+		return;
+	}
+
+	/* A failed malloc is caught below: pread() into NULL cannot return
+	 * st.st_size for a non-empty file. */
+	contents = malloc(st.st_size);
+	if (pread(fd, contents, st.st_size, 0) != st.st_size) {
+		diag("Read fail");
+		errors++;
+		goto out;
+	}
+
+	/* Ask agent to open file. */
+	ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+
+	/* It's OK to open it, but it must not have changed! */
+	if (!compare_file(fd, contents, st.st_size)) {
+		diag("Agent changed file after opening %s",
+		     agent_return_name(ret));
+		errors++;
+	}
+
+	if (ret == SUCCESS) {
+		ret = external_agent_operation(agent, CLOSE, NULL);
+		if (ret != SUCCESS) {
+			diag("Agent failed to close tdb: %s",
+			     agent_return_name(ret));
+			errors++;
+		}
+	} else if (ret != WOULD_HAVE_BLOCKED) {
+		diag("Agent opening file gave %s",
+		     agent_return_name(ret));
+		errors++;
+	}
+
+out:
+	/* Single exit so the snapshot is released on every path. */
+	free(contents);
+}
+
+/* Unlock hook: once the database under test is open, probe the file
+ * through the agent after every unlock. */
+static void after_unlock(int fd)
+{
+	if (!opened)
+		return;
+
+	check_file_intact(fd);
+}
+
+/* pwrite() stand-in used by the tdb2 sources: while the database under
+ * test is open, probe the file through the agent before each write. */
+static ssize_t pwrite_check(int fd,
+			    const void *buf, size_t count, off_t offset)
+{
+	if (!opened)
+		return pwrite(fd, buf, count, offset);
+
+	check_file_intact(fd);
+	return pwrite(fd, buf, count, offset);
+}
+
+/* write() stand-in used by the tdb2 sources: while the database under
+ * test is open, probe the file through the agent before each write. */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+	if (!opened)
+		return write(fd, buf, count);
+
+	check_file_intact(fd);
+	return write(fd, buf, count);
+}
+
+/* ftruncate() stand-in used by the tdb2 sources: while the database under
+ * test is open, probe the file through the agent before resizing it. */
+static int ftruncate_check(int fd, off_t length)
+{
+	if (!opened)
+		return ftruncate(fd, length);
+
+	check_file_intact(fd);
+	return ftruncate(fd, length);
+}
+
+/*
+ * For each flag combination: create a fresh database and commit a
+ * one-record transaction while the pwrite/write/ftruncate/unlock hooks are
+ * armed (opened == true).  Each hook asks the external agent to open the
+ * database and checks that the file was not altered; the accumulated
+ * 'errors' count must stay at zero.
+ */
+int main(int argc, char *argv[])
+{
+	const int flags[] = { TDB_DEFAULT,
+			      TDB_NOMMAP,
+			      TDB_CONVERT,
+			      TDB_CONVERT | TDB_NOMMAP };
+	unsigned int i;
+	struct tdb_context *tdb;
+	TDB_DATA key, data;
+
+	/* Five checks for each of the four flag combinations. */
+	plan_tests(20);
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	unlock_callback = after_unlock;
+	for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+		diag("Test with %s and %s\n",
+		     (flags[i] & TDB_CONVERT) ? "CONVERT" : "DEFAULT",
+		     (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap");
+		unlink(TEST_DBNAME);
+		tdb = tdb_open(TEST_DBNAME, flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		ok1(tdb);
+		/* The failure is already recorded; don't dereference NULL. */
+		if (!tdb)
+			continue;
+
+		opened = true;
+		ok1(tdb_transaction_start(tdb) == 0);
+		key = tdb_mkdata("hi", strlen("hi"));
+		data = tdb_mkdata("world", strlen("world"));
+
+		ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+		ok1(tdb_transaction_commit(tdb) == 0);
+		ok(!errors, "We had %u open errors", errors);
+
+		/* Disarm the hooks before closing. */
+		opened = false;
+		tdb_close(tdb);
+	}
+
+	/* Release the helper process, as the other agent-based tests do. */
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-57-die-during-transaction.c b/lib/tdb2/test/run-57-die-during-transaction.c
new file mode 100644
index 00000000000..84f01eb21a8
--- /dev/null
+++ b/lib/tdb2/test/run-57-die-during-transaction.c
@@ -0,0 +1,275 @@
+#include "config.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <assert.h>
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+/* There's a malloc inside transaction_setup_recovery, and valgrind complains
+ * when we longjmp and leak it. */
+#define MAX_ALLOCATIONS 200
+static void *allocated[MAX_ALLOCATIONS];
+
+/*
+ * malloc() stand-in that records the returned pointer in the first empty
+ * slot of allocated[], so free_all() can release it later.
+ * Aborts the test when all MAX_ALLOCATIONS slots are taken.
+ */
+static void *malloc_noleak(size_t len)
+{
+	unsigned int slot = 0;
+
+	/* Find the first unused slot. */
+	while (slot < MAX_ALLOCATIONS && allocated[slot])
+		slot++;
+
+	if (slot == MAX_ALLOCATIONS) {
+		diag("Too many allocations!");
+		abort();
+	}
+
+	allocated[slot] = malloc(len);
+	return allocated[slot];
+}
+
+/*
+ * free() stand-in: forget @p in allocated[] (first matching slot only),
+ * then release it.
+ */
+static void free_noleak(void *p)
+{
+	unsigned int slot;
+
+	/* realloc is not intercepted, so an untracked pointer is fine:
+	 * the search simply falls off the end. */
+	for (slot = 0; slot < MAX_ALLOCATIONS && allocated[slot] != p; slot++)
+		;
+
+	if (slot < MAX_ALLOCATIONS)
+		allocated[slot] = NULL;
+
+	free(p);
+}
+
+/* Release every pointer still recorded in allocated[] and empty the table. */
+static void free_all(void)
+{
+	void **slot;
+
+	for (slot = allocated; slot < allocated + MAX_ALLOCATIONS; slot++) {
+		free(*slot);
+		*slot = NULL;
+	}
+}
+
+#define malloc malloc_noleak
+#define free free_noleak
+
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#undef malloc
+#undef free
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include <setjmp.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static bool in_transaction;
+static int target, current;
+static jmp_buf jmpbuf;
+#define TEST_DBNAME "run-57-die-during-transaction.tdb"
+#define KEY_STRING "helloworld"
+
+/*
+ * Hook point: inside a transaction, count this call and, when the count
+ * hits 'target', jump back to the setjmp() in test_death().
+ * 'current' is only advanced while in_transaction is set.
+ */
+static void maybe_die(int fd)
+{
+	if (!in_transaction)
+		return;
+
+	if (current++ != target)
+		return;
+
+	longjmp(jmpbuf, 1);
+}
+
+/*
+ * pwrite() stand-in with a possible death point before the write and,
+ * when the write was complete, another one after it.
+ */
+static ssize_t pwrite_check(int fd,
+			    const void *buf, size_t count, off_t offset)
+{
+	ssize_t written;
+
+	maybe_die(fd);
+
+	written = pwrite(fd, buf, count, offset);
+	/* Short or failed writes are handed straight back to the caller. */
+	if (written == count)
+		maybe_die(fd);
+
+	return written;
+}
+
+/*
+ * write() stand-in with a possible death point before the write and,
+ * when the write was complete, another one after it.
+ */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+	ssize_t written;
+
+	maybe_die(fd);
+
+	written = write(fd, buf, count);
+	/* Short or failed writes are handed straight back to the caller. */
+	if (written == count)
+		maybe_die(fd);
+
+	return written;
+}
+
+/* ftruncate() stand-in with a possible death point on either side of the
+ * real call, whatever its result. */
+static int ftruncate_check(int fd, off_t length)
+{
+	int rc;
+
+	maybe_die(fd);
+	rc = ftruncate(fd, length);
+	maybe_die(fd);
+
+	return rc;
+}
+
+static bool test_death(enum operation op, struct agent *agent) /*
+ * Simulate this process dying at each hook point of a transaction and
+ * check that the external agent can still use the database afterwards.
+ *
+ * Every pass recreates TEST_DBNAME and stores one record inside a
+ * transaction; maybe_die() longjmps back to the setjmp() below once
+ * 'current' reaches 'target'.  After each simulated death the agent must
+ * complete 'op', then report no pending recovery, pass CHECK and CLOSE.
+ * 'target' is then bumped and the pass repeated, until a pass commits
+ * without dying.
+ *
+ * Returns true when that final pass completes and the agent closes
+ * cleanly, false on failure.  Exits via errx() if the agent cannot open
+ * the database or fetch the seed key.
+ */
+{
+ struct tdb_context *tdb = NULL;
+ TDB_DATA key;
+ enum agent_return ret;
+ int needed_recovery = 0;
+
+ current = target = 0; /* first pass dies at the first hook call */
+reset:
+ unlink(TEST_DBNAME);
+ tdb = tdb_open(TEST_DBNAME, TDB_NOMMAP,
+ O_CREAT|O_TRUNC|O_RDWR, 0600, &tap_log_attr);
+ if (!tdb) {
+ diag("Failed opening TDB: %s", strerror(errno));
+ return false;
+ }
+
+ if (setjmp(jmpbuf) != 0) {
+ /* We're partway through. Simulate our death: drop the fd and
+  * the lock bookkeeping without any orderly tdb shutdown. */
+ close(tdb->file->fd);
+ forget_locking();
+ in_transaction = false;
+
+ ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+ if (ret == SUCCESS)
+ needed_recovery++;
+ else if (ret != FAILED) {
+ diag("Step %u agent NEEDS_RECOVERY = %s", current,
+ agent_return_name(ret));
+ return false;
+ }
+
+ ret = external_agent_operation(agent, op, KEY_STRING);
+ if (ret != SUCCESS) {
+ diag("Step %u op %s failed = %s", current,
+ operation_name(op),
+ agent_return_name(ret));
+ return false;
+ }
+
+ ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+ if (ret != FAILED) {
+ diag("Still needs recovery after step %u = %s",
+ current, agent_return_name(ret));
+ return false;
+ }
+
+ ret = external_agent_operation(agent, CHECK, "");
+ if (ret != SUCCESS) {
+ diag("Step %u check failed = %s", current,
+ agent_return_name(ret));
+ return false;
+ }
+
+ ret = external_agent_operation(agent, CLOSE, "");
+ if (ret != SUCCESS) {
+ diag("Step %u close failed = %s", current,
+ agent_return_name(ret));
+ return false;
+ }
+
+ /* Suppress logging as this tries to use closed fd. */
+ suppress_logging = true;
+ suppress_lockcheck = true;
+ tdb_close(tdb);
+ suppress_logging = false;
+ suppress_lockcheck = false;
+ target++; /* next pass dies one hook call later */
+ current = 0;
+ free_all(); /* release allocations abandoned by the longjmp */
+ goto reset;
+ }
+
+ /* Put key for agent to fetch. */
+ key = tdb_mkdata(KEY_STRING, strlen(KEY_STRING));
+ if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
+ return false;
+
+ /* This is the key we insert in transaction: KEY_STRING minus its
+  * last byte, so it differs from the record stored above. */
+ key.dsize--;
+
+ ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+ if (ret != SUCCESS)
+ errx(1, "Agent failed to open: %s", agent_return_name(ret));
+
+ ret = external_agent_operation(agent, FETCH, KEY_STRING);
+ if (ret != SUCCESS)
+ errx(1, "Agent failed find key: %s", agent_return_name(ret));
+
+ in_transaction = true; /* arms maybe_die() from here on */
+ if (tdb_transaction_start(tdb) != 0)
+ return false;
+
+ if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
+ return false;
+
+ if (tdb_transaction_commit(tdb) != 0)
+ return false;
+
+ in_transaction = false;
+
+ /* We made it! */
+ diag("Completed %u runs", current);
+ tdb_close(tdb);
+ ret = external_agent_operation(agent, CLOSE, "");
+ if (ret != SUCCESS) {
+ diag("Step %u close failed = %s", current,
+ agent_return_name(ret));
+ return false;
+ }
+
+ ok1(needed_recovery); /* at least one death must have required recovery */
+ ok1(locking_errors == 0);
+ ok1(forget_locking() == 0);
+ locking_errors = 0;
+ return true;
+}
+
+/*
+ * Run the die-during-transaction scenario once for each operation the
+ * agent is asked to perform after the simulated death.
+ */
+int main(int argc, char *argv[])
+{
+	enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
+	struct agent *agent;
+	int i = 0;
+
+	/* Per operation: the ok1() here plus three inside test_death(). */
+	plan_tests(12);
+	unlock_callback = maybe_die;
+
+	agent = prepare_external_agent();
+	if (!agent)
+		err(1, "preparing agent");
+
+	while (i < sizeof(ops)/sizeof(ops[0])) {
+		diag("Testing %s after death", operation_name(ops[i]));
+		ok1(test_death(ops[i], agent));
+		i++;
+	}
+
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-64-bit-tdb.c b/lib/tdb2/test/run-64-bit-tdb.c
new file mode 100644
index 00000000000..78dadca0164
--- /dev/null
+++ b/lib/tdb2/test/run-64-bit-tdb.c
@@ -0,0 +1,80 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[]) /*
+ * Check that records can live beyond the 4G offset: grow the file to
+ * 0xFFFFFFF0 bytes as a sparse file, expand past that point, then store,
+ * look up, fetch and delete a record, running tdb_check() along the way.
+ * When off_t is 4 bytes or smaller a single trivial pass is reported.
+ */
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ if (sizeof(off_t) <= 4) {
+ plan_tests(1);
+ pass("No 64 bit off_t");
+ return exit_status();
+ }
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ off_t old_size;
+ TDB_DATA k, d;
+ struct hash_info h;
+ struct tdb_used_record rec;
+ tdb_off_t off;
+
+ tdb = tdb_open("run-64-bit-tdb.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ old_size = tdb->file->map_size;
+
+ /* This makes a sparse file; the new space is then handed to the
+  * free list as one record. */
+ ok1(ftruncate(tdb->file->fd, 0xFFFFFFF0) == 0);
+ ok1(add_free_record(tdb, old_size, 0xFFFFFFF0 - old_size,
+ TDB_LOCK_WAIT, false) == TDB_SUCCESS);
+
+ /* Now add a little record past the 4G barrier. */
+ ok1(tdb_expand_file(tdb, 100) == TDB_SUCCESS);
+ ok1(add_free_record(tdb, 0xFFFFFFF0, 100, TDB_LOCK_WAIT, false)
+ == TDB_SUCCESS);
+
+ ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
+
+ /* Test allocation path.  The lengths 4 and 5 include the
+  * terminating NUL, which the strcmp() below relies on. */
+ k = tdb_mkdata("key", 4);
+ d = tdb_mkdata("data", 5);
+ ok1(tdb_store(tdb, k, d, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
+
+ /* Make sure it put it at end as we expected. */
+ off = find_and_lock(tdb, k, F_RDLCK, &h, &rec, NULL);
+ ok1(off >= 0xFFFFFFF0);
+ tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+
+ ok1(tdb_fetch(tdb, k, &d) == 0);
+ ok1(d.dsize == 5);
+ ok1(strcmp((char *)d.dptr, "data") == 0);
+ free(d.dptr);
+
+ ok1(tdb_delete(tdb, k) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == TDB_SUCCESS);
+
+ tdb_close(tdb);
+ }
+
+ /* We might get messages about mmap failing, so don't test
+ * tap_log_messages */
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-80-tdb_fd.c b/lib/tdb2/test/run-80-tdb_fd.c
new file mode 100644
index 00000000000..e8b2fae2dd6
--- /dev/null
+++ b/lib/tdb2/test/run-80-tdb_fd.c
@@ -0,0 +1,35 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/*
+ * Check tdb_fd(): the test expects -1 for TDB_INTERNAL databases and a
+ * descriptor above the three standard streams for file-backed ones.
+ */
+int main(int argc, char *argv[])
+{
+	int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+			TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+			TDB_NOMMAP|TDB_CONVERT };
+	const unsigned int num_flags = sizeof(flags) / sizeof(flags[0]);
+	struct tdb_context *tdb;
+	unsigned int i = 0;
+
+	/* Three checks per flag combination. */
+	plan_tests(num_flags * 3);
+	for (; i < num_flags; i++) {
+		tdb = tdb_open("run-new_database.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+		if (!ok1(tdb))
+			continue;
+
+		if (!(flags[i] & TDB_INTERNAL))
+			ok1(tdb_fd(tdb) > 2);
+		else
+			ok1(tdb_fd(tdb) == -1);
+
+		tdb_close(tdb);
+		ok1(tap_log_messages == 0);
+	}
+
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-81-seqnum.c b/lib/tdb2/test/run-81-seqnum.c
new file mode 100644
index 00000000000..6e8b2698b6f
--- /dev/null
+++ b/lib/tdb2/test/run-81-seqnum.c
@@ -0,0 +1,71 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[]) /*
+ * Check tdb_get_seqnum() on databases opened with TDB_SEQNUM.  The test
+ * expects the number to start at 0, to advance by one on every store,
+ * append, delete and wipe, to be left alone by fetch, to keep its
+ * in-transaction value after commit and to fall back after cancel.
+ */
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data = tdb_mkdata("data", 4);
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 15 + 4 * 13); /* 15 checks per flag set, plus 13 transaction checks for the 4 non-INTERNAL sets */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-new_database.tdb", flags[i]|TDB_SEQNUM,
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ if (!ok1(tdb))
+ continue;
+
+ ok1(tdb_get_seqnum(tdb) == 0);
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_get_seqnum(tdb) == 1);
+ /* Fetch doesn't change seqnum */
+ if (ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS))
+ free(d.dptr);
+ ok1(tdb_get_seqnum(tdb) == 1);
+ ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 2);
+
+ ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 3);
+ /* Empty append works: the key was deleted just above */
+ ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 4);
+
+ ok1(tdb_wipe_all(tdb) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 5);
+
+ if (!(flags[i] & TDB_INTERNAL)) { /* transaction checks are skipped for TDB_INTERNAL */
+ ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_get_seqnum(tdb) == 6);
+ ok1(tdb_append(tdb, key, data) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 7);
+ ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 8);
+ ok1(tdb_transaction_commit(tdb) == TDB_SUCCESS);
+ ok1(tdb_get_seqnum(tdb) == 8);
+
+ ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_get_seqnum(tdb) == 9);
+ tdb_transaction_cancel(tdb);
+ ok1(tdb_get_seqnum(tdb) == 8);
+ }
+ tdb_close(tdb);
+ ok1(tap_log_messages == 0);
+ }
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-82-lockattr.c b/lib/tdb2/test/run-82-lockattr.c
new file mode 100644
index 00000000000..bfc2653222f
--- /dev/null
+++ b/lib/tdb2/test/run-82-lockattr.c
@@ -0,0 +1,263 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/*
+ * Lock hook for TDB_ATTRIBUTE_FLOCK.  @_err points at an int: when that
+ * int is non-zero the call fails immediately with errno set to it.
+ * Otherwise a fcntl() lock of type @rw is taken over [@off, @off + @len),
+ * blocking if @waitflag is set, and retried while interrupted by EINTR.
+ * Returns the fcntl() result, or -1 for a forced failure.
+ */
+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		  void *_err)
+{
+	const int *forced_errno = _err;
+	const int cmd = waitflag ? F_SETLKW : F_SETLK;
+	struct flock fl;
+	int ret;
+
+	if (*forced_errno != 0) {
+		errno = *forced_errno;
+		return -1;
+	}
+
+	for (;;) {
+		fl.l_type = rw;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, cmd, &fl);
+		if (ret == 0 || errno != EINTR)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Unlock hook for TDB_ATTRIBUTE_FLOCK.  @_err points at an int: when that
+ * int is non-zero the call fails immediately with errno set to it.
+ * Otherwise the range [@off, @off + @len) is unlocked with fcntl(),
+ * retrying while interrupted by EINTR.  @rw is not used.
+ * Returns the fcntl() result, or -1 for a forced failure.
+ */
+static int myunlock(int fd, int rw, off_t off, off_t len, void *_err)
+{
+	const int *forced_errno = _err;
+	struct flock fl;
+	int ret;
+
+	if (*forced_errno != 0) {
+		errno = *forced_errno;
+		return -1;
+	}
+
+	for (;;) {
+		fl.l_type = F_UNLCK;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = off;
+		fl.l_len = len;
+
+		ret = fcntl(fd, F_SETLKW, &fl);
+		if (ret == 0 || errno != EINTR)
+			break;
+	}
+
+	return ret;
+}
+
+/* Value that trav() writes through its last argument on each callback. */
+static int trav_err;
+
+/*
+ * Traverse callback: copy trav_err into *@err and return 0.  The record
+ * passed in is ignored.
+ */
+static int trav(struct tdb_context *tdb, TDB_DATA k, TDB_DATA d, int *err)
+{
+	int injected = trav_err;
+
+	*err = injected;
+	return 0;
+}
+
+int main(int argc, char *argv[]) /*
+ * Drive the tdb locking entry points through the TDB_ATTRIBUTE_FLOCK
+ * hooks (mylock/myunlock) with a forced errno held in lock_err.
+ * The test expects EAGAIN and EINTR to fail without a log message and
+ * ENOMEM to fail with one; tap_log_messages is reset after each ENOMEM
+ * case so later checks start from zero.
+ */
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+ union tdb_attribute lock_attr;
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data = tdb_mkdata("data", 4);
+ int lock_err;
+
+ lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+ lock_attr.base.next = &tap_log_attr;
+ lock_attr.flock.lock = mylock;
+ lock_attr.flock.unlock = myunlock;
+ lock_attr.flock.data = &lock_err; /* the hooks read the forced errno through this */
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 80);
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ struct tdb_data d;
+
+ /* Nonblocking open; expect no error message. */
+ lock_err = EAGAIN;
+ tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+ ok(errno == lock_err, "Errno is %u", errno);
+ ok1(!tdb);
+ ok1(tap_log_messages == 0);
+
+ lock_err = EINTR;
+ tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+ ok(errno == lock_err, "Errno is %u", errno);
+ ok1(!tdb);
+ ok1(tap_log_messages == 0);
+
+ /* Forced fail open. */
+ lock_err = ENOMEM;
+ tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+ ok1(errno == lock_err);
+ ok1(!tdb);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ lock_err = 0;
+ tdb = tdb_open("run-82-lockattr.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &lock_attr);
+ if (!ok1(tdb))
+ continue;
+ ok1(tap_log_messages == 0);
+
+ /* Nonblocking store. */
+ lock_err = EAGAIN;
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ /* Nonblocking fetch. */
+ lock_err = EAGAIN;
+ ok1(!tdb_exists(tdb, key));
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(!tdb_exists(tdb, key));
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(!tdb_exists(tdb, key));
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ lock_err = EAGAIN;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ /* Nonblocking delete. */
+ lock_err = EAGAIN;
+ ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_delete(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ /* Nonblocking locks. */
+ lock_err = EAGAIN;
+ ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ lock_err = EAGAIN;
+ ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_chainlock_read(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ lock_err = EAGAIN;
+ ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_lockall(tdb) == TDB_ERR_LOCK);
+ /* This actually does divide and conquer, so the number of
+  * messages is not pinned to exactly one. */
+ ok1(tap_log_messages > 0);
+ tap_log_messages = 0;
+
+ lock_err = EAGAIN;
+ ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_lockall_read(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages > 0);
+ tap_log_messages = 0;
+
+ /* Nonblocking traverse; go nonblock partway through.
+  * trav() copies trav_err into lock_err from inside the
+  * callback, so lock_err is cleared before each traverse. */
+ lock_err = 0;
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+ trav_err = EAGAIN;
+ ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ trav_err = EINTR;
+ lock_err = 0;
+ ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ trav_err = ENOMEM;
+ lock_err = 0;
+ ok1(tdb_traverse(tdb, trav, &lock_err) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ /* Nonblocking transactions. */
+ lock_err = EAGAIN;
+ ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = EINTR;
+ ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+ lock_err = ENOMEM;
+ ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1);
+ tap_log_messages = 0;
+
+ /* Nonblocking transaction prepare. */
+ lock_err = 0;
+ ok1(tdb_transaction_start(tdb) == 0);
+ ok1(tdb_delete(tdb, key) == 0);
+
+ lock_err = EAGAIN;
+ ok1(tdb_transaction_prepare_commit(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+
+ lock_err = 0;
+ ok1(tdb_transaction_prepare_commit(tdb) == 0);
+ ok1(tdb_transaction_commit(tdb) == 0);
+
+ /* And the transaction was committed, right? */
+ ok1(!tdb_exists(tdb, key));
+ tdb_close(tdb);
+ ok1(tap_log_messages == 0);
+ }
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-83-openhook.c b/lib/tdb2/test/run-83-openhook.c
new file mode 100644
index 00000000000..320be7d4da1
--- /dev/null
+++ b/lib/tdb2/test/run-83-openhook.c
@@ -0,0 +1,98 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include "external-agent.h"
+#include "logging.h"
+
+/*
+ * Open hook implementing "clear if first".  Every opener keeps a read
+ * lock on byte 63, so a successful non-blocking write lock on that byte
+ * means nobody else has the file open: in that case the file is truncated
+ * to zero length.  Either way a read lock on the byte is then taken.
+ *
+ * @arg must be this function's own address, otherwise TDB_ERR_CORRUPT is
+ * returned.  Returns TDB_ERR_IO if truncating or read-locking fails,
+ * TDB_SUCCESS otherwise.
+ */
+static enum TDB_ERROR clear_if_first(int fd, void *arg)
+{
+	struct flock fl;
+
+	if (arg != clear_if_first)
+		return TDB_ERR_CORRUPT;
+
+	/* The single marker byte shared by both lock requests below. */
+	fl.l_whence = SEEK_SET;
+	fl.l_start = 63;
+	fl.l_len = 1;
+
+	fl.l_type = F_WRLCK;
+	if (fcntl(fd, F_SETLK, &fl) == 0) {
+		/* We must be first ones to open it! */
+		diag("truncating file!");
+		if (ftruncate(fd, 0) != 0)
+			return TDB_ERR_IO;
+	}
+
+	fl.l_type = F_RDLCK;
+	return fcntl(fd, F_SETLKW, &fl) != 0 ? TDB_ERR_IO : TDB_SUCCESS;
+}
+
+/*
+ * Check the TDB_ATTRIBUTE_OPENHOOK mechanism using clear_if_first():
+ * opening with the hook when nobody else has the file open must wipe it,
+ * an agent opening with the hook while we still hold it open must not,
+ * and once we close, the agent's next hooked open must wipe it again.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int i;
+	struct tdb_context *tdb;
+	struct agent *agent;
+	union tdb_attribute cif;
+	struct tdb_data key = tdb_mkdata("key", 3);
+	int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+			TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+
+	cif.openhook.base.attr = TDB_ATTRIBUTE_OPENHOOK;
+	cif.openhook.base.next = &tap_log_attr;
+	cif.openhook.fn = clear_if_first;
+	/* The hook checks that it is handed back its own address. */
+	cif.openhook.data = clear_if_first;
+
+	agent = prepare_external_agent();
+	/* Every agent call below needs a live agent; bail out early like
+	 * the other agent-based tests rather than passing NULL along. */
+	if (!agent)
+		err(1, "preparing agent");
+
+	plan_tests(sizeof(flags) / sizeof(flags[0]) * 13);
+	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+		/* Create it */
+		tdb = tdb_open("run-83-openhook.tdb", flags[i],
+			       O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
+		ok1(tdb);
+		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
+		tdb_close(tdb);
+
+		/* Now, open with CIF, should clear it. */
+		tdb = tdb_open("run-83-openhook.tdb", flags[i],
+			       O_RDWR, 0, &cif);
+		ok1(tdb);
+		ok1(!tdb_exists(tdb, key));
+		ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0);
+
+		/* Agent should not clear it, since it's still open. */
+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
+					     "run-83-openhook.tdb") == SUCCESS);
+		ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS);
+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
+
+		/* Still exists for us too. */
+		ok1(tdb_exists(tdb, key));
+
+		/* Close it, now agent should clear it. */
+		tdb_close(tdb);
+
+		ok1(external_agent_operation(agent, OPEN_WITH_HOOK,
+					     "run-83-openhook.tdb") == SUCCESS);
+		ok1(external_agent_operation(agent, FETCH, "key") == FAILED);
+		ok1(external_agent_operation(agent, CLOSE, "") == SUCCESS);
+
+		ok1(tap_log_messages == 0);
+	}
+
+	free_external_agent(agent);
+	return exit_status();
+}
diff --git a/lib/tdb2/test/run-90-get-set-attributes.c b/lib/tdb2/test/run-90-get-set-attributes.c
new file mode 100644
index 00000000000..159d8a01eaf
--- /dev/null
+++ b/lib/tdb2/test/run-90-get-set-attributes.c
@@ -0,0 +1,165 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+/* Placeholder lock hook: takes no lock and always reports success.  It is
+ * only here so its address can be set and read back as an attribute. */
+static int mylock(int fd, int rw, off_t off, off_t len, bool waitflag,
+		  void *unused)
+{
+	const int success = 0;
+
+	return success;
+}
+
+/* Placeholder unlock hook: releases nothing and always reports success. */
+static int myunlock(int fd, int rw, off_t off, off_t len, void *unused)
+{
+	const int success = 0;
+
+	return success;
+}
+
+/* Placeholder hash: every key hashes to 0, whatever the seed.  It is only
+ * here so its address can be set and read back as an attribute. */
+static uint64_t hash_fn(const void *key, size_t len, uint64_t seed,
+			void *priv)
+{
+	const uint64_t constant_hash = 0;
+
+	return constant_hash;
+}
+
+int main(int argc, char *argv[]) /*
+ * Check tdb_get_attribute(), tdb_set_attribute() and
+ * tdb_unset_attribute().  Each flag combination is tested twice: on a
+ * database opened with no attributes, and on one opened with the chain
+ * hash_attr -> seed_attr -> lock_attr -> tap_log_attr built below.
+ */
+{
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+ union tdb_attribute seed_attr;
+ union tdb_attribute hash_attr;
+ union tdb_attribute lock_attr;
+
+ hash_attr.base.attr = TDB_ATTRIBUTE_HASH;
+ hash_attr.base.next = &seed_attr;
+ hash_attr.hash.fn = hash_fn;
+ hash_attr.hash.data = &hash_attr;
+
+ seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+ seed_attr.base.next = &lock_attr;
+ seed_attr.seed.seed = 100;
+
+ lock_attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+ lock_attr.base.next = &tap_log_attr;
+ lock_attr.flock.lock = mylock;
+ lock_attr.flock.unlock = myunlock;
+ lock_attr.flock.data = &lock_attr;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 50);
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ union tdb_attribute attr;
+
+ /* First open with no attributes. */
+ tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, NULL);
+ ok1(tdb);
+
+ /* Get log on no attributes will fail */
+ attr.base.attr = TDB_ATTRIBUTE_LOG;
+ ok1(tdb_get_attribute(tdb, &attr) == TDB_ERR_NOEXIST);
+ /* These always work. */
+ attr.base.attr = TDB_ATTRIBUTE_HASH;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
+ ok1(attr.hash.fn == jenkins_hash);
+ attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
+ ok1(attr.flock.lock == tdb_fcntl_lock);
+ ok1(attr.flock.unlock == tdb_fcntl_unlock);
+ attr.base.attr = TDB_ATTRIBUTE_SEED;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
+ /* This is possible, just astronomically unlikely. */
+ ok1(attr.seed.seed != 0);
+
+ /* Unset attributes. */
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+
+ /* Set them. */
+ ok1(tdb_set_attribute(tdb, &tap_log_attr) == 0);
+ ok1(tdb_set_attribute(tdb, &lock_attr) == 0);
+ /* These should fail, each with one log message. */
+ ok1(tdb_set_attribute(tdb, &seed_attr) == TDB_ERR_EINVAL);
+ ok1(tap_log_messages == 1);
+ ok1(tdb_set_attribute(tdb, &hash_attr) == TDB_ERR_EINVAL);
+ ok1(tap_log_messages == 2);
+ tap_log_messages = 0;
+
+ /* Getting them should work as expected. */
+ attr.base.attr = TDB_ATTRIBUTE_LOG;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
+ ok1(attr.log.fn == tap_log_attr.log.fn);
+ ok1(attr.log.data == tap_log_attr.log.data);
+
+ attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
+ ok1(attr.flock.lock == mylock);
+ ok1(attr.flock.unlock == myunlock);
+ ok1(attr.flock.data == &lock_attr);
+
+ /* Unset them again. */
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+ ok1(tap_log_messages == 0);
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
+ ok1(tap_log_messages == 0);
+
+ tdb_close(tdb);
+ ok1(tap_log_messages == 0);
+
+ /* Now open with all attributes. */
+ tdb = tdb_open("run-90-get-set-attributes.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &hash_attr);
+ ok1(tdb);
+
+ /* Get will succeed */
+ attr.base.attr = TDB_ATTRIBUTE_LOG;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_LOG);
+ ok1(attr.log.fn == tap_log_attr.log.fn);
+ ok1(attr.log.data == tap_log_attr.log.data);
+
+ attr.base.attr = TDB_ATTRIBUTE_HASH;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_HASH);
+ ok1(attr.hash.fn == hash_fn);
+ ok1(attr.hash.data == &hash_attr);
+
+ attr.base.attr = TDB_ATTRIBUTE_FLOCK;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_FLOCK);
+ ok1(attr.flock.lock == mylock);
+ ok1(attr.flock.unlock == myunlock);
+ ok1(attr.flock.data == &lock_attr);
+
+ attr.base.attr = TDB_ATTRIBUTE_SEED;
+ ok1(tdb_get_attribute(tdb, &attr) == 0);
+ ok1(attr.base.attr == TDB_ATTRIBUTE_SEED);
+ ok1(attr.seed.seed == seed_attr.seed.seed);
+
+ /* Unset attributes: the test expects one log message each for
+  * the hash and seed attempts and none for flock and log. */
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_HASH);
+ ok1(tap_log_messages == 1);
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_SEED);
+ ok1(tap_log_messages == 2);
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK);
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_LOG);
+ ok1(tap_log_messages == 2);
+ tap_log_messages = 0;
+
+ tdb_close(tdb);
+
+ }
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-91-get-stats.c b/lib/tdb2/test/run-91-get-stats.c
new file mode 100644
index 00000000000..795dfd6602c
--- /dev/null
+++ b/lib/tdb2/test/run-91-get-stats.c
@@ -0,0 +1,59 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Reads TDB_ATTRIBUTE_STATS via tdb_get_attribute() with a full-size and a truncated buffer, once per flag set. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 11); /* 11 ok1() checks per flag set */
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ union tdb_attribute *attr;
+ struct tdb_data key = tdb_mkdata("key", 3);
+
+ tdb = tdb_open("run-91-get-stats.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ ok1(tdb_store(tdb, key, key, TDB_REPLACE) == 0); /* do some work first; the counters are asserted non-zero below */
+
+ /* Use malloc so valgrind will catch overruns. */
+ attr = malloc(sizeof *attr);
+ attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
+ attr->stats.size = sizeof(*attr); /* tell the library how much room the caller provides */
+
+ ok1(tdb_get_attribute(tdb, attr) == 0);
+ ok1(attr->stats.size == sizeof(*attr));
+ ok1(attr->stats.allocs > 0);
+ ok1(attr->stats.expands > 0);
+ ok1(attr->stats.locks > 0);
+ free(attr);
+
+ /* Try a short one: the buffer ends right after the allocs field. */
+ attr = malloc(offsetof(struct tdb_attribute_stats, allocs)
+ + sizeof(attr->stats.allocs));
+ attr->stats.base.attr = TDB_ATTRIBUTE_STATS;
+ attr->stats.size = offsetof(struct tdb_attribute_stats, allocs)
+ + sizeof(attr->stats.allocs);
+ ok1(tdb_get_attribute(tdb, attr) == 0);
+ ok1(attr->stats.size == sizeof(*attr)); /* size is expected to come back as the full size, not the short one passed in */
+ ok1(attr->stats.allocs > 0);
+ free(attr);
+ ok1(tap_log_messages == 0);
+
+ tdb_close(tdb);
+
+ }
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-add-remove-flags.c b/lib/tdb2/test/run-add-remove-flags.c
new file mode 100644
index 00000000000..1dc84636628
--- /dev/null
+++ b/lib/tdb2/test/run-add-remove-flags.c
@@ -0,0 +1,93 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Adds and then removes TDB_NOLOCK, TDB_NOMMAP and TDB_NOSYNC on an open tdb, checking flags and log output after each change. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(87); /* 2 TDB_INTERNAL runs * 9 checks + 4 other runs * 17 checks + 1 final check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-add-remove-flags.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ ok1(tdb_get_flags(tdb) == tdb->flags);
+ tap_log_messages = 0;
+ tdb_add_flag(tdb, TDB_NOLOCK);
+ if (flags[i] & TDB_INTERNAL) /* on a TDB_INTERNAL db every flag change is expected to log exactly one message */
+ ok1(tap_log_messages == 1);
+ else {
+ ok1(tap_log_messages == 0);
+ ok1(tdb_get_flags(tdb) & TDB_NOLOCK);
+ }
+
+ tap_log_messages = 0;
+ tdb_add_flag(tdb, TDB_NOMMAP);
+ if (flags[i] & TDB_INTERNAL)
+ ok1(tap_log_messages == 1);
+ else {
+ ok1(tap_log_messages == 0);
+ ok1(tdb_get_flags(tdb) & TDB_NOMMAP);
+ ok1(tdb->file->map_ptr == NULL); /* no mapping is expected once TDB_NOMMAP is set */
+ }
+
+ tap_log_messages = 0;
+ tdb_add_flag(tdb, TDB_NOSYNC);
+ if (flags[i] & TDB_INTERNAL)
+ ok1(tap_log_messages == 1);
+ else {
+ ok1(tap_log_messages == 0);
+ ok1(tdb_get_flags(tdb) & TDB_NOSYNC);
+ }
+
+ ok1(tdb_get_flags(tdb) == tdb->flags);
+
+ tap_log_messages = 0;
+ tdb_remove_flag(tdb, TDB_NOLOCK);
+ if (flags[i] & TDB_INTERNAL)
+ ok1(tap_log_messages == 1);
+ else {
+ ok1(tap_log_messages == 0);
+ ok1(!(tdb_get_flags(tdb) & TDB_NOLOCK));
+ }
+
+ tap_log_messages = 0;
+ tdb_remove_flag(tdb, TDB_NOMMAP);
+ if (flags[i] & TDB_INTERNAL)
+ ok1(tap_log_messages == 1);
+ else {
+ ok1(tap_log_messages == 0);
+ ok1(!(tdb_get_flags(tdb) & TDB_NOMMAP));
+ ok1(tdb->file->map_ptr != NULL); /* the mapping is expected back after TDB_NOMMAP is removed */
+ }
+
+ tap_log_messages = 0;
+ tdb_remove_flag(tdb, TDB_NOSYNC);
+ if (flags[i] & TDB_INTERNAL)
+ ok1(tap_log_messages == 1);
+ else {
+ ok1(tap_log_messages == 0);
+ ok1(!(tdb_get_flags(tdb) & TDB_NOSYNC));
+ }
+
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0); /* the counter was reset before the last flag change, so this covers that change and tdb_close() */
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-check-callback.c b/lib/tdb2/test/run-check-callback.c
new file mode 100644
index 00000000000..1e87436717f
--- /dev/null
+++ b/lib/tdb2/test/run-check-callback.c
@@ -0,0 +1,90 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+static bool store_records(struct tdb_context *tdb)
+{ /* Stores NUM_RECORDS records whose key and data are both the raw bytes of the loop index; returns false on the first failed store. */
+ int i;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* points at i, so it follows the loop counter */
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; /* same bytes as the key */
+
+ for (i = 0; i < NUM_RECORDS; i++)
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ return false;
+ return true;
+}
+
+static enum TDB_ERROR check(struct tdb_data key,
+ struct tdb_data data,
+ bool *array)
+{ /* tdb_check() callback: validates one record and marks array[value] as seen; returns TDB_ERR_CORRUPT on any mismatch or duplicate. */
+ int val;
+
+ if (key.dsize != sizeof(val)) {
+ diag("Wrong key size: %zu\n", (size_t)key.dsize); /* %zu plus cast: dsize is a size, not an unsigned int */
+ return TDB_ERR_CORRUPT;
+ }
+
+ if (key.dsize != data.dsize
+ || memcmp(key.dptr, data.dptr, sizeof(val)) != 0) {
+ diag("Key and data differ\n");
+ return TDB_ERR_CORRUPT;
+ }
+
+ memcpy(&val, key.dptr, sizeof(val)); /* copy out instead of casting dptr; its alignment is not known here */
+ if (val >= NUM_RECORDS || val < 0) { /* also keeps the array index below in bounds */
+ diag("check value %i\n", val);
+ return TDB_ERR_CORRUPT;
+ }
+
+ if (array[val]) {
+ diag("Value %i already seen\n", val);
+ return TDB_ERR_CORRUPT;
+ }
+
+ array[val] = true;
+ return TDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{ /* Stores NUM_RECORDS records, then runs tdb_check() with the check() callback and verifies every record was visited. */
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 1); /* 4 checks per flag set + the final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ bool array[NUM_RECORDS]; /* array[v] is set by check() when the record with value v is seen */
+
+ tdb = tdb_open("run-check-callback.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ ok1(store_records(tdb));
+ for (j = 0; j < NUM_RECORDS; j++)
+ array[j] = false;
+ ok1(tdb_check(tdb, check, array) == TDB_SUCCESS);
+ for (j = 0; j < NUM_RECORDS; j++) /* find the first record the callback never saw */
+ if (!array[j])
+ break;
+ ok1(j == NUM_RECORDS); /* the loop ran to the end, so nothing was missed */
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-expand-in-transaction.c b/lib/tdb2/test/run-expand-in-transaction.c
new file mode 100644
index 00000000000..49ba03c924a
--- /dev/null
+++ b/lib/tdb2/test/run-expand-in-transaction.c
@@ -0,0 +1,45 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Checks that a store inside a transaction grows map_size, and that the larger size is still in effect after commit. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT,
+ TDB_CONVERT|TDB_NOSYNC,
+ TDB_NOMMAP|TDB_CONVERT|TDB_NOSYNC };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data = tdb_mkdata("data", 4);
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); /* 7 checks per flag set + the final log check */
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ size_t size;
+ tdb = tdb_open("run-expand-in-transaction.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ size = tdb->file->map_size; /* size before the transaction, used as the baseline */
+ ok1(tdb_transaction_start(tdb) == 0);
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb->file->map_size > size); /* grown while the transaction is still open */
+ ok1(tdb_transaction_commit(tdb) == 0);
+ ok1(tdb->file->map_size > size); /* and still grown after the commit */
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-features.c b/lib/tdb2/test/run-features.c
new file mode 100644
index 00000000000..6d82dc308c9
--- /dev/null
+++ b/lib/tdb2/test/run-features.c
@@ -0,0 +1,70 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/summary.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Overwrites the header's feature fields, reopens the file, and checks features_offered is kept while features_used is masked. */
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; /* key and data both alias j */
+ struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 8 + 1); /* 8 checks per flag set + the final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ uint64_t features;
+ tdb = tdb_open("run-features.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* Put some stuff in there. */
+ for (j = 0; j < 100; j++) {
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ fail("Storing in tdb");
+ }
+
+ /* Mess with features fields in hdr. */
+ features = (~TDB_FEATURE_MASK ^ 1); /* complement of TDB_FEATURE_MASK with bit 0 toggled */
+ ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
+ features_used),
+ &features, sizeof(features)) == 0);
+ ok1(tdb_write_convert(tdb, offsetof(struct tdb_header,
+ features_offered),
+ &features, sizeof(features)) == 0);
+ tdb_close(tdb);
+
+ tdb = tdb_open("run-features.tdb", flags[i], O_RDWR, 0, /* reopen without O_CREAT/O_TRUNC so the modified header is read back */
+ &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* Should not have changed features offered. */
+ ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
+ features_offered),
+ &features, sizeof(features)) == 0);
+ ok1(features == (~TDB_FEATURE_MASK ^ 1));
+
+ /* Should have cleared unknown bits in features_used. */
+ ok1(tdb_read_convert(tdb, offsetof(struct tdb_header,
+ features_used),
+ &features, sizeof(features)) == 0);
+ ok1(features == (1 & TDB_FEATURE_MASK));
+
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-firstkey-nextkey.c b/lib/tdb2/test/run-firstkey-nextkey.c
new file mode 100644
index 00000000000..65a6090a96b
--- /dev/null
+++ b/lib/tdb2/test/run-firstkey-nextkey.c
@@ -0,0 +1,162 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+static bool store_records(struct tdb_context *tdb)
+{ /* Stores NUM_RECORDS records whose key and data are both the raw bytes of the loop index; returns false on the first failed store. */
+ int i;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* points at i, so it follows the loop counter */
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) }; /* same bytes as the key */
+
+ for (i = 0; i < NUM_RECORDS; i++)
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ return false;
+ return true;
+}
+
+struct trav_data { /* state filled in by trav() during tdb_traverse() */
+ unsigned int records[NUM_RECORDS]; /* record values in the order the traverse visited them */
+ unsigned int calls; /* number of trav() invocations so far; also the next free slot in records[] */
+};
+
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *p)
+{ /* tdb_traverse() callback: appends the record's data value to the struct trav_data passed as p; always returns 0. */
+ struct trav_data *td = p;
+ int val;
+
+ memcpy(&val, dbuf.dptr, dbuf.dsize); /* NOTE(review): copies dsize bytes; assumes dsize <= sizeof(val), true for records written by store_records() */
+ td->records[td->calls++] = val; /* NOTE(review): no bounds check; assumes at most NUM_RECORDS callbacks */
+ return 0;
+}
+
+/* Since tdb_nextkey frees dptr, we need to clone it: returns key with dptr replaced by a malloc'ed copy of the same bytes. */
+static TDB_DATA dup_key(TDB_DATA key)
+{
+ void *p = malloc(key.dsize); /* NOTE(review): result is not checked for NULL before the copy */
+ memcpy(p, key.dptr, key.dsize);
+ key.dptr = p; /* key is a by-value copy, so the caller's struct is untouched */
+ return key;
+}
+
+int main(int argc, char *argv[])
+{ /* Exercises tdb_firstkey()/tdb_nextkey() on empty, one-, two- and NUM_RECORDS-entry databases and compares the order with tdb_traverse(). */
+ unsigned int i, j;
+ int num;
+ struct trav_data td;
+ TDB_DATA k;
+ struct tdb_context *tdb;
+ union tdb_attribute seed_attr;
+ enum TDB_ERROR ecode;
+
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+ seed_attr.base.next = &tap_log_attr; /* chain the logging attribute behind the seed */
+ seed_attr.seed.seed = 6334326220117065685ULL; /* fixed seed, the same on every run */
+
+ plan_tests(sizeof(flags) / sizeof(flags[0])
+ * (NUM_RECORDS*6 + (NUM_RECORDS-1)*3 + 22) + 1);
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-firstkey-nextkey.tdb", flags[i], /* own file name, so this test cannot clash with run-traverse */
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &seed_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ ok1(tdb_firstkey(tdb, &k) == TDB_ERR_NOEXIST);
+
+ /* One entry... */
+ k.dptr = (unsigned char *)&num;
+ k.dsize = sizeof(num);
+ num = 0;
+ ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
+ ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
+ ok1(k.dsize == sizeof(num));
+ ok1(memcmp(k.dptr, &num, sizeof(num)) == 0);
+ ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
+
+ /* Two entries. */
+ k.dptr = (unsigned char *)&num;
+ k.dsize = sizeof(num);
+ num = 1;
+ ok1(tdb_store(tdb, k, k, TDB_INSERT) == 0);
+ ok1(tdb_firstkey(tdb, &k) == TDB_SUCCESS);
+ ok1(k.dsize == sizeof(num));
+ memcpy(&num, k.dptr, sizeof(num));
+ ok1(num == 0 || num == 1); /* either key may come back first */
+ ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
+ ok1(k.dsize == sizeof(j));
+ memcpy(&j, k.dptr, sizeof(j));
+ ok1(j == 0 || j == 1);
+ ok1(j != num);
+ ok1(tdb_nextkey(tdb, &k) == TDB_ERR_NOEXIST);
+
+ /* Clean up. */
+ k.dptr = (unsigned char *)&num;
+ k.dsize = sizeof(num);
+ num = 0;
+ ok1(tdb_delete(tdb, k) == 0);
+ num = 1;
+ ok1(tdb_delete(tdb, k) == 0);
+
+ /* Now lots of records. */
+ ok1(store_records(tdb));
+ td.calls = 0;
+
+ num = tdb_traverse(tdb, trav, &td);
+ ok1(num == NUM_RECORDS);
+ ok1(td.calls == NUM_RECORDS);
+
+ /* Simple loop should match tdb_traverse */
+ for (j = 0, ecode = tdb_firstkey(tdb, &k); j < td.calls; j++) {
+ int val;
+
+ ok1(ecode == TDB_SUCCESS);
+ ok1(k.dsize == sizeof(val));
+ memcpy(&val, k.dptr, k.dsize);
+ ok1(td.records[j] == val);
+ ecode = tdb_nextkey(tdb, &k);
+ }
+
+ /* But arbitrary orderings should work too. */
+ for (j = td.calls-1; j > 0; j--) {
+ k.dptr = (unsigned char *)&td.records[j-1];
+ k.dsize = sizeof(td.records[j-1]);
+ k = dup_key(k); /* heap copy, because tdb_nextkey() frees the key it is given */
+ ok1(tdb_nextkey(tdb, &k) == TDB_SUCCESS);
+ ok1(k.dsize == sizeof(td.records[j]));
+ ok1(memcmp(k.dptr, &td.records[j], k.dsize) == 0);
+ free(k.dptr);
+ }
+
+ /* Even delete should work. */
+ for (j = 0, ecode = tdb_firstkey(tdb, &k);
+ ecode != TDB_ERR_NOEXIST;
+ j++) {
+ ok1(ecode == TDB_SUCCESS);
+ ok1(k.dsize == sizeof(num)); /* keys were stored as int-sized values, so do not hard-code 4 */
+ ok1(tdb_delete(tdb, k) == 0);
+ ecode = tdb_nextkey(tdb, &k);
+ }
+
+ diag("delete using first/nextkey gave %u of %u records",
+ j, NUM_RECORDS);
+ ok1(j == NUM_RECORDS);
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-fork-test.c b/lib/tdb2/test/run-fork-test.c
new file mode 100644
index 00000000000..e9813e0a0f0
--- /dev/null
+++ b/lib/tdb2/test/run-fork-test.c
@@ -0,0 +1,180 @@
+/* Test forking while holding lock.
+ *
+ * There are only five ways to do this currently:
+ * (1) grab a tdb_chainlock, then fork.
+ * (2) grab a tdb_lockall, then fork.
+ * (3) grab a tdb_lockall_read, then fork.
+ * (4) start a transaction, then fork.
+ * (5) fork from inside a tdb_parse() callback.
+ *
+ * Note that we don't hold a lock across tdb_traverse callbacks, so
+ * that doesn't matter.
+ */
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "logging.h"
+
+static enum TDB_ERROR fork_in_parse(TDB_DATA key, TDB_DATA data,
+ struct tdb_context *tdb)
+{ /* tdb_parse_record() callback: forks, has the child verify that tdb operations fail with TDB_ERR_LOCK, then records the child's exit status as a test result. */
+ int status;
+
+ if (fork() == 0) {
+ /* We expect this to fail. */
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+ exit(1);
+
+ if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+ exit(1);
+
+ if (tap_log_messages != 2) /* one message expected per failed operation above */
+ exit(2);
+
+ tdb_close(tdb);
+ if (tap_log_messages != 2) /* closing in the child must not log anything further */
+ exit(3);
+ exit(0);
+ }
+ wait(&status); /* parent: collect the child's verdict */
+ ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ return TDB_SUCCESS;
+}
+
+int main(int argc, char *argv[])
+{ /* For each way of holding a lock (chainlock, lockall, lockall_read, transaction, parse callback), forks and checks the child gets TDB_ERR_LOCK. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data = tdb_mkdata("data", 4);
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 14); /* 13 checks here + 1 inside fork_in_parse() per flag set */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ int status;
+
+ tap_log_messages = 0;
+
+ tdb = tdb_open("run-fork-test.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ if (!ok1(tdb))
+ continue;
+
+ /* Put a record in here. */
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_SUCCESS);
+
+ ok1(tdb_chainlock(tdb, key) == TDB_SUCCESS);
+ if (fork() == 0) { /* child: its return value from main() becomes the exit status the parent checks */
+ /* We expect this to fail. */
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tap_log_messages != 2)
+ return 2;
+
+ tdb_chainunlock(tdb, key);
+ if (tap_log_messages != 3) /* the unlock in the child is expected to log one more message */
+ return 3;
+ tdb_close(tdb);
+ if (tap_log_messages != 3)
+ return 4;
+ return 0;
+ }
+ wait(&status);
+ ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ tdb_chainunlock(tdb, key);
+
+ ok1(tdb_lockall(tdb) == TDB_SUCCESS);
+ if (fork() == 0) {
+ /* We expect this to fail. */
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tap_log_messages != 2)
+ return 2;
+
+ tdb_unlockall(tdb);
+ if (tap_log_messages != 2) /* unlike the chainlock case, no extra message is expected here */
+ return 3;
+ tdb_close(tdb);
+ if (tap_log_messages != 2)
+ return 4;
+ return 0;
+ }
+ wait(&status);
+ ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ tdb_unlockall(tdb);
+
+ ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
+ if (fork() == 0) {
+ /* We expect this to fail. */
+ /* This would always fail anyway... */
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tap_log_messages != 2)
+ return 2;
+
+ tdb_unlockall_read(tdb);
+ if (tap_log_messages != 2)
+ return 3;
+ tdb_close(tdb);
+ if (tap_log_messages != 2)
+ return 4;
+ return 0;
+ }
+ wait(&status);
+ ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ tdb_unlockall_read(tdb);
+
+ ok1(tdb_transaction_start(tdb) == TDB_SUCCESS);
+ /* If the transaction is empty, a no-op "commit" succeeds, so change something first. */
+ ok1(tdb_delete(tdb, key) == TDB_SUCCESS);
+ if (fork() == 0) {
+ /* We expect this to fail. */
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tdb_fetch(tdb, key, &data) != TDB_ERR_LOCK)
+ return 1;
+
+ if (tap_log_messages != 2)
+ return 2;
+
+ if (tdb_transaction_commit(tdb) != TDB_ERR_LOCK)
+ return 3;
+
+ tdb_close(tdb);
+ if (tap_log_messages < 3) /* only a lower bound is checked after the failed commit */
+ return 4;
+ return 0;
+ }
+ wait(&status);
+ ok1(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ tdb_transaction_cancel(tdb); /* parent abandons the transaction, including the delete above */
+
+ ok1(tdb_parse_record(tdb, key, fork_in_parse, tdb)
+ == TDB_SUCCESS);
+ tdb_close(tdb);
+ ok1(tap_log_messages == 0); /* all logging happened in the children; the parent's counter stays at 0 */
+ }
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-lockall.c b/lib/tdb2/test/run-lockall.c
new file mode 100644
index 00000000000..4aedf59743d
--- /dev/null
+++ b/lib/tdb2/test/run-lockall.c
@@ -0,0 +1,80 @@
+#include "config.h"
+#include <unistd.h>
+#include "lock-tracking.h"
+
+#define fcntl fcntl_with_lockcheck
+
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <err.h>
+#include "external-agent.h"
+#include "logging.h"
+
+#define TEST_DBNAME "run-lockall.tdb"
+
+#undef fcntl
+
+int main(int argc, char *argv[])
+{ /* Holds tdb_lockall()/tdb_lockall_read() and checks, via an external agent, which of its store/fetch operations would block. */
+ struct agent *agent;
+ const int flags[] = { TDB_DEFAULT,
+ TDB_NOMMAP,
+ TDB_CONVERT,
+ TDB_CONVERT | TDB_NOMMAP };
+ int i;
+
+ plan_tests(13 * sizeof(flags)/sizeof(flags[0]) + 1); /* 13 checks per flag set + the final log check */
+ agent = prepare_external_agent();
+ if (!agent)
+ err(1, "preparing agent");
+
+ for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+ enum agent_return ret;
+ struct tdb_context *tdb;
+
+ tdb = tdb_open(TEST_DBNAME, flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+
+ ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+ ok1(ret == SUCCESS);
+
+ ok1(tdb_lockall(tdb) == TDB_SUCCESS);
+ ok1(external_agent_operation(agent, STORE, "key")
+ == WOULD_HAVE_BLOCKED);
+ ok1(external_agent_operation(agent, FETCH, "key")
+ == WOULD_HAVE_BLOCKED);
+ /* Test nesting. */
+ ok1(tdb_lockall(tdb) == TDB_SUCCESS);
+ tdb_unlockall(tdb); /* one unlock per nested lock */
+ tdb_unlockall(tdb);
+
+ ok1(external_agent_operation(agent, STORE, "key") == SUCCESS); /* with both locks released the agent can store */
+
+ ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
+ ok1(external_agent_operation(agent, STORE, "key")
+ == WOULD_HAVE_BLOCKED);
+ ok1(external_agent_operation(agent, FETCH, "key") == SUCCESS); /* under the read lock the agent's fetch is expected to succeed */
+ ok1(tdb_lockall_read(tdb) == TDB_SUCCESS);
+ tdb_unlockall_read(tdb);
+ tdb_unlockall_read(tdb);
+
+ ok1(external_agent_operation(agent, STORE, "key") == SUCCESS);
+ ok1(external_agent_operation(agent, CLOSE, NULL) == SUCCESS);
+ tdb_close(tdb);
+ }
+
+ free_external_agent(agent);
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-locktimeout.c b/lib/tdb2/test/run-locktimeout.c
new file mode 100644
index 00000000000..bb5b5db29b4
--- /dev/null
+++ b/lib/tdb2/test/run-locktimeout.c
@@ -0,0 +1,192 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+#include "external-agent.h"
+
+#undef alarm
+#define alarm fast_alarm
+
+/* Speed things up by doing things in milliseconds: alarm() replacement that arms ITIMER_REAL for milli_seconds ms (0 disarms); always returns 0. */
+static unsigned int fast_alarm(unsigned int milli_seconds)
+{
+ struct itimerval it;
+
+ it.it_interval.tv_sec = it.it_interval.tv_usec = 0; /* one-shot: no reload interval */
+ it.it_value.tv_sec = milli_seconds / 1000;
+ it.it_value.tv_usec = (milli_seconds % 1000) * 1000; /* sub-second part only; tv_usec must stay below 1000000 or setitimer() fails */
+ setitimer(ITIMER_REAL, &it, NULL);
+ return 0;
+}
+
+#define CatchSignal(sig, handler) signal((sig), (handler))
+
+static void do_nothing(int signum)
+{ /* Empty handler; main() installs it for SIGUSR1 before asking the agent to send a signal. */
+}
+
+/* This example code is taken from SAMBA, so try not to change it. */
+static struct flock flock_struct; /* file-scope so the SIGALRM handler can overwrite the request that timeout_lock() passes to fcntl() */
+
+/* Return a value which is none of v1, v2 or v3. */
+static inline short int invalid_value(short int v1, short int v2, short int v3)
+{
+ short int try = (v1+v2+v3)^((v1+v2+v3) << 16); /* starting guess; any value works because the loop below fixes collisions */
+ while (try == v1 || try == v2 || try == v3)
+ try++;
+ return try;
+}
+
+/* SIGALRM handler: we invalidate flock_struct in as many ways as we can, so the OS rejects it. */
+static void invalidate_flock_struct(int signum)
+{
+ flock_struct.l_type = invalid_value(F_RDLCK, F_WRLCK, F_UNLCK);
+ flock_struct.l_whence = invalid_value(SEEK_SET, SEEK_CUR, SEEK_END);
+ flock_struct.l_start = -1;
+ /* A large negative. */
+ flock_struct.l_len = (((off_t)1 << (sizeof(off_t)*CHAR_BIT - 1)) + 1); /* timeout_lock() detects the timeout by seeing l_len changed */
+}
+
+static int timeout_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
+ void *_timeout)
+{ /* Lock callback: fcntl() lock of type rw on [off, off+len) that gives up after *_timeout; returns fcntl()'s result with errno set as described below. */
+ int ret, saved_errno = errno; /* on success errno is restored to its value on entry */
+ unsigned int timeout = *(unsigned int *)_timeout;
+
+ flock_struct.l_type = rw;
+ flock_struct.l_whence = SEEK_SET;
+ flock_struct.l_start = off;
+ flock_struct.l_len = len;
+
+ CatchSignal(SIGALRM, invalidate_flock_struct);
+ alarm(timeout); /* alarm is #defined to fast_alarm in this file, so the unit is milliseconds */
+
+ for (;;) {
+ if (waitflag)
+ ret = fcntl(fd, F_SETLKW, &flock_struct);
+ else
+ ret = fcntl(fd, F_SETLK, &flock_struct);
+
+ if (ret == 0)
+ break;
+
+ /* Not signalled? Something else went wrong. */
+ if (flock_struct.l_len == len) { /* l_len untouched means the SIGALRM handler has not run */
+ if (errno == EAGAIN || errno == EINTR) /* retry on contention or an unrelated signal */
+ continue;
+ saved_errno = errno;
+ break;
+ } else {
+ saved_errno = EINTR; /* the handler ran, so report the timeout as EINTR */
+ break;
+ }
+ }
+
+ alarm(0); /* cancel any pending timer */
+ errno = saved_errno;
+ return ret;
+}
+
+static int tdb_chainlock_with_timeout_internal(struct tdb_context *tdb,
+ TDB_DATA key,
+ unsigned int timeout,
+ int rw_type)
+{ /* Takes a read (rw_type == F_RDLCK) or write chain lock on key; a non-zero timeout installs timeout_lock() for the duration of the call. Returns the TDB error code. */
+ union tdb_attribute locking;
+ enum TDB_ERROR ecode;
+
+ if (timeout) {
+ locking.base.attr = TDB_ATTRIBUTE_FLOCK;
+ ecode = tdb_get_attribute(tdb, &locking); /* start from the current settings so the unlock function is kept */
+ if (ecode != TDB_SUCCESS)
+ return ecode;
+
+ /* Replace locking function with our own. */
+ locking.flock.data = &timeout; /* timeout_lock() reads the timeout through this pointer */
+ locking.flock.lock = timeout_lock;
+
+ ecode = tdb_set_attribute(tdb, &locking);
+ if (ecode != TDB_SUCCESS)
+ return ecode;
+ }
+ if (rw_type == F_RDLCK)
+ ecode = tdb_chainlock_read(tdb, key);
+ else
+ ecode = tdb_chainlock(tdb, key);
+
+ if (timeout) {
+ tdb_unset_attribute(tdb, TDB_ATTRIBUTE_FLOCK); /* remove our lock attribute again; &timeout must not outlive this call */
+ }
+ return ecode;
+}
+
+int main(int argc, char *argv[])
+{ /* Checks that chain locks with a timeout succeed when uncontended and fail with TDB_ERR_LOCK while an external agent holds a transaction. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ TDB_DATA key = tdb_mkdata("hello", 5);
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+ struct agent *agent;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 15); /* 15 checks per flag set */
+
+ agent = prepare_external_agent(); /* NOTE(review): result is not checked for NULL here */
+
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ enum TDB_ERROR ecode;
+ tdb = tdb_open("run-locktimeout.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ if (!ok1(tdb))
+ break;
+
+ /* Simple cases: should succeed. */
+ ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+ F_RDLCK);
+ ok1(ecode == TDB_SUCCESS);
+ ok1(tap_log_messages == 0);
+
+ tdb_chainunlock_read(tdb, key);
+ ok1(tap_log_messages == 0);
+
+ ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+ F_WRLCK);
+ ok1(ecode == TDB_SUCCESS);
+ ok1(tap_log_messages == 0);
+
+ tdb_chainunlock(tdb, key);
+ ok1(tap_log_messages == 0);
+
+ /* OK, get agent to start transaction, then we should time out. */
+ ok1(external_agent_operation(agent, OPEN, "run-locktimeout.tdb")
+ == SUCCESS);
+ ok1(external_agent_operation(agent, TRANSACTION_START, "")
+ == SUCCESS);
+ ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+ F_WRLCK);
+ ok1(ecode == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0); /* a timed-out lock is expected to fail without logging */
+
+ /* Even if we get a different signal, should be fine. */
+ CatchSignal(SIGUSR1, do_nothing);
+ external_agent_operation(agent, SEND_SIGNAL, "");
+ ecode = tdb_chainlock_with_timeout_internal(tdb, key, 20,
+ F_WRLCK);
+ ok1(ecode == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 0);
+
+ ok1(external_agent_operation(agent, TRANSACTION_COMMIT, "")
+ == SUCCESS);
+ ok1(external_agent_operation(agent, CLOSE, "")
+ == SUCCESS);
+ tdb_close(tdb);
+ }
+ free_external_agent(agent);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-missing-entries.c b/lib/tdb2/test/run-missing-entries.c
new file mode 100644
index 00000000000..e99572f64c1
--- /dev/null
+++ b/lib/tdb2/test/run-missing-entries.c
@@ -0,0 +1,48 @@
+/* Another test revealed that we lost an entry. This reproduces it. */
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1189
+
+/* We use the same seed which we saw this failure on: hash callback that ignores the seed passed in and always hashes with a fixed one. */
+static uint64_t failhash(const void *key, size_t len, uint64_t seed, void *p)
+{
+ seed = 699537674708983027ULL; /* overwrite the seed argument */
+ return hash64_stable((const unsigned char *)key, len, seed);
+}
+
+int main(int argc, char *argv[])
+{ /* Stores NUM_RECORDS integer records into a TDB_INTERNAL db using failhash(), running tdb_check() after every store. */
+ int i;
+ struct tdb_context *tdb;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* key and data both alias the loop counter */
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
+ union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = failhash } };
+
+ hattr.base.next = &tap_log_attr; /* chain the logging attribute behind the hash attribute */
+ plan_tests(1 + 2 * NUM_RECORDS + 1); /* open + (store, check) per record + final log check */
+
+ tdb = tdb_open("run-missing-entries.tdb", TDB_INTERNAL,
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+ ok1(tdb);
+ if (tdb) {
+ for (i = 0; i < NUM_RECORDS; i++) {
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ }
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-open-multiple-times.c b/lib/tdb2/test/run-open-multiple-times.c
new file mode 100644
index 00000000000..240828df16b
--- /dev/null
+++ b/lib/tdb2/test/run-open-multiple-times.c
@@ -0,0 +1,84 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Opens the same file through two contexts in one process and checks data visibility, close ordering and lock failures between them. */
+ unsigned int i;
+ struct tdb_context *tdb, *tdb2;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* key and data alias the loop index i */
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
+ struct tdb_data d = { NULL, 0 }; /* Bogus GCC warning */
+ int flags[] = { TDB_DEFAULT, TDB_NOMMAP,
+ TDB_CONVERT, TDB_NOMMAP|TDB_CONVERT };
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 28); /* 28 checks per flag set */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+ tdb2 = tdb_open("run-open-multiple-times.tdb", flags[i], /* NOTE(review): tdb2 is used below without a NULL check */
+ O_RDWR|O_CREAT, 0600, &tap_log_attr);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb_check(tdb2, NULL, NULL) == 0);
+
+ /* Store in one, fetch in the other. */
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == 0);
+ ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
+ ok1(tdb_deq(d, data));
+ free(d.dptr);
+
+ /* Vice versa, with delete. */
+ ok1(tdb_delete(tdb2, key) == 0);
+ ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_NOEXIST);
+
+ /* OK, now close first one, check second still good. */
+ ok1(tdb_close(tdb) == 0);
+
+ ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == 0);
+ ok1(tdb_fetch(tdb2, key, &d) == TDB_SUCCESS);
+ ok1(tdb_deq(d, data));
+ free(d.dptr);
+
+ /* Reopen */
+ tdb = tdb_open("run-open-multiple-times.tdb", flags[i],
+ O_RDWR|O_CREAT, 0600, &tap_log_attr);
+ ok1(tdb);
+
+ ok1(tdb_transaction_start(tdb2) == 0);
+
+ /* Anything in the other one should fail. */
+ ok1(tdb_fetch(tdb, key, &d) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 1); /* each failed operation is expected to log exactly one message */
+ ok1(tdb_store(tdb, key, data, TDB_REPLACE) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 2);
+ ok1(tdb_transaction_start(tdb) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 3);
+ ok1(tdb_chainlock(tdb, key) == TDB_ERR_LOCK);
+ ok1(tap_log_messages == 4);
+
+ /* Transaction should work as normal. */
+ ok1(tdb_store(tdb2, key, data, TDB_REPLACE) == TDB_SUCCESS);
+
+ /* Now... try closing with locks held. */
+ ok1(tdb_close(tdb2) == 0); /* closes tdb2 with its transaction still open */
+
+ ok1(tdb_fetch(tdb, key, &d) == TDB_SUCCESS);
+ ok1(tdb_deq(d, data));
+ free(d.dptr);
+ ok1(tdb_close(tdb) == 0);
+ ok1(tap_log_messages == 4);
+ tap_log_messages = 0; /* reset so the == 1..4 checks work on the next flag set */
+ }
+
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-record-expand.c b/lib/tdb2/test/run-record-expand.c
new file mode 100644
index 00000000000..109a099278b
--- /dev/null
+++ b/lib/tdb2/test/run-record-expand.c
@@ -0,0 +1,53 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define MAX_SIZE 10000
+#define SIZE_STEP 131
+
+int main(int argc, char *argv[])
+{ /* Repeatedly overwrites a single record with growing data (0 up to MAX_SIZE in SIZE_STEP increments), running tdb_check() after each store. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data;
+
+ data.dptr = malloc(MAX_SIZE); /* NOTE(review): result is not checked for NULL before the memset */
+ memset(data.dptr, 0x24, MAX_SIZE);
+
+ plan_tests(sizeof(flags) / sizeof(flags[0])
+ * (3 + (1 + (MAX_SIZE/SIZE_STEP)) * 2) + 1); /* 3 setup checks + 2 per size step, per flag set, + final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-record-expand.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ data.dsize = 0; /* start from an empty record */
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ for (data.dsize = 0;
+ data.dsize < MAX_SIZE;
+ data.dsize += SIZE_STEP) {
+ memset(data.dptr, data.dsize, data.dsize); /* fill byte is the size converted to unsigned char, so contents change with every step */
+ ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ }
+ tdb_close(tdb);
+ }
+ ok1(tap_log_messages == 0);
+ free(data.dptr);
+
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-remap-in-read_traverse.c b/lib/tdb2/test/run-remap-in-read_traverse.c
new file mode 100644
index 00000000000..d784ca34074
--- /dev/null
+++ b/lib/tdb2/test/run-remap-in-read_traverse.c
@@ -0,0 +1,65 @@
+/* We had a bug where we marked the tdb read-only for a tdb_traverse_read.
+ * If we then expanded the tdb, we would remap read-only, and later SEGV. */
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static bool file_larger(int fd, tdb_len_t size)
+{	/* True once the file behind fd has grown beyond size bytes. */
+	struct stat st;
+
+	/* If fstat() fails, answer true so the caller's grow loop still ends. */
+	return fstat(fd, &st) != 0 || (tdb_len_t)st.st_size > size;
+}
+
+static unsigned add_records_to_grow(struct agent *agent, int fd, tdb_len_t size)
+{	/* Have the agent STORE "0", "1", ... until file_larger(); returns the count, or 0 on failure. */
+	unsigned int i;
+
+	for (i = 0; !file_larger(fd, size); i++) {
+		char data[20];
+		snprintf(data, sizeof(data), "%u", i); /* i is unsigned: %u, bounded write */
+		if (external_agent_operation(agent, STORE, data) != SUCCESS)
+			return 0;
+	}
+	diag("Added %u records to grow file", i);
+	return i;
+}
+
+int main(int argc, char *argv[])
+{ /* Regression test for the remap bug described in the comment at the top of this file. */
+ unsigned int i;
+ struct agent *agent;
+ struct tdb_context *tdb;
+ struct tdb_data d = tdb_mkdata("hello", 5);
+ const char filename[] = "run-remap-in-read_traverse.tdb";
+
+ plan_tests(4);
+
+ agent = prepare_external_agent();
+
+ tdb = tdb_open(filename, TDB_DEFAULT,
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr); /* NOTE(review): tdb is dereferenced below without a NULL check */
+
+ ok1(external_agent_operation(agent, OPEN, filename) == SUCCESS); /* the agent opens the same file */
+ i = add_records_to_grow(agent, tdb->file->fd, tdb->file->map_size); /* agent stores until the on-disk size differs from our map_size */
+
+ /* Do a traverse. */
+ ok1(tdb_traverse(tdb, NULL, NULL) == i); /* must count exactly the records the agent added */
+
+ /* Now store something! */
+ ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
+ ok1(tap_log_messages == 0);
+ tdb_close(tdb);
+ free_external_agent(agent);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-seed.c b/lib/tdb2/test/run-seed.c
new file mode 100644
index 00000000000..a9b370b6e55
--- /dev/null
+++ b/lib/tdb2/test/run-seed.c
@@ -0,0 +1,67 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+static int log_count = 0;
+
+/* Normally we get a log when setting random seed. */
+static void my_log_fn(struct tdb_context *tdb,
+ enum tdb_log_level level,
+ const char *message, void *priv)
+{ /* Log callback: only counts invocations in log_count; all arguments are ignored. */
+ log_count++;
+}
+
+static union tdb_attribute log_attr = { /* Log attribute that routes messages to my_log_fn. */
+ .log = { .base = { .attr = TDB_ATTRIBUTE_LOG },
+ .fn = my_log_fn }
+};
+
+int main(int argc, char *argv[])
+{ /* Open with an explicit seed attribute of 42 and check it lands in the context and in the file header. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ union tdb_attribute attr;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+
+ attr.seed.base.attr = TDB_ATTRIBUTE_SEED;
+ attr.seed.base.next = &log_attr; /* chain the counting logger behind the seed attribute */
+ attr.seed.seed = 42;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 4 + 4 * 3); /* 4 checks per flag set, plus 3 header checks for each of the 4 sets without TDB_INTERNAL */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ struct tdb_header hdr;
+ int fd;
+ tdb = tdb_open("run-seed.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tdb->hash_seed == 42);
+ ok1(log_count == 0); /* supplying the seed ourselves must not produce a log call */
+ tdb_close(tdb);
+
+ if (flags[i] & TDB_INTERNAL) /* presumably nothing is written to disk for TDB_INTERNAL -- confirm */
+ continue;
+
+ fd = open("run-seed.tdb", O_RDONLY); /* re-read the raw header straight from the file */
+ ok1(fd >= 0);
+ ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
+ if (flags[i] & TDB_CONVERT) /* under TDB_CONVERT the stored seed is expected byte-swapped */
+ ok1(bswap_64(hdr.hash_seed) == 42);
+ else
+ ok1(hdr.hash_seed == 42);
+ close(fd);
+ }
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-simple-delete.c b/lib/tdb2/test/run-simple-delete.c
new file mode 100644
index 00000000000..d06bf2d2bd1
--- /dev/null
+++ b/lib/tdb2/test/run-simple-delete.c
@@ -0,0 +1,42 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Delete on an empty db must fail, then insert + delete must succeed, under every flag set. */
+ unsigned int i;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = tdb_mkdata("key", 3);
+ struct tdb_data data = tdb_mkdata("data", 4);
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 7 + 1); /* 7 checks per flag set, +1 for the final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-simple-delete.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (tdb) {
+ /* Delete should fail. */
+ ok1(tdb_delete(tdb, key) == TDB_ERR_NOEXIST);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ /* Insert should succeed. */
+ ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ /* Delete should now work. */
+ ok1(tdb_delete(tdb, key) == 0);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ tdb_close(tdb);
+ }
+ }
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-summary.c b/lib/tdb2/test/run-summary.c
new file mode 100644
index 00000000000..c92e7593738
--- /dev/null
+++ b/lib/tdb2/test/run-summary.c
@@ -0,0 +1,60 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/summary.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{ /* Store 500 records, then check the text produced by tdb_summary() with and without histograms. */
+ unsigned int i, j;
+ struct tdb_context *tdb;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ struct tdb_data key = { (unsigned char *)&j, sizeof(j) }; /* key and data both point at j */
+ struct tdb_data data = { (unsigned char *)&j, sizeof(j) };
+ char *summary;
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * (1 + 2 * 5) + 1); /* open, then 5 checks for each of 2 summaries; +1 final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-summary.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ /* Put some stuff in there. */
+ for (j = 0; j < 500; j++) {
+ /* Make sure padding varies so we get some graphs! */
+ data.dsize = j % (sizeof(j) + 1); /* data length cycles through 0..sizeof(j) */
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ fail("Storing in tdb");
+ }
+
+ for (j = 0; /* first pass with flags 0, second with TDB_SUMMARY_HISTOGRAMS */
+ j <= TDB_SUMMARY_HISTOGRAMS;
+ j += TDB_SUMMARY_HISTOGRAMS) {
+ ok1(tdb_summary(tdb, j, &summary) == TDB_SUCCESS); /* NOTE(review): summary is used below even if this fails */
+ ok1(strstr(summary, "Number of records: 500\n"));
+ ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));
+ ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
+ if (j == TDB_SUMMARY_HISTOGRAMS)
+ ok1(strstr(summary, "|") /* histogram output is expected to contain '|' and '*' */
+ && strstr(summary, "*"));
+ else
+ ok1(!strstr(summary, "|")
+ && !strstr(summary, "*"));
+ free(summary);
+ }
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-tdb_errorstr.c b/lib/tdb2/test/run-tdb_errorstr.c
new file mode 100644
index 00000000000..27bdfcd67cf
--- /dev/null
+++ b/lib/tdb2/test/run-tdb_errorstr.c
@@ -0,0 +1,59 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char *argv[])
+{ /* Check the exact string tdb_errorstr() returns for each error code and for one out-of-range code. */
+ enum TDB_ERROR err;
+ plan_tests(TDB_ERR_RDONLY*-1 + 2); /* one test per code from TDB_SUCCESS down to TDB_ERR_RDONLY, plus the invalid-code test */
+
+ for (err = TDB_SUCCESS; err >= TDB_ERR_RDONLY; err--) { /* walks downward from TDB_SUCCESS */
+ switch (err) {
+ case TDB_SUCCESS:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Success"));
+ break;
+ case TDB_ERR_IO:
+ ok1(!strcmp(tdb_errorstr(err),
+ "IO Error"));
+ break;
+ case TDB_ERR_LOCK:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Locking error"));
+ break;
+ case TDB_ERR_OOM:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Out of memory"));
+ break;
+ case TDB_ERR_EXISTS:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Record exists"));
+ break;
+ case TDB_ERR_EINVAL:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Invalid parameter"));
+ break;
+ case TDB_ERR_NOEXIST:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Record does not exist"));
+ break;
+ case TDB_ERR_RDONLY:
+ ok1(!strcmp(tdb_errorstr(err),
+ "write not permitted"));
+ break;
+ case TDB_ERR_CORRUPT:
+ ok1(!strcmp(tdb_errorstr(err),
+ "Corrupt database"));
+ break;
+ }
+ }
+ ok1(!strcmp(tdb_errorstr(err), "Invalid error code")); /* err is now one below TDB_ERR_RDONLY */
+
+ return exit_status();
+}
diff --git a/lib/tdb2/test/run-traverse.c b/lib/tdb2/test/run-traverse.c
new file mode 100644
index 00000000000..f973d95d0f6
--- /dev/null
+++ b/lib/tdb2/test/run-traverse.c
@@ -0,0 +1,211 @@
+#include <ccan/tdb2/tdb.c>
+#include <ccan/tdb2/open.c>
+#include <ccan/tdb2/free.c>
+#include <ccan/tdb2/lock.c>
+#include <ccan/tdb2/io.c>
+#include <ccan/tdb2/hash.c>
+#include <ccan/tdb2/check.c>
+#include <ccan/tdb2/traverse.c>
+#include <ccan/tdb2/transaction.c>
+#include <ccan/tap/tap.h>
+#include "logging.h"
+
+#define NUM_RECORDS 1000
+
+/* We use the same seed which we saw a failure on. */
+static uint64_t fixedhash(const void *key, size_t len, uint64_t seed, void *p)
+{ /* Hash callback: ignores the seed argument and hashes with the uint64_t that p points to. */
+ return hash64_stable((const unsigned char *)key, len,
+ *(uint64_t *)p);
+}
+
+static bool store_records(struct tdb_context *tdb)
+{ /* Store NUM_RECORDS records whose key and value are both the int i; false on the first failed store. */
+ int i;
+ struct tdb_data key = { (unsigned char *)&i, sizeof(i) }; /* key and data alias i, so they follow the loop counter */
+ struct tdb_data data = { (unsigned char *)&i, sizeof(i) };
+
+ for (i = 0; i < NUM_RECORDS; i++)
+ if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+ return false;
+ return true;
+}
+
+struct trav_data { /* State shared between main() and the trav() callback. */
+ unsigned int calls, call_limit; /* callbacks so far; trav() returns 1 when calls reaches call_limit */
+ int low, high; /* smallest and largest record value seen */
+ bool mismatch; /* set when a record fails the key == data check */
+ bool delete; /* when true, trav() deletes each record it visits */
+ enum TDB_ERROR delete_error; /* result of the most recent tdb_delete() */
+};
+
+static int trav(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+ struct trav_data *td)
+{ /* Traverse callback: validate the record, track low/high, optionally delete it. */
+ int val;
+
+ td->calls++;
+ if (key.dsize != sizeof(val) || dbuf.dsize != sizeof(val) /* both must be int-sized and identical */
+ || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
+ td->mismatch = true;
+ return -1;
+ }
+ memcpy(&val, dbuf.dptr, dbuf.dsize);
+ if (val < td->low)
+ td->low = val;
+ if (val > td->high)
+ td->high = val;
+
+ if (td->delete) {
+ td->delete_error = tdb_delete(tdb, key);
+ if (td->delete_error != TDB_SUCCESS) {
+ return -1;
+ }
+ }
+
+ if (td->calls == td->call_limit) /* returns 1 once the requested number of calls is reached */
+ return 1;
+ return 0;
+}
+
+struct trav_grow_data { /* State shared between main() and the trav_grow() callback. */
+ unsigned int calls; /* number of callbacks so far */
+ unsigned int num_large; /* records whose data was already longer than an int when visited */
+ bool mismatch; /* set when a record fails the key/data prefix check */
+ enum TDB_ERROR error; /* result of the most recent tdb_append() */
+};
+
+static int trav_grow(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+ struct trav_grow_data *tgd)
+{ /* Traverse callback: append 128 zero bytes to every record it is handed. */
+ int val;
+ unsigned char buffer[128] = { 0 };
+
+ tgd->calls++;
+ if (key.dsize != sizeof(val) || dbuf.dsize < sizeof(val) /* data may be longer than the key, but must start with it */
+ || memcmp(key.dptr, dbuf.dptr, key.dsize) != 0) {
+ tgd->mismatch = true;
+ return -1;
+ }
+
+ if (dbuf.dsize > sizeof(val))
+ /* We must have seen this before! */
+ tgd->num_large++;
+
+ /* Make a big difference to the database. */
+ dbuf.dptr = buffer;
+ dbuf.dsize = sizeof(buffer);
+ tgd->error = tdb_append(tdb, key, dbuf);
+ if (tgd->error != TDB_SUCCESS) {
+ return -1;
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{ /* Exercise tdb_traverse(): empty db, full, short, deleting and growing traversals, under every flag set. */
+ unsigned int i;
+ int num;
+ struct trav_data td;
+ struct trav_grow_data tgd;
+ struct tdb_context *tdb;
+ uint64_t seed = 16014841315512641303ULL;
+ int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+ TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+ TDB_NOMMAP|TDB_CONVERT };
+ union tdb_attribute hattr = { .hash = { .base = { TDB_ATTRIBUTE_HASH },
+ .fn = fixedhash,
+ .data = &seed } }; /* fixedhash() reads this seed through its p argument */
+
+ hattr.base.next = &tap_log_attr; /* chain the test logger behind the hash attribute */
+
+ plan_tests(sizeof(flags) / sizeof(flags[0]) * 32 + 1); /* 32 checks per flag set, +1 for the final log check */
+ for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+ tdb = tdb_open("run-traverse.tdb", flags[i],
+ O_RDWR|O_CREAT|O_TRUNC, 0600, &hattr);
+ ok1(tdb);
+ if (!tdb)
+ continue;
+
+ ok1(tdb_traverse(tdb, NULL, NULL) == 0); /* a fresh db traverses zero records */
+
+ ok1(store_records(tdb));
+ num = tdb_traverse(tdb, NULL, NULL);
+ ok1(num == NUM_RECORDS);
+
+ /* Full traverse. */
+ td.calls = 0;
+ td.call_limit = UINT_MAX;
+ td.low = INT_MAX;
+ td.high = INT_MIN;
+ td.mismatch = false;
+ td.delete = false;
+
+ num = tdb_traverse(tdb, trav, &td);
+ ok1(num == NUM_RECORDS);
+ ok1(!td.mismatch);
+ ok1(td.calls == NUM_RECORDS);
+ ok1(td.low == 0);
+ ok1(td.high == NUM_RECORDS-1);
+
+ /* Short traverse. */
+ td.calls = 0;
+ td.call_limit = NUM_RECORDS / 2; /* trav() returns 1 after this many calls */
+ td.low = INT_MAX;
+ td.high = INT_MIN;
+ td.mismatch = false;
+ td.delete = false;
+
+ num = tdb_traverse(tdb, trav, &td);
+ ok1(num == NUM_RECORDS / 2);
+ ok1(!td.mismatch);
+ ok1(td.calls == NUM_RECORDS / 2);
+ ok1(td.low <= NUM_RECORDS / 2);
+ ok1(td.high > NUM_RECORDS / 2);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tap_log_messages == 0);
+
+ /* Deleting traverse (delete everything). */
+ td.calls = 0;
+ td.call_limit = UINT_MAX;
+ td.low = INT_MAX;
+ td.high = INT_MIN;
+ td.mismatch = false;
+ td.delete = true;
+ td.delete_error = TDB_SUCCESS;
+ num = tdb_traverse(tdb, trav, &td);
+ ok1(num == NUM_RECORDS);
+ ok1(td.delete_error == TDB_SUCCESS);
+ ok1(!td.mismatch);
+ ok1(td.calls == NUM_RECORDS);
+ ok1(td.low == 0);
+ ok1(td.high == NUM_RECORDS - 1);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Now it's empty! */
+ ok1(tdb_traverse(tdb, NULL, NULL) == 0);
+
+ /* Re-add. */
+ ok1(store_records(tdb));
+ ok1(tdb_traverse(tdb, NULL, NULL) == NUM_RECORDS);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+ /* Grow. This will cause us to be reshuffled. */
+ tgd.calls = 0;
+ tgd.num_large = 0;
+ tgd.mismatch = false;
+ tgd.error = TDB_SUCCESS;
+ ok1(tdb_traverse(tdb, trav_grow, &tgd) > 1);
+ ok1(tgd.error == 0);
+ ok1(!tgd.mismatch);
+ ok1(tdb_check(tdb, NULL, NULL) == 0);
+ ok1(tgd.num_large < tgd.calls); /* not every visit may be a repeat of an already-grown record */
+ diag("growing db: %u calls, %u repeats",
+ tgd.calls, tgd.num_large);
+
+ tdb_close(tdb);
+ }
+
+ ok1(tap_log_messages == 0);
+ return exit_status();
+}
diff --git a/lib/tdb2/tools/Makefile b/lib/tdb2/tools/Makefile
new file mode 100644
index 00000000000..11188c3baf6
--- /dev/null
+++ b/lib/tdb2/tools/Makefile
@@ -0,0 +1,16 @@
+OBJS:=../../tdb2.o ../../hash.o ../../tally.o
+CFLAGS:=-I../../.. -I.. -Wall -g -O3 #-g -pg
+LDFLAGS:=-L../../..
+# Plain "make" builds every tool named on the default line below.
+default: tdb2torture tdb2tool tdb2dump tdb2restore mktdb2 speed growtdb-bench
+# No recipes are given: each tool lists its own .c file plus $(OBJS) as prerequisites.
+tdb2dump: tdb2dump.c $(OBJS)
+tdb2restore: tdb2restore.c $(OBJS)
+tdb2torture: tdb2torture.c $(OBJS)
+tdb2tool: tdb2tool.c $(OBJS)
+mktdb2: mktdb2.c $(OBJS)
+speed: speed.c $(OBJS)
+growtdb-bench: growtdb-bench.c $(OBJS)
+
+clean:
+ rm -f tdb2torture tdb2dump tdb2restore tdb2tool mktdb2 speed growtdb-bench
diff --git a/lib/tdb2/tools/growtdb-bench.c b/lib/tdb2/tools/growtdb-bench.c
new file mode 100644
index 00000000000..f7f6845a8a4
--- /dev/null
+++ b/lib/tdb2/tools/growtdb-bench.c
@@ -0,0 +1,112 @@
+#include "tdb2.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <err.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static void logfn(struct tdb_context *tdb,
+ enum tdb_log_level level,
+ const char *message,
+ void *data)
+{ /* Log callback: print "tdb:<name>:<message>" to stderr; level and data are unused. */
+ fprintf(stderr, "tdb:%s:%s\n", tdb_name(tdb), message);
+}
+
+int main(int argc, char *argv[])
+{	/* Benchmark: insert <users> records, then <groups> rounds of appends, dumping /proc/<pid>/statm. */
+	unsigned int i, j, users, groups;
+	TDB_DATA idxkey, idxdata, k, d, gk;
+	char cmd[100];
+	struct tdb_context *tdb;
+	enum TDB_ERROR ecode;
+	union tdb_attribute log;
+
+	if (argc != 3) {
+		printf("Usage: growtdb-bench <users> <groups>\n");
+		exit(1);
+	}
+	users = atoi(argv[1]);
+	groups = atoi(argv[2]);
+
+	sprintf(cmd, "cat /proc/%i/statm", getpid());
+
+	log.base.attr = TDB_ATTRIBUTE_LOG;
+	log.base.next = NULL;
+	log.log.fn = logfn;
+
+	tdb = tdb_open("/tmp/growtdb.tdb", TDB_DEFAULT,
+		       O_RDWR|O_CREAT|O_TRUNC, 0600, &log);
+	if (!tdb) /* every call below would otherwise be handed a NULL handle */
+		err(1, "Opening /tmp/growtdb.tdb");
+
+	idxkey.dptr = (unsigned char *)"User index";
+	idxkey.dsize = strlen("User index");
+	idxdata.dsize = 51;
+	idxdata.dptr = calloc(idxdata.dsize, 1);
+
+	/* Create users. */
+	k.dsize = 48;
+	k.dptr = calloc(k.dsize, 1);
+	d.dsize = 64;
+	d.dptr = calloc(d.dsize, 1);
+
+	tdb_transaction_start(tdb);
+	for (i = 0; i < users; i++) {
+		memcpy(k.dptr, &i, sizeof(i));
+		ecode = tdb_store(tdb, k, d, TDB_INSERT);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
+
+		/* This simulates a growing index record. */
+		ecode = tdb_append(tdb, idxkey, idxdata);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "tdb append failed: %s", tdb_errorstr(ecode));
+	}
+	if ((ecode = tdb_transaction_commit(tdb)) != 0)
+		errx(1, "tdb commit1 failed: %s", tdb_errorstr(ecode));
+
+	if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
+		errx(1, "tdb_check failed after initial insert!");
+
+	system(cmd);
+
+	/* Now put them all in groups: each adds 32 bytes to every record. */
+	gk.dsize = 48;
+	gk.dptr = calloc(gk.dsize, 1); /* sized from gk itself, not from k */
+	gk.dptr[gk.dsize-1] = 1; /* last byte set so group keys differ from user keys */
+
+	d.dsize = 32;
+	for (i = 0; i < groups; i++) {
+		tdb_transaction_start(tdb);
+		/* Create the "group". */
+		memcpy(gk.dptr, &i, sizeof(i));
+		ecode = tdb_store(tdb, gk, d, TDB_INSERT);
+		if (ecode != TDB_SUCCESS)
+			errx(1, "tdb insert failed: %s", tdb_errorstr(ecode));
+
+		/* Now populate it. */
+		for (j = 0; j < users; j++) {
+			/* Append to the user. */
+			memcpy(k.dptr, &j, sizeof(j));
+			if ((ecode = tdb_append(tdb, k, d)) != 0)
+				errx(1, "tdb append failed: %s",
+				     tdb_errorstr(ecode));
+
+			/* Append to the group. */
+			if ((ecode = tdb_append(tdb, gk, d)) != 0)
+				errx(1, "tdb append failed: %s",
+				     tdb_errorstr(ecode));
+		}
+		if ((ecode = tdb_transaction_commit(tdb)) != 0)
+			errx(1, "tdb commit2 failed: %s", tdb_errorstr(ecode));
+		if ((ecode = tdb_check(tdb, NULL, NULL)) != 0)
+			errx(1, "tdb_check failed after iteration %u!", i);
+		system(cmd);
+	}
+
+	return 0;
+}
diff --git a/lib/tdb2/tools/mktdb2.c b/lib/tdb2/tools/mktdb2.c
new file mode 100644
index 00000000000..c8c280349e6
--- /dev/null
+++ b/lib/tdb2/tools/mktdb2.c
@@ -0,0 +1,29 @@
+#include "tdb2.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <err.h>
+
+int main(int argc, char *argv[])
+{	/* Create <tdbfile> and fill it with <numrecords> records whose key and value are the record index. */
+	unsigned int i, num_recs;
+	struct tdb_context *tdb;
+
+	if (argc != 3 || (num_recs = atoi(argv[2])) == 0)
+		errx(1, "Usage: mktdb2 <tdbfile> <numrecords>");
+
+	tdb = tdb_open(argv[1], TDB_DEFAULT, O_CREAT|O_TRUNC|O_RDWR, 0600,NULL);
+	if (!tdb)
+		err(1, "Opening %s", argv[1]);
+
+	for (i = 0; i < num_recs; i++) {
+		TDB_DATA d = { .dptr = (void *)&i, .dsize = sizeof(i) };
+		enum TDB_ERROR ecode = tdb_store(tdb, d, d, TDB_INSERT);
+
+		if (ecode != TDB_SUCCESS) /* report the tdb error code rather than errno */
+			errx(1, "Failed to store record %u: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	printf("Done\n");
+	return 0;
+}
diff --git a/lib/tdb2/tools/speed.c b/lib/tdb2/tools/speed.c
new file mode 100644
index 00000000000..3222465a712
--- /dev/null
+++ b/lib/tdb2/tools/speed.c
@@ -0,0 +1,440 @@
+/* Simple speed test for TDB */
+#include <err.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include "tdb2.h"
+
+/* Nanoseconds per operation */
+static size_t normalize(const struct timeval *start,
+ const struct timeval *stop,
+ unsigned int num)
+{ /* Average time per operation: (stop - start) divided by num, scaled by 1000. */
+ struct timeval diff;
+
+ timersub(stop, start, &diff);
+
+ /* Floating point is more accurate here. */
+ return (double)(diff.tv_sec * 1000000 + diff.tv_usec) /* elapsed time in microseconds */
+ / num * 1000; /* per operation, microseconds -> nanoseconds; NOTE(review): num == 0 is not guarded */
+}
+
+static size_t file_size(void)
+{ /* Size in bytes of /tmp/speed.tdb, or (size_t)-1 when stat() fails. */
+ struct stat st;
+
+ if (stat("/tmp/speed.tdb", &st) != 0)
+ return -1;
+ return st.st_size;
+}
+
+static int count_record(struct tdb_context *tdb,
+ TDB_DATA key, TDB_DATA data, void *p)
+{ /* Traverse callback: add the int at the start of data to the int that p points to. */
+ int *total = p;
+ *total += *(int *)data.dptr;
+ return 0;
+}
+
+/* Print every counter from *tdb's stats attribute, then close and reopen the db into *tdb. */
+static void dump_and_clear_stats(struct tdb_context **tdb,
+				 int flags,
+				 union tdb_attribute *attr)
+{
+	union tdb_attribute stats;
+	enum TDB_ERROR ecode;
+
+	stats.base.attr = TDB_ATTRIBUTE_STATS;
+	stats.stats.size = sizeof(stats.stats);
+	ecode = tdb_get_attribute(*tdb, &stats);
+	if (ecode != TDB_SUCCESS)
+		errx(1, "Getting stats: %s", tdb_errorstr(ecode));
+
+	printf("allocs = %llu\n",
+	       (unsigned long long)stats.stats.allocs);
+	printf(" alloc_subhash = %llu\n",
+	       (unsigned long long)stats.stats.alloc_subhash);
+	printf(" alloc_chain = %llu\n",
+	       (unsigned long long)stats.stats.alloc_chain);
+	printf(" alloc_bucket_exact = %llu\n",
+	       (unsigned long long)stats.stats.alloc_bucket_exact);
+	printf(" alloc_bucket_max = %llu\n",
+	       (unsigned long long)stats.stats.alloc_bucket_max);
+	printf(" alloc_leftover = %llu\n",
+	       (unsigned long long)stats.stats.alloc_leftover);
+	printf(" alloc_coalesce_tried = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_tried);
+	printf(" alloc_coalesce_iterate_clash = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_iterate_clash);
+	printf(" alloc_coalesce_lockfail = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_lockfail);
+	printf(" alloc_coalesce_race = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_race);
+	printf(" alloc_coalesce_succeeded = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_succeeded);
+	printf(" alloc_coalesce_num_merged = %llu\n",
+	       (unsigned long long)stats.stats.alloc_coalesce_num_merged);
+	printf("compares = %llu\n",
+	       (unsigned long long)stats.stats.compares);
+	printf(" compare_wrong_bucket = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_bucket);
+	printf(" compare_wrong_offsetbits = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_offsetbits);
+	printf(" compare_wrong_keylen = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_keylen);
+	printf(" compare_wrong_rechash = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_rechash);
+	printf(" compare_wrong_keycmp = %llu\n",
+	       (unsigned long long)stats.stats.compare_wrong_keycmp);
+	printf("transactions = %llu\n",
+	       (unsigned long long)stats.stats.transactions);
+	printf(" transaction_cancel = %llu\n",
+	       (unsigned long long)stats.stats.transaction_cancel);
+	printf(" transaction_nest = %llu\n",
+	       (unsigned long long)stats.stats.transaction_nest);
+	printf(" transaction_expand_file = %llu\n",
+	       (unsigned long long)stats.stats.transaction_expand_file);
+	printf(" transaction_read_direct = %llu\n",
+	       (unsigned long long)stats.stats.transaction_read_direct);
+	printf(" transaction_read_direct_fail = %llu\n",
+	       (unsigned long long)stats.stats.transaction_read_direct_fail);
+	printf(" transaction_write_direct = %llu\n",
+	       (unsigned long long)stats.stats.transaction_write_direct);
+	printf(" transaction_write_direct_fail = %llu\n",
+	       (unsigned long long)stats.stats.transaction_write_direct_fail);
+	printf("expands = %llu\n", (unsigned long long)stats.stats.expands);
+	printf("frees = %llu\n", (unsigned long long)stats.stats.frees);
+	printf("locks = %llu\n", (unsigned long long)stats.stats.locks);
+	printf(" lock_lowlevel = %llu\n",
+	       (unsigned long long)stats.stats.lock_lowlevel);
+	printf(" lock_nonblock = %llu\n",
+	       (unsigned long long)stats.stats.lock_nonblock);
+	printf(" lock_nonblock_fail = %llu\n",
+	       (unsigned long long)stats.stats.lock_nonblock_fail);
+
+	/* Now clear, by closing and reopening the database. */
+	tdb_close(*tdb);
+	*tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR, 0, attr);
+	if (!*tdb) /* the caller keeps using *tdb, so a failed reopen must not go unnoticed */
+		err(1, "Reopening /tmp/speed.tdb");
+}
+
+static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level,
+ const char *message, void *data)
+{ /* Log callback: write message and a newline to stderr; the other arguments are unused. */
+ fputs(message, stderr);
+ putc('\n', stderr);
+}
+
+/* Time add/find/miss/traverse/delete/re-add/append/churn over num records in /tmp/speed.tdb. */
+int main(int argc, char *argv[])
+{
+	unsigned int i, j, num = 1000, stage = 0, stopat = -1;
+	int flags = TDB_DEFAULT;
+	bool transaction = false, summary = false, do_stats = false;
+	TDB_DATA key, data;
+	struct tdb_context *tdb;
+	struct timeval start, stop;
+	union tdb_attribute seed, log;
+	enum TDB_ERROR ecode;
+
+	/* Try to keep benchmarks even. */
+	seed.base.attr = TDB_ATTRIBUTE_SEED;
+	seed.base.next = NULL;
+	seed.seed.seed = 0;
+
+	log.base.attr = TDB_ATTRIBUTE_LOG;
+	log.base.next = &seed;
+	log.log.fn = tdb_log;
+
+	if (argv[1] && strcmp(argv[1], "--internal") == 0) {
+		flags = TDB_INTERNAL;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--transaction") == 0) {
+		transaction = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--no-sync") == 0) {
+		flags |= TDB_NOSYNC;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--summary") == 0) {
+		summary = true;
+		argc--;
+		argv++;
+	}
+	if (argv[1] && strcmp(argv[1], "--stats") == 0) {
+		do_stats = true;
+		argc--;
+		argv++;
+	}
+
+	tdb = tdb_open("/tmp/speed.tdb", flags, O_RDWR|O_CREAT|O_TRUNC,
+		       0600, &log);
+	if (!tdb)
+		err(1, "Opening /tmp/speed.tdb");
+
+	key.dptr = (void *)&i;
+	key.dsize = sizeof(i);
+	data = key;
+
+	if (argv[1]) {
+		num = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	if (argv[1]) {
+		stopat = atoi(argv[1]);
+		argv++;
+		argc--;
+	}
+
+	/* Add 1000 records. */
+	printf("Adding %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+
+	if (++stage == stopat)
+		exit(0);
+
+	/* Finding 1000 records. */
+	printf("Finding %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++) {
+		struct tdb_data dbuf;
+		if ((ecode = tdb_fetch(tdb, key, &dbuf)) != TDB_SUCCESS
+		    || *(int *)dbuf.dptr != i)
+			errx(1, "Fetching key %u in tdb gave %u",
+			     i, ecode ? ecode : *(int *)dbuf.dptr);
+		free(dbuf.dptr); /* tdb_fetch() hands back an allocated buffer; do not leak one per record */
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Missing 1000 records. */
+	printf("Missing %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (i = num; i < num*2; i++) {
+		struct tdb_data dbuf;
+		ecode = tdb_fetch(tdb, key, &dbuf);
+		if (ecode != TDB_ERR_NOEXIST)
+			errx(1, "Fetching key %u in tdb gave %s",
+			     i, tdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Traverse 1000 records. */
+	printf("Traversing %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	i = 0;
+	gettimeofday(&start, NULL);
+	if (tdb_traverse(tdb, count_record, &i) != num)
+		errx(1, "Traverse returned wrong number of records");
+	if (i != (num % 2 ? num * ((num - 1) / 2) : (num - 1) * (num / 2)))
+		errx(1, "Traverse tallied to %u", i); /* expected 0+1+...+(num-1); halve the even factor so odd num works */
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Delete 1000 records (not in order). */
+	printf("Deleting %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Re-add 1000 records (not in order). */
+	printf("Re-adding %u records: ", num); fflush(stdout);
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 100003) % num;
+		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	/* Append 1000 records. */
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	printf("Appending %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (i = 0; i < num; i++)
+		if ((ecode = tdb_append(tdb, key, data)) != TDB_SUCCESS)
+			errx(1, "Appending key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (++stage == stopat)
+		exit(0);
+
+	/* Churn 1000 records: not in order! */
+	if (transaction && (ecode = tdb_transaction_start(tdb)))
+		errx(1, "starting transaction: %s", tdb_errorstr(ecode));
+	printf("Churning %u records: ", num); fflush(stdout);
+	gettimeofday(&start, NULL);
+	for (j = 0; j < num; j++) {
+		i = (j + 1000019) % num;
+		if ((ecode = tdb_delete(tdb, key)) != TDB_SUCCESS)
+			errx(1, "Deleting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+		i += num;
+		if ((ecode = tdb_store(tdb, key, data, TDB_INSERT)) != 0)
+			errx(1, "Inserting key %u in tdb: %s",
+			     i, tdb_errorstr(ecode));
+	}
+	gettimeofday(&stop, NULL);
+	if (transaction && (ecode = tdb_transaction_commit(tdb)))
+		errx(1, "committing transaction: %s", tdb_errorstr(ecode));
+	printf(" %zu ns (%zu bytes)\n",
+	       normalize(&start, &stop, num), file_size());
+
+	if (tdb_check(tdb, NULL, NULL))
+		errx(1, "tdb_check failed!");
+	if (summary) {
+		char *sumstr = NULL;
+		tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &sumstr);
+		printf("%s\n", sumstr);
+		free(sumstr);
+	}
+	if (do_stats)
+		dump_and_clear_stats(&tdb, flags, &log);
+	if (++stage == stopat)
+		exit(0);
+
+	return 0;
+}
diff --git a/lib/tdb2/tools/tdb2dump.c b/lib/tdb2/tools/tdb2dump.c
new file mode 100644
index 00000000000..abe1d9b8710
--- /dev/null
+++ b/lib/tdb2/tools/tdb2dump.c
@@ -0,0 +1,115 @@
+/*
+ simple tdb2 dump util
+ Copyright (C) Andrew Tridgell 2001
+ Copyright (C) Rusty Russell 2011
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb2.h"
+#include <ctype.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+/*
+ * Write the bytes of d to stdout in dump format: printable characters are
+ * emitted unchanged, while '"', '\\' and every non-printable byte are written
+ * as a backslash followed by two upper-case hex digits.
+ */
+static void print_data(TDB_DATA d)
+{
+	const unsigned char *p = (const unsigned char *)d.dptr;
+	size_t i;
+
+	/* Index with size_t: copying dsize into an int would truncate it. */
+	for (i = 0; i < d.dsize; i++) {
+		if (isprint(p[i]) && p[i] != '"' && p[i] != '\\') {
+			fputc(p[i], stdout);
+		} else {
+			printf("\\%02X", p[i]);
+		}
+	}
+}
+
+/*
+ * tdb_traverse() callback: print one record to stdout as a brace-delimited
+ * block holding the key line and the data line.  Always returns 0.
+ */
+static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	fputs("{\n", stdout);
+
+	printf("key(%d) = \"", (int)key.dsize);
+	print_data(key);
+	fputs("\"\n", stdout);
+
+	printf("data(%d) = \"", (int)dbuf.dsize);
+	print_data(dbuf);
+	fputs("\"\n", stdout);
+
+	fputs("}\n", stdout);
+	return 0;
+}
+
+/*
+ * Open fname read-only and dump it to stdout.
+ *
+ * When keyname is NULL every record is printed through traverse_fn();
+ * otherwise only the value stored under keyname is printed.
+ *
+ * Returns 0 on success, 1 when the database cannot be opened or the key
+ * cannot be fetched.
+ */
+static int dump_tdb(const char *fname, const char *keyname)
+{
+	struct tdb_context *tdb;
+	TDB_DATA key, value;
+	int ret = 0;
+
+	tdb = tdb_open(fname, 0, O_RDONLY, 0, NULL);
+	if (!tdb) {
+		printf("Failed to open %s\n", fname);
+		return 1;
+	}
+
+	if (!keyname) {
+		tdb_traverse(tdb, traverse_fn, NULL);
+	} else {
+		key = tdb_mkdata(keyname, strlen(keyname));
+		if (tdb_fetch(tdb, key, &value) != 0) {
+			ret = 1;
+		} else {
+			print_data(value);
+			free(value.dptr);
+		}
+	}
+
+	/* Release the handle on every path instead of leaking it. */
+	tdb_close(tdb);
+	return ret;
+}
+
+/* Print the synopsis and the list of supported options to stdout. */
+static void usage(void)
+{
+	fputs("Usage: tdb2dump [options] <filename>\n\n"
+	      " -h this help message\n"
+	      " -k keyname dumps value of keyname\n",
+	      stdout);
+}
+
+/*
+ * tdb2dump entry point: parse -h and -k KEYNAME, then dump the database named
+ * by the first non-option argument.
+ *
+ * Exits with status 1 on a usage error and 0 after -h; otherwise returns the
+ * result of dump_tdb().
+ */
+ int main(int argc, char *argv[])
+{
+	char *fname, *keyname=NULL;
+	int c;
+
+	if (argc < 2) {
+		printf("Usage: tdb2dump <fname>\n");
+		exit(1);
+	}
+
+	while ((c = getopt( argc, argv, "hk:")) != -1) {
+		switch (c) {
+		case 'h':
+			usage();
+			exit( 0);
+		case 'k':
+			keyname = optarg;
+			break;
+		default:
+			usage();
+			exit( 1);
+		}
+	}
+
+	/*
+	 * The options may have consumed every argument (e.g. "-k key" with no
+	 * filename); argv[optind] would then be the terminating NULL.
+	 */
+	if (optind >= argc) {
+		usage();
+		exit(1);
+	}
+	fname = argv[optind];
+
+	return dump_tdb(fname, keyname);
+}
diff --git a/lib/tdb2/tools/tdb2restore.c b/lib/tdb2/tools/tdb2restore.c
new file mode 100644
index 00000000000..658215a16c4
--- /dev/null
+++ b/lib/tdb2/tools/tdb2restore.c
@@ -0,0 +1,227 @@
+/*
+ tdb2restore -- construct a tdb from tdbdump output.
+ Copyright (C) Volker Lendecke 2010
+ Copyright (C) Simon McVittie 2005
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb2.h"
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define debug_fprintf(file, fmt, ...) do {/*nothing*/} while (0)
+
+/*
+ * Parse the head of a dump line, i.e. the "(N) = \"" part that precedes the
+ * record bytes.
+ *
+ * Input is discarded up to and including the next '(' ; the text up to the
+ * opening '"' is then collected and the decimal byte count N extracted.
+ *
+ * Returns N (>= 0) on success, or -1 on EOF, when no '"' is found within the
+ * prefix buffer, or when the count is unparsable or negative.
+ */
+static int read_linehead(FILE *f)
+{
+	size_t i;
+	int c;
+	int num_bytes;
+	char prefix[128];
+
+	do {
+		c = getc(f);
+		if (c == EOF) {
+			return -1;
+		}
+	} while (c != '(');
+
+	for (i = 0; i < sizeof(prefix); i++) {
+		c = getc(f);
+		if (c == EOF) {
+			return -1;
+		}
+		prefix[i] = c;
+		if (c == '"') {
+			break;
+		}
+	}
+	if (i == sizeof(prefix)) {
+		return -1;
+	}
+	/* Overwrite the opening quote so prefix is a C string. */
+	prefix[i] = '\0';
+
+	if (sscanf(prefix, "%d) = ", &num_bytes) != 1) {
+		return -1;
+	}
+	/* A negative count would turn into a huge size_t in the caller. */
+	if (num_bytes < 0) {
+		return -1;
+	}
+	return num_bytes;
+}
+
+/*
+ * Read one hex digit from f and return its value (0-15).
+ * Returns -1, after printing a diagnostic to stderr, on EOF, on a '"', or on
+ * any other non-hex character.
+ */
+static int read_hex(FILE *f)
+{
+	int c = getc(f);
+
+	if (c == EOF) {
+		fprintf(stderr, "Unexpected EOF in data\n");
+		return -1;
+	}
+	if (c == '"') {
+		fprintf(stderr, "Unexpected \\\" sequence\n");
+		return -1;
+	}
+	if ('0' <= c && c <= '9') {
+		return c - '0';
+	}
+	if ('A' <= c && c <= 'F') {
+		return c - 'A' + 10;
+	}
+	if ('a' <= c && c <= 'f') {
+		return c - 'a' + 10;
+	}
+	fprintf(stderr, "Invalid hex: %c\n", c);
+	return -1;
+}
+
+/*
+ * Decode `size` record bytes from f into a freshly malloc'd d->dptr.
+ * A backslash introduces a two-digit hex escape; any other character is
+ * stored as-is.
+ *
+ * d->dptr is allocated here and is left for the caller to free, including
+ * when an error is returned after the allocation succeeded.
+ *
+ * Returns 0 on success, -1 on allocation failure, EOF, a bad escape, or a
+ * closing '"' that arrives before `size` bytes were decoded.
+ */
+static int read_data(FILE *f, struct tdb_data *d, size_t size)
+{
+	size_t i;
+
+	/* malloc(0) may return NULL; never let that look like a failure. */
+	d->dptr = (unsigned char *)malloc(size ? size : 1);
+	if (d->dptr == NULL) {
+		return -1;
+	}
+	d->dsize = size;
+
+	for (i = 0; i < size; i++) {
+		int c = getc(f);
+
+		if (c == EOF) {
+			fprintf(stderr, "Unexpected EOF in data\n");
+			return -1;
+		}
+		if (c == '"') {
+			/* Fewer bytes than announced: the tail would be garbage. */
+			fprintf(stderr, "Unexpected end of data\n");
+			return -1;
+		}
+		if (c == '\\') {
+			int high, low;
+
+			/* Read the escape from the same stream as the data. */
+			high = read_hex(f);
+			if (high < 0) {
+				return -1;
+			}
+			low = read_hex(f);
+			if (low < 0) {
+				return -1;
+			}
+			d->dptr[i] = (unsigned char)((high << 4) | low);
+		} else {
+			d->dptr[i] = (unsigned char)c;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Consume one line from f and require it to be exactly s (s carries its own
+ * trailing newline).  When the read itself fails and eof is non-NULL, *eof is
+ * set to 1.  Returns 0 on a match and -1 otherwise.
+ */
+static int swallow(FILE *f, const char *s, int *eof)
+{
+	char buf[128];
+
+	if (fgets(buf, sizeof(buf), f) != NULL) {
+		return (strcmp(buf, s) == 0) ? 0 : -1;
+	}
+
+	if (eof != NULL) {
+		*eof = 1;
+	}
+	return -1;
+}
+
+/*
+ * Read one "{ key / data }" record from f and insert it into tdb with
+ * TDB_INSERT.
+ *
+ * *eof is only touched by the read of the opening "{" line, so the caller can
+ * tell a clean end of input from a malformed record.  Both buffers are freed
+ * before returning.
+ *
+ * Returns true when the record was stored, false otherwise.
+ */
+static bool read_rec(FILE *f, struct tdb_context *tdb, int *eof)
+{
+	struct tdb_data key, data;
+	enum TDB_ERROR err;
+	bool stored = false;
+	int len;
+
+	key.dptr = NULL;
+	data.dptr = NULL;
+
+	if (swallow(f, "{\n", eof) == -1) {
+		goto out;
+	}
+
+	/* Key line: head, bytes, closing quote. */
+	len = read_linehead(f);
+	if (len == -1
+	    || read_data(f, &key, len) == -1
+	    || swallow(f, "\"\n", NULL) == -1) {
+		goto out;
+	}
+
+	/* Data line, followed by the closing brace line. */
+	len = read_linehead(f);
+	if (len == -1
+	    || read_data(f, &data, len) == -1
+	    || swallow(f, "\"\n", NULL) == -1
+	    || swallow(f, "}\n", NULL) == -1) {
+		goto out;
+	}
+
+	err = tdb_store(tdb, key, data, TDB_INSERT);
+	if (err != TDB_SUCCESS) {
+		fprintf(stderr, "TDB error: %s\n", tdb_errorstr(err));
+	} else {
+		stored = true;
+	}
+
+out:
+	free(key.dptr);
+	free(data.dptr);
+	return stored;
+}
+
+/*
+ * Create fname (it must not exist yet, see O_EXCL) and fill it with the
+ * records read from stdin until read_rec() reports end of input.
+ *
+ * Returns 0 on success, 1 when the database cannot be created, a record
+ * cannot be read or stored, or closing the database fails.
+ */
+static int restore_tdb(const char *fname)
+{
+	struct tdb_context *tdb;
+
+	tdb = tdb_open(fname, 0, O_RDWR|O_CREAT|O_EXCL, 0666, NULL);
+	if (!tdb) {
+		perror("tdb_open");
+		fprintf(stderr, "Failed to open %s\n", fname);
+		return 1;
+	}
+
+	while (1) {
+		int eof = 0;
+		if (!read_rec(stdin, tdb, &eof)) {
+			if (eof) {
+				break;
+			}
+			/* Do not leak the handle on a bad record. */
+			tdb_close(tdb);
+			return 1;
+		}
+	}
+	if (tdb_close(tdb)) {
+		fprintf(stderr, "Error closing tdb\n");
+		return 1;
+	}
+	fprintf(stderr, "EOF\n");
+	return 0;
+}
+
+/*
+ * tdb2restore entry point: argv[1] names the database to create; the dump is
+ * read from stdin.  Prints a usage line and exits with 1 when no name is given.
+ */
+int main(int argc, char *argv[])
+{
+	if (argc >= 2) {
+		return restore_tdb(argv[1]);
+	}
+
+	printf("Usage: %s dbname < tdbdump_output\n", argv[0]);
+	exit(1);
+}
diff --git a/lib/tdb2/tools/tdb2tool.c b/lib/tdb2/tools/tdb2tool.c
new file mode 100644
index 00000000000..cd301c80b78
--- /dev/null
+++ b/lib/tdb2/tools/tdb2tool.c
@@ -0,0 +1,798 @@
+/*
+ Unix SMB/CIFS implementation.
+ Samba database functions
+ Copyright (C) Andrew Tridgell 1999-2000
+ Copyright (C) Paul `Rusty' Russell 2000
+ Copyright (C) Jeremy Allison 2000
+ Copyright (C) Andrew Esh 2001
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb2.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <stdarg.h>
+
+static int do_command(void);
+const char *cmdname; /* command word consumed by do_command(); assigned in main() */
+char *arg1, *arg2; /* first/second command argument, NULL when absent */
+size_t arg1len, arg2len; /* lengths reported by convert_string() for arg1/arg2 */
+int bIterate = 0; /* set by the "first" command; "next" only acts while non-zero */
+char *line; /* NOTE(review): not referenced in the visible part of this file */
+TDB_DATA iterate_kbuf; /* key cursor handed to first_record()/next_record() */
+char cmdline[1024]; /* NOTE(review): not referenced in the visible part of this file */
+static int disable_mmap; /* flipped by toggle_mmap(); non-zero selects TDB_NOMMAP on open/create */
+
+/* Identifiers for the commands dispatched by do_command(). */
+enum commands {
+ CMD_CREATE_TDB,
+ CMD_OPEN_TDB,
+ CMD_TRANSACTION_START,
+ CMD_TRANSACTION_COMMIT,
+ CMD_TRANSACTION_CANCEL,
+ CMD_ERASE,
+ CMD_DUMP,
+ CMD_INSERT,
+ CMD_MOVE,
+ CMD_STORE,
+ CMD_SHOW,
+ CMD_KEYS,
+ CMD_HEXKEYS,
+ CMD_DELETE,
+#if 0
+ CMD_LIST_HASH_FREE,
+ CMD_LIST_FREE,
+#endif
+ CMD_INFO,
+ CMD_MMAP,
+ CMD_SPEED,
+ CMD_FIRST,
+ CMD_NEXT,
+ CMD_SYSTEM,
+ CMD_CHECK,
+ CMD_QUIT,
+ CMD_HELP
+};
+
+/* One row of the command table: the word typed by the user and its command. */
+typedef struct {
+ const char *name;
+ enum commands cmd;
+} COMMAND_TABLE;
+
+/*
+ * Command words in the order do_command() scans them; the first entry whose
+ * name is a prefix of the typed command wins.  The NULL-named row ends the
+ * scan.  The transaction_* words are compiled out here (#if 0), so those enum
+ * values cannot be selected through this table.
+ */
+COMMAND_TABLE cmd_table[] = {
+ {"create", CMD_CREATE_TDB},
+ {"open", CMD_OPEN_TDB},
+#if 0
+ {"transaction_start", CMD_TRANSACTION_START},
+ {"transaction_commit", CMD_TRANSACTION_COMMIT},
+ {"transaction_cancel", CMD_TRANSACTION_CANCEL},
+#endif
+ {"erase", CMD_ERASE},
+ {"dump", CMD_DUMP},
+ {"insert", CMD_INSERT},
+ {"move", CMD_MOVE},
+ {"store", CMD_STORE},
+ {"show", CMD_SHOW},
+ {"keys", CMD_KEYS},
+ {"hexkeys", CMD_HEXKEYS},
+ {"delete", CMD_DELETE},
+#if 0
+ {"list", CMD_LIST_HASH_FREE},
+ {"free", CMD_LIST_FREE},
+#endif
+ {"info", CMD_INFO},
+ {"speed", CMD_SPEED},
+ {"mmap", CMD_MMAP},
+ {"first", CMD_FIRST},
+ {"1", CMD_FIRST},
+ {"next", CMD_NEXT},
+ {"n", CMD_NEXT},
+ {"check", CMD_CHECK},
+ {"quit", CMD_QUIT},
+ {"q", CMD_QUIT},
+ {"!", CMD_SYSTEM},
+ {NULL, CMD_HELP}
+};
+
+/* Start and end stamps of the interval measured by the two helpers below. */
+struct timeval tp1, tp2;
+
+/* Record the start of a timed interval in tp1. */
+static void _start_timer(void)
+{
+	gettimeofday(&tp1, NULL);
+}
+
+/* Stamp tp2 and return the seconds elapsed since _start_timer(). */
+static double _end_timer(void)
+{
+	double whole, frac;
+
+	gettimeofday(&tp2, NULL);
+	whole = tp2.tv_sec - tp1.tv_sec;
+	frac = (tp2.tv_usec - tp1.tv_usec) * 1.0e-6;
+	return whole + frac;
+}
+
+/* Log callback installed on open/create: copy the message to stderr as-is. */
+static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level,
+		    const char *message, void *priv)
+{
+	fprintf(stderr, "%s", message);
+}
+
+/* a tdb tool for manipulating a tdb database */
+
+static struct tdb_context *tdb;
+
+static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+
+/*
+ * Print len bytes of buf to stdout, replacing non-printable bytes with '.'.
+ * A single trailing NUL is dropped, since the data is usually a C string.
+ * Nothing is printed when len <= 0.
+ */
+static void print_asc(const char *buf,int len)
+{
+	int i;
+
+	/* Also keeps the buf[len - 1] probe below inside the buffer. */
+	if (len <= 0)
+		return;
+
+	/* We're probably printing ASCII strings so don't try to display
+	   the trailing NULL character. */
+	if (buf[len - 1] == 0)
+		len--;
+
+	for (i=0;i<len;i++) {
+		/* <ctype.h> needs a value representable as unsigned char. */
+		unsigned char ch = (unsigned char)buf[i];
+
+		printf("%c",isprint(ch)?ch:'.');
+	}
+}
+
+/*
+ * Hex dump of len bytes of buf to stdout.  Each row starts with a "[offset]"
+ * label, shows up to 16 bytes in hex and then the same bytes through
+ * print_asc() in two groups of at most 8.  Nothing is printed for len <= 0.
+ */
+static void print_data(const char *buf,int len)
+{
+ int i=0;
+ if (len<=0) return;
+ printf("[%03X] ",i);
+ for (i=0;i<len;) {
+ printf("%02X ",(int)((unsigned char)buf[i]));
+ i++;
+ if (i%8 == 0) printf(" ");
+ /* A full row of 16 bytes is complete: append its ASCII rendering. */
+ if (i%16 == 0) {
+ print_asc(&buf[i-16],8); printf(" ");
+ print_asc(&buf[i-8],8); printf("\n");
+ if (i<len) printf("[%03X] ",i);
+ }
+ }
+ /* Partial last row: pad the missing hex columns, then print its ASCII. */
+ if (i%16) {
+ int n;
+
+ n = 16 - (i%16);
+ printf(" ");
+ if (n>8) printf(" ");
+ while (n--) printf(" ");
+
+ /* First group: at most 8 bytes from the start of the row. */
+ n = i%16;
+ if (n > 8) n = 8;
+ print_asc(&buf[i-(i%16)],n); printf(" ");
+ /* Second group: whatever is left beyond those 8 bytes. */
+ n = (i%16) - n;
+ if (n>0) print_asc(&buf[i-n],n);
+ printf("\n");
+ }
+}
+
+/*
+ * Print the command summary to stdout.
+ * NOTE(review): the text advertises "openjh" and the transaction_* commands,
+ * but cmd_table has no "openjh" row and its transaction_* rows sit inside
+ * #if 0 -- confirm whether the help text or the table should change.
+ */
+static void help(void)
+{
+ printf("\n"
+"tdbtool: \n"
+" create dbname : create a database\n"
+" open dbname : open an existing database\n"
+" openjh dbname : open an existing database (jenkins hash)\n"
+" transaction_start : start a transaction\n"
+" transaction_commit : commit a transaction\n"
+" transaction_cancel : cancel a transaction\n"
+" erase : erase the database\n"
+" dump : dump the database as strings\n"
+" keys : dump the database keys as strings\n"
+" hexkeys : dump the database keys as hex values\n"
+" info : print summary info about the database\n"
+" insert key data : insert a record\n"
+" move key file : move a record to a destination tdb\n"
+" store key data : store a record (replace)\n"
+" show key : show a record by key\n"
+" delete key : delete a record by key\n"
+#if 0
+" list : print the database hash table and freelist\n"
+" free : print the database freelist\n"
+#endif
+" check : check the integrity of an opened database\n"
+" speed : perform speed tests on the database\n"
+" ! command : execute system command\n"
+" 1 | first : print the first record\n"
+" n | next : print the next record\n"
+" q | quit : terminate\n"
+" \\n : repeat 'next' command\n"
+"\n");
+}
+
+/*
+ * Report a problem on stdout: just `why` when err is TDB_SUCCESS, otherwise
+ * the tdb error string, a colon and `why`.
+ */
+static void terror(enum TDB_ERROR err, const char *why)
+{
+	if (err == TDB_SUCCESS) {
+		printf("%s\n", why);
+		return;
+	}
+	printf("%s:%s\n", tdb_errorstr(err), why);
+}
+
+/*
+ * Create (or truncate) the database tdbname and make it the current one,
+ * closing any database that was open before.  On failure the global tdb is
+ * left NULL and the reason is printed.  A missing name is reported without
+ * touching the current database.
+ */
+static void create_tdb(const char *tdbname)
+{
+	union tdb_attribute log_attr;
+
+	/* "create" typed without an argument arrives here as NULL. */
+	if (tdbname == NULL) {
+		terror(TDB_SUCCESS, "need database name");
+		return;
+	}
+
+	/* Zero the whole union so no attribute field is passed uninitialised. */
+	memset(&log_attr, 0, sizeof(log_attr));
+	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = tdb_log;
+
+	if (tdb) tdb_close(tdb);
+	tdb = tdb_open(tdbname, (disable_mmap?TDB_NOMMAP:0),
+		       O_RDWR | O_CREAT | O_TRUNC, 0600, &log_attr);
+	if (!tdb) {
+		printf("Could not create %s: %s\n", tdbname, strerror(errno));
+	}
+}
+
+/*
+ * Open the existing database tdbname read-write and make it the current one,
+ * closing any database that was open before.  On failure the global tdb is
+ * left NULL and the reason is printed.  A missing name is reported without
+ * touching the current database.
+ */
+static void open_tdb(const char *tdbname)
+{
+	union tdb_attribute log_attr;
+
+	/* "open" typed without an argument arrives here as NULL. */
+	if (tdbname == NULL) {
+		terror(TDB_SUCCESS, "need database name");
+		return;
+	}
+
+	/* Zero the whole union so no attribute field is passed uninitialised. */
+	memset(&log_attr, 0, sizeof(log_attr));
+	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
+	log_attr.base.next = NULL;
+	log_attr.log.fn = tdb_log;
+
+	if (tdb) tdb_close(tdb);
+	tdb = tdb_open(tdbname, disable_mmap?TDB_NOMMAP:0, O_RDWR, 0600,
+		       &log_attr);
+	if (!tdb) {
+		printf("Could not open %s: %s\n", tdbname, strerror(errno));
+	}
+}
+
+/*
+ * Store data under keyname in the current database with TDB_INSERT.
+ * A missing or empty key is rejected; a store failure is reported through
+ * terror().
+ */
+static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+	TDB_DATA k, v;
+	enum TDB_ERROR rc;
+
+	if (keyname == NULL || keylen == 0) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+	v.dsize = datalen;
+	v.dptr = (unsigned char *)data;
+
+	rc = tdb_store(tdb, k, v, TDB_INSERT);
+	if (rc) {
+		terror(rc, "insert failed");
+	}
+}
+
+/*
+ * Store data under keyname in the current database with TDB_REPLACE, echoing
+ * the record first.  Both key and data must be present and non-empty; a store
+ * failure is reported through terror().
+ */
+static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+	TDB_DATA k, v;
+	enum TDB_ERROR rc;
+
+	if (keyname == NULL || keylen == 0) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+	if (data == NULL || datalen == 0) {
+		terror(TDB_SUCCESS, "need data");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+	v.dsize = datalen;
+	v.dptr = (unsigned char *)data;
+
+	printf("Storing key:\n");
+	print_rec(tdb, k, v, NULL);
+
+	rc = tdb_store(tdb, k, v, TDB_REPLACE);
+	if (rc) {
+		terror(rc, "store failed");
+	}
+}
+
+/*
+ * Fetch the record stored under keyname from the current database, print it
+ * with print_rec() and free the fetched buffer.  A missing or empty key, or a
+ * failed fetch, is reported through terror().
+ */
+static void show_tdb(char *keyname, size_t keylen)
+{
+	TDB_DATA k, v;
+	enum TDB_ERROR rc;
+
+	if (keyname == NULL || keylen == 0) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+
+	rc = tdb_fetch(tdb, k, &v);
+	if (rc) {
+		terror(rc, "fetch failed");
+		return;
+	}
+
+	print_rec(tdb, k, v, NULL);
+	free(v.dptr);
+}
+
+/*
+ * Delete the record stored under keyname from the current database.
+ * A missing or empty key, or a failed delete, is reported through terror().
+ */
+static void delete_tdb(char *keyname, size_t keylen)
+{
+	TDB_DATA k;
+	enum TDB_ERROR rc;
+
+	if (keyname == NULL || keylen == 0) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	k.dsize = keylen;
+	k.dptr = (unsigned char *)keyname;
+
+	rc = tdb_delete(tdb, k);
+	if (rc) {
+		terror(rc, "delete failed");
+	}
+}
+
+/*
+ * Copy the record stored under keyname from the current database into the
+ * existing database tdbname (stored there with TDB_REPLACE).
+ * NOTE(review): despite the name, the record is not deleted from the source
+ * database -- confirm whether that is intended.
+ *
+ * Problems (missing key or destination, failed fetch/open/store) are reported
+ * through terror().
+ */
+static void move_rec(char *keyname, size_t keylen, char* tdbname)
+{
+	TDB_DATA key, dbuf;
+	struct tdb_context *dst_tdb;
+	enum TDB_ERROR ecode;
+
+	if ((keyname == NULL) || (keylen == 0)) {
+		terror(TDB_SUCCESS, "need key");
+		return;
+	}
+
+	if ( !tdbname ) {
+		terror(TDB_SUCCESS, "need destination tdb name");
+		return;
+	}
+
+	key.dptr = (unsigned char *)keyname;
+	key.dsize = keylen;
+
+	ecode = tdb_fetch(tdb, key, &dbuf);
+	if (ecode) {
+		terror(ecode, "fetch failed");
+		return;
+	}
+
+	print_rec(tdb, key, dbuf, NULL);
+
+	dst_tdb = tdb_open(tdbname, 0, O_RDWR, 0600, NULL);
+	if ( !dst_tdb ) {
+		terror(TDB_SUCCESS, "unable to open destination tdb");
+		goto out;
+	}
+
+	ecode = tdb_store( dst_tdb, key, dbuf, TDB_REPLACE);
+	if (ecode)
+		terror(ecode, "failed to move record");
+	else
+		printf("record moved\n");
+
+	tdb_close( dst_tdb );
+out:
+	/* The fetched buffer is ours to release on every path. */
+	free(dbuf.dptr);
+}
+
+/*
+ * Traverse-style callback: print the key via print_asc() and the data as a
+ * hex dump via print_data(), each under a size header.  Always returns 0.
+ */
+static int print_rec(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	const char *kbytes = (const char *)key.dptr;
+	const char *dbytes = (const char *)dbuf.dptr;
+
+	printf("\nkey %d bytes\n", (int)key.dsize);
+	print_asc(kbytes, key.dsize);
+
+	printf("\ndata %d bytes\n", (int)dbuf.dsize);
+	print_data(dbytes, dbuf.dsize);
+
+	return 0;
+}
+
+/*
+ * Traverse callback: print the key size and the key itself (through
+ * print_asc()) on one line.  Always returns 0.
+ */
+static int print_key(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	const char *kbytes = (const char *)key.dptr;
+
+	printf("key %d bytes: ", (int)key.dsize);
+	print_asc(kbytes, key.dsize);
+	putchar('\n');
+	return 0;
+}
+
+/*
+ * Traverse callback: print the key size followed by a hex dump of the key
+ * (through print_data()) and a blank line.  Always returns 0.
+ */
+static int print_hexkey(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	const char *kbytes = (const char *)key.dptr;
+
+	printf("key %d bytes\n", (int)key.dsize);
+	print_data(kbytes, key.dsize);
+	putchar('\n');
+	return 0;
+}
+
+/* Running sum of the data sizes seen by traverse_fn(). */
+static int total_bytes;
+
+/* Traverse callback: add this record's data size to total_bytes; returns 0. */
+static int traverse_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+	total_bytes = total_bytes + dbuf.dsize;
+	return 0;
+}
+
+/*
+ * Print the summary text (with histograms) of the current database, or report
+ * through terror() when it cannot be produced.
+ */
+static void info_tdb(void)
+{
+	char *summary;
+	enum TDB_ERROR ecode = tdb_summary(tdb, TDB_SUMMARY_HISTOGRAMS, &summary);
+
+	if (ecode) {
+		terror(ecode, "Getting summary");
+		return;
+	}
+
+	printf("%s", summary);
+	free(summary);
+}
+
+/*
+ * Run four timed loops against the current database (store, fetch,
+ * store-inside-a-transaction, traverse) and print the operations per second
+ * of each.
+ *
+ * tlimit is the duration of each loop in seconds, as a decimal string; NULL
+ * or a value that parses to 0 selects 5 seconds.
+ */
+static void speed_tdb(const char *tlimit)
+{
+	unsigned timelimit = tlimit?atoi(tlimit):0;
+	double t;
+	int ops;
+	if (timelimit == 0) timelimit = 5;
+
+	ops = 0;
+	printf("Testing store speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		long int r = random();
+		TDB_DATA key, dbuf;
+		key = tdb_mkdata("store test", strlen("store test"));
+		dbuf.dptr = (unsigned char *)&r;
+		dbuf.dsize = sizeof(r);
+		tdb_store(tdb, key, dbuf, TDB_REPLACE);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing fetch speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		TDB_DATA key, dbuf;
+		key = tdb_mkdata("store test", strlen("store test"));
+		/* Each successful fetch returns a buffer we own; free it so
+		 * the loop does not leak one record per iteration. */
+		if (tdb_fetch(tdb, key, &dbuf) == TDB_SUCCESS)
+			free(dbuf.dptr);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing transaction speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		long int r = random();
+		TDB_DATA key, dbuf;
+		key = tdb_mkdata("transaction test", strlen("transaction test"));
+		dbuf.dptr = (unsigned char *)&r;
+		dbuf.dsize = sizeof(r);
+		tdb_transaction_start(tdb);
+		tdb_store(tdb, key, dbuf, TDB_REPLACE);
+		tdb_transaction_commit(tdb);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+
+	ops = 0;
+	printf("Testing traverse speed for %u seconds\n", timelimit);
+	_start_timer();
+	do {
+		tdb_traverse(tdb, traverse_fn, NULL);
+		t = _end_timer();
+		ops++;
+	} while (t < timelimit);
+	printf("%10.3f ops/sec\n", ops/t);
+}
+
+/* Flip disable_mmap and print the resulting state. */
+static void toggle_mmap(void)
+{
+	disable_mmap = !disable_mmap;
+	printf("%s", disable_mmap ? "mmap is disabled\n" : "mmap is enabled\n");
+}
+
+/*
+ * Print prompt, then read one line from stdin into a static buffer and strip
+ * its newline.  Returns the buffer, or NULL when nothing could be read or the
+ * text read contains no newline.
+ */
+static char *tdb_getline(const char *prompt)
+{
+	static char thisline[1024];
+	char *nl;
+
+	fputs(prompt, stdout);
+	thisline[0] = 0;
+
+	if (fgets(thisline, sizeof(thisline)-1, stdin) == NULL)
+		return NULL;
+
+	nl = strchr(thisline, '\n');
+	if (nl == NULL)
+		return NULL;
+
+	*nl = 0;
+	return thisline;
+}
+
+/* Traverse callback used by "erase": delete the visited key and hand back
+ * tdb_delete()'s result. */
+static int do_delete_fn(struct tdb_context *the_tdb, TDB_DATA key, TDB_DATA dbuf,
+			void *state)
+{
+	enum TDB_ERROR rc = tdb_delete(the_tdb, key);
+
+	return rc;
+}
+
+/*
+ * Position *pkey on the first key of the_tdb, fetch its record and print it.
+ * Any failure is reported through terror().  The fetched data buffer is freed
+ * here; *pkey is kept for the following next_record() call.
+ */
+static void first_record(struct tdb_context *the_tdb, TDB_DATA *pkey)
+{
+	TDB_DATA dbuf;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_firstkey(the_tdb, pkey);
+	if (!ecode)
+		ecode = tdb_fetch(the_tdb, *pkey, &dbuf);
+	if (ecode) {
+		terror(ecode, "fetch failed");
+		return;
+	}
+
+	print_rec(the_tdb, *pkey, dbuf, NULL);
+	/* The fetched copy is ours; release it once printed. */
+	free(dbuf.dptr);
+}
+
+/*
+ * Advance *pkey to the next key of the_tdb, fetch its record and print it.
+ * Any failure is reported through terror().  The fetched data buffer is freed
+ * here; *pkey is kept for the following call.
+ */
+static void next_record(struct tdb_context *the_tdb, TDB_DATA *pkey)
+{
+	TDB_DATA dbuf;
+	enum TDB_ERROR ecode;
+
+	ecode = tdb_nextkey(the_tdb, pkey);
+	if (!ecode)
+		ecode = tdb_fetch(the_tdb, *pkey, &dbuf);
+	if (ecode) {
+		terror(ecode, "fetch failed");
+		return;
+	}
+
+	print_rec(the_tdb, *pkey, dbuf, NULL);
+	/* The fetched copy is ours; release it once printed. */
+	free(dbuf.dptr);
+}
+
+/* Run tdb_check() on the_tdb and print the verdict; complains when the_tdb
+ * is NULL. */
+static void check_db(struct tdb_context *the_tdb)
+{
+	if (the_tdb == NULL) {
+		printf("Error: No database opened!\n");
+		return;
+	}
+
+	if (tdb_check(the_tdb, NULL, NULL) == 0)
+		printf("Database integrity is OK.\n");
+	else
+		printf("Integrity check for the opened database failed.\n");
+}
+
+/*
+ * Execute the command described by the globals cmdname, arg1/arg1len and
+ * arg2/arg2len.
+ *
+ * An empty cmdname means "next"; a word matching no table entry means "help".
+ * Every command other than create, open, "!" and quit needs an open database.
+ *
+ * Returns 1 when the caller should stop (quit), 0 otherwise.
+ */
+static int do_command(void)
+{
+ COMMAND_TABLE *ctp = cmd_table;
+ enum commands mycmd = CMD_HELP;
+ int cmd_len;
+
+ if (cmdname && strlen(cmdname) == 0) {
+ mycmd = CMD_NEXT;
+ } else {
+ /* Only the first strlen(name) characters of cmdname are compared, so
+ cmdname may carry trailing text and the first table entry that is
+ a prefix of it wins. */
+ while (ctp->name) {
+ cmd_len = strlen(ctp->name);
+ if (strncmp(ctp->name,cmdname,cmd_len) == 0) {
+ mycmd = ctp->cmd;
+ break;
+ }
+ ctp++;
+ }
+ }
+
+ switch (mycmd) {
+ case CMD_CREATE_TDB:
+ bIterate = 0;
+ create_tdb(arg1);
+ return 0;
+ case CMD_OPEN_TDB:
+ bIterate = 0;
+ open_tdb(arg1);
+ return 0;
+ case CMD_SYSTEM:
+ /* Shell command */
+ if (system(arg1) == -1) {
+ terror(TDB_SUCCESS, "system() call failed\n");
+ }
+ return 0;
+ case CMD_QUIT:
+ return 1;
+ default:
+ /* all the rest require a open database */
+ if (!tdb) {
+ bIterate = 0;
+ terror(TDB_SUCCESS, "database not open");
+ help();
+ return 0;
+ }
+ /* Second dispatch: commands that operate on the open database. */
+ switch (mycmd) {
+ case CMD_TRANSACTION_START:
+ bIterate = 0;
+ tdb_transaction_start(tdb);
+ return 0;
+ case CMD_TRANSACTION_COMMIT:
+ bIterate = 0;
+ tdb_transaction_commit(tdb);
+ return 0;
+ case CMD_TRANSACTION_CANCEL:
+ bIterate = 0;
+ tdb_transaction_cancel(tdb);
+ return 0;
+ case CMD_ERASE:
+ bIterate = 0;
+ tdb_traverse(tdb, do_delete_fn, NULL);
+ return 0;
+ case CMD_DUMP:
+ bIterate = 0;
+ tdb_traverse(tdb, print_rec, NULL);
+ return 0;
+ case CMD_INSERT:
+ bIterate = 0;
+ insert_tdb(arg1, arg1len,arg2,arg2len);
+ return 0;
+ case CMD_MOVE:
+ bIterate = 0;
+ move_rec(arg1,arg1len,arg2);
+ return 0;
+ case CMD_STORE:
+ bIterate = 0;
+ store_tdb(arg1,arg1len,arg2,arg2len);
+ return 0;
+ case CMD_SHOW:
+ bIterate = 0;
+ show_tdb(arg1, arg1len);
+ return 0;
+ case CMD_KEYS:
+ tdb_traverse(tdb, print_key, NULL);
+ return 0;
+ case CMD_HEXKEYS:
+ tdb_traverse(tdb, print_hexkey, NULL);
+ return 0;
+ case CMD_DELETE:
+ bIterate = 0;
+ delete_tdb(arg1,arg1len);
+ return 0;
+#if 0
+ case CMD_LIST_HASH_FREE:
+ tdb_dump_all(tdb);
+ return 0;
+ case CMD_LIST_FREE:
+ tdb_printfreelist(tdb);
+ return 0;
+#endif
+ case CMD_INFO:
+ info_tdb();
+ return 0;
+ case CMD_SPEED:
+ speed_tdb(arg1);
+ return 0;
+ case CMD_MMAP:
+ toggle_mmap();
+ return 0;
+ case CMD_FIRST:
+ bIterate = 1;
+ first_record(tdb, &iterate_kbuf);
+ return 0;
+ case CMD_NEXT:
+ /* Only meaningful after "first" has started an iteration. */
+ if (bIterate)
+ next_record(tdb, &iterate_kbuf);
+ return 0;
+ case CMD_CHECK:
+ check_db(tdb);
+ return 0;
+ case CMD_HELP:
+ help();
+ return 0;
+ case CMD_CREATE_TDB:
+ case CMD_OPEN_TDB:
+ case CMD_SYSTEM:
+ case CMD_QUIT:
+ /*
+ * unhandled commands. cases included here to avoid compiler
+ * warnings.
+ */
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Decode backslash escapes in instring, in place.
+ *
+ * "\X" and "\XX" (one or two hex digits) become the byte with that value; a
+ * backslash followed by any other character yields that character; a lone
+ * backslash at the very end of the string is kept literally.
+ *
+ * The decoded length is stored in *sizep (the result may contain embedded
+ * NUL bytes) and the buffer is also NUL-terminated after the decoded bytes,
+ * so callers that treat it as a C string do not see leftover input.
+ *
+ * Returns instring.
+ */
+static char *convert_string(char *instring, size_t *sizep)
+{
+	static const char hexdigits[] = "0123456789abcdefABCDEF";
+	size_t length = 0;
+	char *outp, *inp;
+	char temp[3];
+
+	outp = inp = instring;
+
+	while (*inp) {
+		if (*inp == '\\' && inp[1] != '\0') {
+			inp++;
+			if (strchr(hexdigits,(int)*inp)) {
+				temp[0] = *inp++;
+				temp[1] = '\0';
+				if (*inp && strchr(hexdigits,(int)*inp)) {
+					temp[1] = *inp++;
+					temp[2] = '\0';
+				}
+				*outp++ = (char)strtol((const char *)temp,NULL,16);
+			} else {
+				*outp++ = *inp++;
+			}
+		} else {
+			/*
+			 * Ordinary character, or a trailing backslash: copying
+			 * the terminator and stepping past it would run off
+			 * the end of the buffer.
+			 */
+			*outp++ = *inp++;
+		}
+		length++;
+	}
+	/* outp never passes inp, so this write stays inside the buffer. */
+	*outp = '\0';
+	*sizep = length;
+	return instring;
+}
+
+/*
+ * tdbtool entry point.
+ *
+ * When a first argument is given it is opened as the database.  With no
+ * further arguments the tool then reads commands interactively; otherwise
+ * argv[2] is the command and argv[3]/argv[4] its optional arguments, and that
+ * single command is executed.  Always returns 0.
+ */
+int main(int argc, char *argv[])
+{
+ cmdname = "";
+ arg1 = NULL;
+ arg1len = 0;
+ arg2 = NULL;
+ arg2len = 0;
+
+ if (argv[1]) {
+ cmdname = "open";
+ arg1 = argv[1];
+ do_command();
+ cmdname = "";
+ arg1 = NULL;
+ }
+
+ switch (argc) {
+ case 1:
+ case 2:
+ /* Interactive mode */
+ while ((cmdname = tdb_getline("tdb> "))) {
+ arg2 = arg1 = NULL;
+ /* arg1 starts after the first space; cmdname itself is not
+ cut there, do_command() matches it by prefix. */
+ if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) {
+ arg1++;
+ arg2 = arg1;
+ /* Cut arg1 at the next space that is not preceded by a
+ backslash; arg2 is whatever follows. */
+ while (*arg2) {
+ if (*arg2 == ' ') {
+ *arg2++ = '\0';
+ break;
+ }
+ if ((*arg2++ == '\\') && (*arg2 == ' ')) {
+ arg2++;
+ }
+ }
+ }
+ if (arg1) arg1 = convert_string(arg1,&arg1len);
+ if (arg2) arg2 = convert_string(arg2,&arg2len);
+ if (do_command()) break;
+ }
+ break;
+ case 5:
+ arg2 = convert_string(argv[4],&arg2len);
+ /* fall through */
+ case 4:
+ arg1 = convert_string(argv[3],&arg1len);
+ /* fall through */
+ case 3:
+ cmdname = argv[2];
+ /* fall through */
+ default:
+ do_command();
+ break;
+ }
+
+ if (tdb) tdb_close(tdb);
+
+ return 0;
+}
diff --git a/lib/tdb2/tools/tdb2torture.c b/lib/tdb2/tools/tdb2torture.c
new file mode 100644
index 00000000000..f6a7a5064a3
--- /dev/null
+++ b/lib/tdb2/tools/tdb2torture.c
@@ -0,0 +1,494 @@
+/* this tests tdb by doing lots of ops from several simultaneous
+ writers - that stresses the locking code.
+*/
+
+#include "tdb2.h"
+#include <stdlib.h>
+#include <err.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/wait.h>
+
+//#define REOPEN_PROB 30
+#define DELETE_PROB 8
+#define STORE_PROB 4
+#define APPEND_PROB 6
+#define TRANSACTION_PROB 10
+#define TRANSACTION_PREPARE_PROB 2
+#define LOCKSTORE_PROB 5
+#define TRAVERSE_PROB 20
+#define TRAVERSE_MOD_PROB 100
+#define TRAVERSE_ABORT_PROB 500
+#define CULL_PROB 100
+#define KEYLEN 3
+#define DATALEN 100
+
+static struct tdb_context *db;
+static int in_transaction;
+static int in_traverse;
+static int error_count;
+#if TRANSACTION_PROB
+static int always_transaction = 0;
+#endif
+static int loopnum;
+static int count_pipe;
+static union tdb_attribute log_attr;
+static union tdb_attribute seed_attr;
+
+/*
+ * Log callback: write the message to stdout and flush it straight away.
+ * The #if 0 block is a disabled debugging aid that would start gdb in an
+ * xterm on the current process.
+ */
+static void tdb_log(struct tdb_context *tdb, enum tdb_log_level level,
+ const char *message, void *data)
+{
+ fputs(message, stdout);
+ fflush(stdout);
+#if 0
+ {
+ char str[200];
+ signal(SIGUSR1, SIG_IGN);
+ sprintf(str,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid());
+ system(str);
+ }
+#endif
+}
+
+#include "../private.h"
+
+/*
+ * SA_SIGINFO handler installed for signal 11 by run_child(): report the
+ * faulting address together with the database mapping on fd 2, sleep for 60
+ * seconds if the report was written, then terminate with status 11.
+ */
+static void segv_handler(int sig, siginfo_t *info, void *p)
+{
+	char string[100];
+
+	/* Bounded formatting: the worst-case text does not safely fit a
+	 * 100-byte buffer, so never let it overrun. */
+	snprintf(string, sizeof(string),
+		 "%u: death at %p (map_ptr %p, map_size %zu)\n",
+		 (unsigned)getpid(), info->si_addr, db->file->map_ptr,
+		 (size_t)db->file->map_size);
+	if (write(2, string, strlen(string)) > 0)
+		sleep(60);
+	_exit(11);
+}
+
+/*
+ * Report a failure on stderr as "pid:why:detail" and bump error_count.
+ * detail is the tdb's current error string, or "(no tdb)" when tdb is NULL.
+ * Despite the name this does not terminate the process.
+ */
+static void fatal(struct tdb_context *tdb, const char *why)
+{
+	const char *detail = "(no tdb)";
+
+	if (tdb)
+		detail = tdb_errorstr(tdb_error(tdb));
+
+	fprintf(stderr, "%u:%s:%s\n", getpid(), why, detail);
+	error_count++;
+}
+
+/*
+ * Return a freshly malloc'd, NUL-terminated string of len random lower-case
+ * letters drawn with rand().  The caller frees it.
+ * Running out of memory is reported and terminates the process with status 1,
+ * so the result is never NULL.
+ */
+static char *randbuf(int len)
+{
+	char *buf;
+	int i;
+
+	buf = (char *)malloc(len+1);
+	if (buf == NULL) {
+		/* Callers use the buffer unconditionally. */
+		perror("randbuf: malloc");
+		exit(1);
+	}
+
+	for (i=0;i<len;i++) {
+		buf[i] = 'a' + (rand() % 26);
+	}
+	buf[i] = 0;
+	return buf;
+}
+
+static void addrec_db(void);
+/*
+ * Traverse callback that disturbs the database while it is being walked:
+ * with the configured probabilities it deletes the visited key, performs a
+ * random addrec_db() operation, or returns 1.  Returns 0 otherwise.
+ */
+static int modify_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+			   void *state)
+{
+#if CULL_PROB
+	if (!(random() % CULL_PROB))
+		tdb_delete(tdb, key);
+#endif
+
+#if TRAVERSE_MOD_PROB
+	if (!(random() % TRAVERSE_MOD_PROB))
+		addrec_db();
+#endif
+
+#if TRAVERSE_ABORT_PROB
+	if (!(random() % TRAVERSE_ABORT_PROB)) {
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Perform one randomly chosen operation on the global db with a random key
+ * (1..KEYLEN letters) and random data (1..DATALEN letters); both sizes passed
+ * to tdb include the terminating NUL.
+ *
+ * The candidate operations are tried in a fixed order, each guarded by its
+ * *_PROB setting; the first one selected runs and the function jumps to the
+ * cleanup at "next".  When none is selected a plain fetch is done.
+ */
+static void addrec_db(void)
+{
+ int klen, dlen;
+ char *k, *d;
+ TDB_DATA key, data;
+
+ klen = 1 + (rand() % KEYLEN);
+ dlen = 1 + (rand() % DATALEN);
+
+ k = randbuf(klen);
+ d = randbuf(dlen);
+
+ key.dptr = (unsigned char *)k;
+ key.dsize = klen+1;
+
+ data.dptr = (unsigned char *)d;
+ data.dsize = dlen+1;
+
+/* REOPEN_PROB is not defined in this file (its #define is commented out),
+ so this branch is compiled out. */
+#if REOPEN_PROB
+ if (in_traverse == 0 && in_transaction == 0 && random() % REOPEN_PROB == 0) {
+ tdb_reopen_all(0);
+ goto next;
+ }
+#endif
+
+#if TRANSACTION_PROB
+ /* Outside a transaction: maybe start one (always, with -t). */
+ if (in_traverse == 0 && in_transaction == 0 && (always_transaction || random() % TRANSACTION_PROB == 0)) {
+ if (tdb_transaction_start(db) != 0) {
+ fatal(db, "tdb_transaction_start failed");
+ }
+ in_transaction++;
+ goto next;
+ }
+ /* Inside a transaction: maybe commit it, sometimes via prepare_commit. */
+ if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
+ if (random() % TRANSACTION_PREPARE_PROB == 0) {
+ if (tdb_transaction_prepare_commit(db) != 0) {
+ fatal(db, "tdb_transaction_prepare_commit failed");
+ }
+ }
+ if (tdb_transaction_commit(db) != 0) {
+ fatal(db, "tdb_transaction_commit failed");
+ }
+ in_transaction--;
+ goto next;
+ }
+
+ /* Inside a transaction: maybe cancel it instead. */
+ if (in_traverse == 0 && in_transaction && random() % TRANSACTION_PROB == 0) {
+ tdb_transaction_cancel(db);
+ in_transaction--;
+ goto next;
+ }
+#endif
+
+#if DELETE_PROB
+ if (random() % DELETE_PROB == 0) {
+ tdb_delete(db, key);
+ goto next;
+ }
+#endif
+
+#if STORE_PROB
+ if (random() % STORE_PROB == 0) {
+ if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+ fatal(db, "tdb_store failed");
+ }
+ goto next;
+ }
+#endif
+
+#if APPEND_PROB
+ if (random() % APPEND_PROB == 0) {
+ if (tdb_append(db, key, data) != 0) {
+ fatal(db, "tdb_append failed");
+ }
+ goto next;
+ }
+#endif
+
+#if LOCKSTORE_PROB
+ if (random() % LOCKSTORE_PROB == 0) {
+ tdb_chainlock(db, key);
+ /* From here on `data` is the fetched record (or empty), no longer
+ the random buffer d; d itself is still freed at "next". */
+ if (tdb_fetch(db, key, &data) != TDB_SUCCESS) {
+ data.dsize = 0;
+ data.dptr = NULL;
+ }
+ if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+ fatal(db, "tdb_store failed");
+ }
+ if (data.dptr) free(data.dptr);
+ tdb_chainunlock(db, key);
+ goto next;
+ }
+#endif
+
+#if TRAVERSE_PROB
+ /* FIXME: recursive traverses break transactions? */
+ if (in_traverse == 0 && random() % TRAVERSE_PROB == 0) {
+ in_traverse++;
+ tdb_traverse(db, modify_traverse, NULL);
+ in_traverse--;
+ goto next;
+ }
+#endif
+
+ /* Nothing else was selected: plain fetch, discarding the result. */
+ if (tdb_fetch(db, key, &data) == TDB_SUCCESS)
+ free(data.dptr);
+
+next:
+ free(k);
+ free(d);
+}
+
+/* Traverse callback used for the final clean-out: delete the visited key
+ * (the result is ignored) and return 0. */
+static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+		       void *state)
+{
+	(void)tdb_delete(tdb, key);
+
+	return 0;
+}
+
+/* Print the option synopsis to stdout and exit with status 0.  The -t option
+ * is only advertised when TRANSACTION_PROB is non-zero. */
+static void usage(void)
+{
+	fputs("Usage: tdbtorture", stdout);
+#if TRANSACTION_PROB
+	fputs(" [-t]", stdout);
+#endif
+	fputs(" [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-S]\n", stdout);
+	exit(0);
+}
+
+/*
+ * SIGUSR1 handler installed by run_child(): write the current loop number to
+ * count_pipe, then raise SIGUSR2 on ourselves.  Exits with status 2 when the
+ * write is short or fails.
+ */
+static void send_count_and_suicide(int sig)
+{
+	ssize_t written;
+
+	/* This ensures our successor can continue where we left off. */
+	written = write(count_pipe, &loopnum, sizeof(loopnum));
+	if (written != sizeof(loopnum)) {
+		exit(2);
+	}
+
+	/* This gives a unique signature. */
+	kill(getpid(), SIGUSR2);
+}
+
+/*
+ * Body of one torture worker.
+ *
+ * Opens torture.tdb with tdb_flags, seeds both random generators with
+ * seed + i, runs addrec_db() for loop numbers start..num_loops-1 (stopping at
+ * the first recorded error) and, when no error occurred, traverses the
+ * database and deletes every record in two passes (inside one transaction
+ * when -t was given).
+ *
+ * Returns the number of errors recorded, capped at 100.
+ */
+static int run_child(int i, int seed, unsigned num_loops, unsigned start,
+		     int tdb_flags)
+{
+	struct sigaction act = { .sa_sigaction = segv_handler,
+				 .sa_flags = SA_SIGINFO };
+	sigaction(11, &act, NULL);
+
+	db = tdb_open("torture.tdb", tdb_flags, O_RDWR | O_CREAT, 0600,
+		      &log_attr);
+	if (!db) {
+		fatal(NULL, "db open failed");
+		/* Without a handle there is nothing to exercise or close. */
+		return (error_count < 100 ? error_count : 100);
+	}
+
+#if 0
+	if (i == 0) {
+		printf("pid %i\n", getpid());
+		sleep(9);
+	} else
+		sleep(10);
+#endif
+
+	srand(seed + i);
+	srandom(seed + i);
+
+	/* Set global, then we're ready to handle being killed. */
+	loopnum = start;
+	signal(SIGUSR1, send_count_and_suicide);
+
+	for (;loopnum<num_loops && error_count == 0;loopnum++) {
+		addrec_db();
+	}
+
+	if (error_count == 0) {
+		tdb_traverse(db, NULL, NULL);
+#if TRANSACTION_PROB
+		if (always_transaction) {
+			while (in_transaction) {
+				tdb_transaction_cancel(db);
+				in_transaction--;
+			}
+			if (tdb_transaction_start(db) != 0)
+				fatal(db, "tdb_transaction_start failed");
+		}
+#endif
+		tdb_traverse(db, traverse_fn, NULL);
+		tdb_traverse(db, traverse_fn, NULL);
+
+#if TRANSACTION_PROB
+		if (always_transaction) {
+			if (tdb_transaction_commit(db) != 0)
+				fatal(db, "tdb_transaction_commit failed");
+		}
+#endif
+	}
+
+	tdb_close(db);
+
+	return (error_count < 100 ? error_count : 100);
+}
+
+/*
+ * tdbtorture entry point.
+ *
+ * Options: -n NUM_PROCS, -l NUM_LOOPS, -s SEED, -S (TDB_NOSYNC), -t (run
+ * everything inside transactions), -k (keep sending SIGUSR1 to random
+ * children, which then get restarted where they left off).
+ *
+ * Forks NUM_PROCS workers running run_child() (or runs a single worker
+ * in-process when NUM_PROCS is 1 and -k is not given), waits for them, and
+ * finally re-opens and checks the database when no error was recorded.
+ *
+ * Returns the number of errors recorded.
+ */
+int main(int argc, char * const *argv)
+{
+	int i, seed = -1;
+	int num_loops = 5000;
+	int num_procs = 3;
+	int c, pfds[2];
+	extern char *optarg;
+	pid_t *pids;
+	int kill_random = 0;
+	int *done;
+	int tdb_flags = TDB_DEFAULT;
+
+	log_attr.base.attr = TDB_ATTRIBUTE_LOG;
+	log_attr.base.next = &seed_attr;
+	log_attr.log.fn = tdb_log;
+	seed_attr.base.attr = TDB_ATTRIBUTE_SEED;
+
+	while ((c = getopt(argc, argv, "n:l:s:thkS")) != -1) {
+		switch (c) {
+		case 'n':
+			num_procs = strtol(optarg, NULL, 0);
+			break;
+		case 'l':
+			num_loops = strtol(optarg, NULL, 0);
+			break;
+		case 's':
+			seed = strtol(optarg, NULL, 0);
+			break;
+		case 'S':
+			tdb_flags = TDB_NOSYNC;
+			break;
+		case 't':
+#if TRANSACTION_PROB
+			always_transaction = 1;
+#else
+			fprintf(stderr, "Transactions not supported\n");
+			usage();
+#endif
+			break;
+		case 'k':
+			kill_random = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	unlink("torture.tdb");
+
+	if (seed == -1) {
+		seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
+	}
+	seed_attr.seed.seed = (((uint64_t)seed) << 32) | seed;
+
+	if (num_procs == 1 && !kill_random) {
+		/* Don't fork for this case, makes debugging easier. */
+		error_count = run_child(0, seed, num_loops, 0, tdb_flags);
+		goto done;
+	}
+
+	pids = (pid_t *)calloc(sizeof(pid_t), num_procs);
+	done = (int *)calloc(sizeof(int), num_procs);
+	if (num_procs > 0 && (pids == NULL || done == NULL)) {
+		perror("Allocating child tables");
+		exit(1);
+	}
+
+	if (pipe(pfds) != 0) {
+		perror("Creating pipe");
+		exit(1);
+	}
+	count_pipe = pfds[1];
+
+	for (i=0;i<num_procs;i++) {
+		pids[i] = fork();
+		if (pids[i] == -1) {
+			/*
+			 * Never keep -1 in pids[]: kill(-1, sig) would signal
+			 * every process we are allowed to signal.  Stop the
+			 * children started so far and give up.
+			 */
+			perror("fork");
+			while (i-- > 0)
+				kill(pids[i], SIGTERM);
+			exit(1);
+		}
+		if (pids[i] == 0) {
+			close(pfds[0]);
+			if (i == 0) {
+				printf("testing with %d processes, %d loops, seed=%d%s\n",
+				       num_procs, num_loops, seed,
+#if TRANSACTION_PROB
+				       always_transaction ? " (all within transactions)" : ""
+#else
+				       ""
+#endif
+					);
+			}
+			exit(run_child(i, seed, num_loops, 0, tdb_flags));
+		}
+	}
+
+	while (num_procs) {
+		int status, j;
+		pid_t pid;
+
+		if (error_count != 0) {
+			/* try and stop the test on any failure */
+			for (j=0;j<num_procs;j++) {
+				if (pids[j] > 0) {
+					kill(pids[j], SIGTERM);
+				}
+			}
+		}
+
+		pid = waitpid(-1, &status, kill_random ? WNOHANG : 0);
+		if (pid == 0) {
+			struct timespec ts;
+
+			/* Sleep for 1/10 second. */
+			ts.tv_sec = 0;
+			ts.tv_nsec = 100000000;
+			nanosleep(&ts, NULL);
+
+			/* Kill someone. */
+			kill(pids[random() % num_procs], SIGUSR1);
+			continue;
+		}
+
+		if (pid == -1) {
+			perror("failed to wait for child\n");
+			exit(1);
+		}
+
+		for (j=0;j<num_procs;j++) {
+			if (pids[j] == pid) break;
+		}
+		if (j == num_procs) {
+			printf("unknown child %d exited!?\n", (int)pid);
+			exit(1);
+		}
+		if (WIFSIGNALED(status)) {
+			if (WTERMSIG(status) == SIGUSR2
+			    || WTERMSIG(status) == SIGUSR1) {
+				/* SIGUSR2 means they wrote to pipe. */
+				if (WTERMSIG(status) == SIGUSR2) {
+					if (read(pfds[0], &done[j],
+						 sizeof(done[j]))
+					    != sizeof(done[j]))
+						err(1,
+						    "Short read from child?");
+				}
+				pids[j] = fork();
+				if (pids[j] == 0)
+					exit(run_child(j, seed, num_loops,
+						       done[j], tdb_flags));
+				if (pids[j] > 0) {
+					printf("Restarting child %i for %u-%u\n",
+					       j, done[j], num_loops);
+					continue;
+				}
+				/* Restart failed: count it and drop slot j below. */
+				perror("fork");
+				error_count++;
+			} else {
+				printf("child %d exited with signal %d\n",
+				       (int)pid, WTERMSIG(status));
+				error_count++;
+			}
+		} else {
+			if (WEXITSTATUS(status) != 0) {
+				printf("child %d exited with status %d\n",
+				       (int)pid, WEXITSTATUS(status));
+				error_count++;
+			}
+		}
+		/* Remove slot j from both tables so they stay index-aligned. */
+		memmove(&pids[j], &pids[j+1],
+			(num_procs - j - 1)*sizeof(pids[0]));
+		memmove(&done[j], &done[j+1],
+			(num_procs - j - 1)*sizeof(done[0]));
+		num_procs--;
+	}
+
+	free(pids);
+	free(done);
+
+done:
+	if (error_count == 0) {
+		db = tdb_open("torture.tdb", TDB_DEFAULT, O_RDWR | O_CREAT,
+			      0600, &log_attr);
+		if (!db) {
+			fatal(db, "db open failed");
+			exit(1);
+		}
+		if (tdb_check(db, NULL, NULL) != 0) {
+			fatal(db, "db check failed");
+			exit(1);
+		}
+		tdb_close(db);
+		printf("OK\n");
+	}
+
+	return error_count;
+}
diff --git a/lib/tdb2/transaction.c b/lib/tdb2/transaction.c
new file mode 100644
index 00000000000..b13223bc2e1
--- /dev/null
+++ b/lib/tdb2/transaction.c
@@ -0,0 +1,1308 @@
+ /*
+ Unix SMB/CIFS implementation.
+
+ trivial database library
+
+ Copyright (C) Andrew Tridgell 2005
+ Copyright (C) Rusty Russell 2010
+
+ ** NOTE! The following LGPL license applies to the tdb
+ ** library. This does NOT imply that all of Samba is released
+ ** under the LGPL
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "private.h"
+#define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0) /* free x and reset it to NULL; x is evaluated more than once */
+
+/*
+ transaction design:
+
+ - only allow a single transaction at a time per database. This makes
+ using the transaction API simpler, as otherwise the caller would
+ have to cope with temporary failures in transactions that conflict
+ with other current transactions
+
+ - keep the transaction recovery information in the same file as the
+ database, using a special 'transaction recovery' record pointed at
+ by the header. This removes the need for extra journal files as
+ used by some other databases
+
+ - dynamically allocate the transaction recovery record, re-using it
+ for subsequent transactions. If a larger record is needed then
+ tdb_free() the old record to place it on the normal tdb freelist
+ before allocating the new record
+
+ - during transactions, keep a linked list of all writes that have
+ been performed by intercepting all tdb_write() calls. The hooked
+ transaction versions of tdb_read() and tdb_write() check this
+ linked list and try to use the elements of the list in preference
+ to the real database.
+
+ - don't allow any locks to be held when a transaction starts,
+ otherwise we can end up with deadlock (plus lack of lock nesting
+ in POSIX locks would mean the lock is lost)
+
+ - if the caller gains a lock during the transaction but doesn't
+ release it then fail the commit
+
+ - allow for nested calls to tdb_transaction_start(), re-using the
+ existing transaction record. If the inner transaction is canceled
+ then a subsequent commit will fail
+
+ - keep a mirrored copy of the tdb hash chain heads to allow for the
+ fast hash heads scan on traverse, updating the mirrored copy in
+ the transaction version of tdb_write
+
+ - allow callers to mix transaction and non-transaction use of tdb,
+ although once a transaction is started then an exclusive lock is
+ gained until the transaction is committed or canceled
+
+ - the commit strategy involves first saving away all modified data
+ into a linearised buffer in the transaction recovery area, then
+ marking the transaction recovery area with a magic value to
+ indicate a valid recovery record. In total 4 fsync/msync calls are
+ needed per commit to prevent race conditions. It might be possible
+ to reduce this to 3 or even 2 with some more work.
+
+ - check for a valid recovery record on open of the tdb, while the
+ open lock is held. Automatically recover from the transaction
+ recovery area if needed, then continue with the open as
+ usual. This allows for smooth crash recovery with no administrator
+ intervention.
+
+ - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
+ still available, but no transaction recovery area is used and no
+ fsync/msync calls are made.
+*/
+
+/*
+ hold the context of any current transaction
+ (allocated zeroed by tdb_transaction_start(), freed by
+ _tdb_transaction_cancel())
+*/
+struct tdb_transaction {
+ /* the original io methods - used to do IOs to the real db */
+ const struct tdb_methods *io_methods;
+
+ /* the list of transaction blocks. When a block is first
+ written to, it gets created in this list. The array is
+ indexed by file offset / PAGESIZE; a NULL entry is a block
+ the transaction has not written to */
+ uint8_t **blocks;
+ size_t num_blocks;
+ size_t last_block_size; /* number of valid bytes in the last block */
+
+ /* non-zero when an internal transaction error has
+ occurred. All write operations will then fail until the
+ transaction is ended */
+ int transaction_error;
+
+ /* when inside a transaction we need to keep track of any
+ nested tdb_transaction_start() calls, as these are allowed,
+ but don't create a new transaction */
+ unsigned int nesting;
+
+ /* set when a prepare has already occurred */
+ bool prepared;
+ tdb_off_t magic_offset; /* file offset of the recovery record's magic field; 0 until transaction_setup_recovery() has set it */
+
+ /* old file size before transaction (also rewritten by
+ create_recovery_area() and tdb_transaction_commit() once
+ they have changed the real file size) */
+ tdb_len_t old_map_size;
+};
+
+/* This doesn't really need to be pagesize, but we use it for similar reasons.
+ It is the size in bytes of each entry in a transaction's blocks[] array. */
+#define PAGESIZE 65536
+
+/*
+  Transaction-aware read.  Data comes from the transaction's in-memory
+  copy of a block when that block has been written during the
+  transaction; otherwise the read is passed to the original io methods.
+  Any failure marks the transaction as failed and is logged.
+*/
+static enum TDB_ERROR transaction_read(struct tdb_context *tdb, tdb_off_t off,
+				       void *buf, tdb_len_t len)
+{
+	struct tdb_transaction *trans = tdb->transaction;
+	enum TDB_ERROR ecode;
+	uint8_t *block;
+	size_t blk;
+
+	/* a request that crosses a block boundary is served one block
+	   at a time */
+	for (;;) {
+		tdb_off_t in_block = off % PAGESIZE;
+		tdb_len_t chunk;
+
+		if (len + in_block <= PAGESIZE) {
+			break;
+		}
+		chunk = PAGESIZE - in_block;
+		ecode = transaction_read(tdb, off, buf, chunk);
+		if (ecode != TDB_SUCCESS) {
+			return ecode;
+		}
+		buf = (void *)((char *)buf + chunk);
+		off += chunk;
+		len -= chunk;
+	}
+
+	if (len == 0) {
+		return TDB_SUCCESS;
+	}
+
+	blk = off / PAGESIZE;
+	block = NULL;
+	if (blk < trans->num_blocks) {
+		block = trans->blocks[blk];
+	}
+
+	if (block == NULL) {
+		/* not touched by the transaction: read the real database */
+		ecode = trans->io_methods->tread(tdb, off, buf, len);
+		if (ecode == TDB_SUCCESS) {
+			return TDB_SUCCESS;
+		}
+		goto fail;
+	}
+
+	/* the last block may hold fewer than PAGESIZE valid bytes */
+	if (blk == trans->num_blocks - 1 && len > trans->last_block_size) {
+		ecode = TDB_ERR_IO;
+		goto fail;
+	}
+
+	memcpy(buf, block + (off % PAGESIZE), len);
+	return TDB_SUCCESS;
+
+fail:
+	trans->transaction_error = 1;
+	return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+			  "transaction_read: failed at off=%zu len=%zu",
+			  (size_t)off, (size_t)len);
+}
+
+
+/*
+  write while in a transaction
+
+  Nothing is written to the file here: the data is stored in PAGESIZE
+  sized block copies owned by the transaction.  The first write to a
+  block allocates it and fills it with the block's current file contents
+  (for the part that lies below old_map_size).  A NULL buf means "write
+  zeroes".
+
+  Returns TDB_SUCCESS or the error code of the failing step; on failure
+  the transaction is marked as failed (transaction_error).
+*/
+static enum TDB_ERROR transaction_write(struct tdb_context *tdb, tdb_off_t off,
+					const void *buf, tdb_len_t len)
+{
+	size_t blk;
+	enum TDB_ERROR ecode;
+
+	/* Only a commit is allowed on a prepared transaction */
+	if (tdb->transaction->prepared) {
+		ecode = tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
+				   "transaction_write: transaction already"
+				   " prepared, write not allowed");
+		goto fail;
+	}
+
+	/* break it up into block sized chunks */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		tdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
+		ecode = transaction_write(tdb, off, buf, len2);
+		if (ecode != TDB_SUCCESS) {
+			/* propagate the real error code (the recursive
+			   call has already set transaction_error) */
+			return ecode;
+		}
+		len -= len2;
+		off += len2;
+		if (buf != NULL) {
+			buf = (const void *)(len2 + (const char *)buf);
+		}
+	}
+
+	if (len == 0) {
+		return TDB_SUCCESS;
+	}
+
+	blk = off / PAGESIZE;
+	off = off % PAGESIZE;
+
+	if (tdb->transaction->num_blocks <= blk) {
+		uint8_t **new_blocks;
+
+		/* expand the blocks array; realloc() of a NULL pointer
+		   behaves like malloc() */
+		new_blocks = realloc(tdb->transaction->blocks,
+				     (blk+1)*sizeof(uint8_t *));
+		if (new_blocks == NULL) {
+			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					   "transaction_write:"
+					   " failed to allocate");
+			goto fail;
+		}
+		/* the new slots start out as "not copied" */
+		memset(&new_blocks[tdb->transaction->num_blocks], 0,
+		       (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
+		tdb->transaction->blocks = new_blocks;
+		tdb->transaction->num_blocks = blk+1;
+		tdb->transaction->last_block_size = 0;
+	}
+
+	/* allocate and fill a block? */
+	if (tdb->transaction->blocks[blk] == NULL) {
+		tdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1);
+		if (tdb->transaction->blocks[blk] == NULL) {
+			ecode = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+					   "transaction_write:"
+					   " failed to allocate");
+			goto fail;
+		}
+		if (tdb->transaction->old_map_size > blk * PAGESIZE) {
+			/* part of this block existed before the
+			   transaction: preload the old contents */
+			tdb_len_t len2 = PAGESIZE;
+			if (len2 + (blk * PAGESIZE) > tdb->transaction->old_map_size) {
+				len2 = tdb->transaction->old_map_size - (blk * PAGESIZE);
+			}
+			ecode = tdb->transaction->io_methods->tread(tdb,
+					blk * PAGESIZE,
+					tdb->transaction->blocks[blk],
+					len2);
+			if (ecode != TDB_SUCCESS) {
+				ecode = tdb_logerr(tdb, ecode,
+						   TDB_LOG_ERROR,
+						   "transaction_write:"
+						   " failed to"
+						   " read old block: %s",
+						   strerror(errno));
+				SAFE_FREE(tdb->transaction->blocks[blk]);
+				goto fail;
+			}
+			if (blk == tdb->transaction->num_blocks-1) {
+				tdb->transaction->last_block_size = len2;
+			}
+		}
+	}
+
+	/* overwrite part of an existing block */
+	if (buf == NULL) {
+		memset(tdb->transaction->blocks[blk] + off, 0, len);
+	} else {
+		memcpy(tdb->transaction->blocks[blk] + off, buf, len);
+	}
+	if (blk == tdb->transaction->num_blocks-1) {
+		if (len + off > tdb->transaction->last_block_size) {
+			tdb->transaction->last_block_size = len + off;
+		}
+	}
+
+	return TDB_SUCCESS;
+
+fail:
+	tdb->transaction->transaction_error = 1;
+	return ecode;
+}
+
+
+/*
+  Update the transaction's block copies with data that has already been
+  written to the real file.  Unlike transaction_write() this never
+  allocates or extends anything: blocks the transaction has not copied,
+  and bytes beyond the valid part of the last block, are left alone, so
+  the recovery size cannot change.
+*/
+static void transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
+				       const void *buf, tdb_len_t len)
+{
+	struct tdb_transaction *trans = tdb->transaction;
+	size_t blk;
+
+	/* handle one block at a time */
+	while (len + (off % PAGESIZE) > PAGESIZE) {
+		tdb_len_t chunk = PAGESIZE - (off % PAGESIZE);
+
+		transaction_write_existing(tdb, off, buf, chunk);
+		if (buf != NULL) {
+			buf = (const void *)((const char *)buf + chunk);
+		}
+		off += chunk;
+		len -= chunk;
+	}
+
+	if (len == 0) {
+		return;
+	}
+
+	blk = off / PAGESIZE;
+	off %= PAGESIZE;
+
+	/* nothing to update if the transaction holds no copy of the block */
+	if (blk >= trans->num_blocks || trans->blocks[blk] == NULL) {
+		return;
+	}
+
+	/* clip the write to the valid part of the last block */
+	if (blk == trans->num_blocks - 1) {
+		tdb_off_t valid = trans->last_block_size;
+
+		if (off + len > valid) {
+			if (off >= valid) {
+				return;
+			}
+			len = valid - off;
+		}
+	}
+
+	memcpy(trans->blocks[blk] + off, buf, len);
+}
+
+
+/*
+  Bounds check used while a transaction is active: len is compared with
+  the current map_size.  An out-of-range len is logged unless probe is
+  set.
+*/
+static enum TDB_ERROR transaction_oob(struct tdb_context *tdb, tdb_off_t len,
+				      bool probe)
+{
+	if (len > tdb->file->map_size) {
+		if (!probe) {
+			tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+				   "tdb_oob len %lld beyond transaction size %lld",
+				   (long long)len,
+				   (long long)tdb->file->map_size);
+		}
+		return TDB_ERR_IO;
+	}
+	return TDB_SUCCESS;
+}
+
+/*
+  Transaction version of file expansion: the file is not touched; the
+  new range is recorded as a write of zeroes and map_size is bumped.
+*/
+static enum TDB_ERROR transaction_expand_file(struct tdb_context *tdb,
+					      tdb_off_t addition)
+{
+	/* a NULL buffer makes transaction_write() store zeroes, so later
+	   reads of the new region see zero data */
+	enum TDB_ERROR ecode = transaction_write(tdb, tdb->file->map_size,
+						 NULL, addition);
+
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	tdb->file->map_size += addition;
+	return TDB_SUCCESS;
+}
+
+/*
+  Direct (pointer) access while in a transaction.
+
+  A pointer into a transaction block is returned when the range lies in
+  a single block the transaction has already copied.  For writes that is
+  the only case that succeeds.  For reads, a range that touches no
+  copied block is passed to the original direct method; any other case
+  returns NULL.
+*/
+static void *transaction_direct(struct tdb_context *tdb, tdb_off_t off,
+				size_t len, bool write_mode)
+{
+	struct tdb_transaction *trans = tdb->transaction;
+	size_t first = off / PAGESIZE;
+	/* This is wrong for zero-length blocks, but will fail gracefully */
+	size_t last = (off + len - 1) / PAGESIZE;
+	size_t blk;
+	bool single_copied;
+
+	single_copied = (first == last
+			 && first < trans->num_blocks
+			 && trans->blocks[first] != NULL);
+
+	if (write_mode) {
+		tdb->stats.transaction_write_direct++;
+		if (single_copied) {
+			return trans->blocks[first] + off % PAGESIZE;
+		}
+		tdb->stats.transaction_write_direct_fail++;
+		return NULL;
+	}
+
+	tdb->stats.transaction_read_direct++;
+	if (single_copied) {
+		return trans->blocks[first] + off % PAGESIZE;
+	}
+
+	/* otherwise none of the blocks in the range may have been copied */
+	for (blk = first; blk <= last && blk < trans->num_blocks; blk++) {
+		if (trans->blocks[blk] != NULL) {
+			tdb->stats.transaction_read_direct_fail++;
+			return NULL;
+		}
+	}
+	return trans->io_methods->direct(tdb, off, len, false);
+}
+
+/* io methods installed in tdb->methods while a transaction is active */
+static const struct tdb_methods transaction_methods = {
+	.tread = transaction_read,
+	.twrite = transaction_write,
+	.oob = transaction_oob,
+	.expand_file = transaction_expand_file,
+	.direct = transaction_direct,
+};
+
+/*
+  Flush to disk: fsync() the file and, where MS_SYNC is available and
+  the file is mapped, msync() the given range (start rounded down to a
+  system page).  Does nothing when TDB_NOSYNC is set.
+*/
+static enum TDB_ERROR transaction_sync(struct tdb_context *tdb,
+				       tdb_off_t offset, tdb_len_t length)
+{
+	if (!(tdb->flags & TDB_NOSYNC)) {
+		if (fsync(tdb->file->fd) != 0) {
+			return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+					  "tdb_transaction: fsync failed: %s",
+					  strerror(errno));
+		}
+#ifdef MS_SYNC
+		if (tdb->file->map_ptr != NULL) {
+			tdb_off_t aligned = offset & ~(getpagesize()-1);
+			char *start = (char *)tdb->file->map_ptr + aligned;
+
+			if (msync(start, length + (offset - aligned),
+				  MS_SYNC) != 0) {
+				return tdb_logerr(tdb, TDB_ERR_IO, TDB_LOG_ERROR,
+						  "tdb_transaction: msync failed: %s",
+						  strerror(errno));
+			}
+		}
+#endif
+	}
+	return TDB_SUCCESS;
+}
+
+/*
+ Tear down the current transaction without applying its writes.
+
+ For a nested transaction (nesting != 0) this only marks the
+ transaction as failed and drops one nesting level. Otherwise it
+ resets map_size to old_map_size, frees the block copies, overwrites
+ any recovery magic already written with the invalid magic, releases
+ the locks held for the transaction, restores the original io methods
+ and frees the transaction structure.
+
+ tdb_transaction_commit() also calls this after a successful commit
+ (having first set old_map_size to the new map_size).
+*/
+static void _tdb_transaction_cancel(struct tdb_context *tdb)
+{
+ int i;
+ enum TDB_ERROR ecode;
+
+ if (tdb->transaction == NULL) {
+ tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+ "tdb_transaction_cancel: no transaction");
+ return;
+ }
+
+ if (tdb->transaction->nesting != 0) { /* nested: flag the error and unwind one level only */
+ tdb->transaction->transaction_error = 1;
+ tdb->transaction->nesting--;
+ return;
+ }
+
+ tdb->file->map_size = tdb->transaction->old_map_size;
+
+ /* free all the transaction blocks */
+ for (i=0;i<tdb->transaction->num_blocks;i++) {
+ if (tdb->transaction->blocks[i] != NULL) {
+ free(tdb->transaction->blocks[i]);
+ }
+ }
+ SAFE_FREE(tdb->transaction->blocks);
+
+ if (tdb->transaction->magic_offset) { /* non-zero once transaction_setup_recovery() has chosen a recovery area */
+ const struct tdb_methods *methods = tdb->transaction->io_methods;
+ uint64_t invalid = TDB_RECOVERY_INVALID_MAGIC;
+
+ /* remove the recovery marker (written through the original
+ io methods, then synced); a failure here is only logged */
+ ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
+ &invalid, sizeof(invalid));
+ if (ecode == TDB_SUCCESS)
+ ecode = transaction_sync(tdb,
+ tdb->transaction->magic_offset,
+ sizeof(invalid));
+ if (ecode != TDB_SUCCESS) {
+ tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+ "tdb_transaction_cancel: failed to remove"
+ " recovery magic");
+ }
+ }
+
+ if (tdb->file->allrecord_lock.count)
+ tdb_allrecord_unlock(tdb, tdb->file->allrecord_lock.ltype);
+
+ /* restore the normal io methods */
+ tdb->methods = tdb->transaction->io_methods;
+
+ tdb_transaction_unlock(tdb, F_WRLCK);
+
+ if (tdb_has_open_lock(tdb)) /* the open lock is taken by _tdb_transaction_prepare_commit() */
+ tdb_unlock_open(tdb, F_WRLCK);
+
+ SAFE_FREE(tdb->transaction);
+}
+
+/*
+  start a tdb transaction. No token is returned, as only a single
+  transaction is allowed to be pending per tdb_context
+
+  Fails with TDB_ERR_EINVAL on a read-only or internal database, with
+  TDB_ERR_IO when a transaction is already active and TDB_ALLOW_NESTING
+  is not set, with TDB_ERR_LOCK when hash locks are held, with
+  TDB_ERR_OOM on allocation failure, or with the error of the failing
+  lock call.  The result is also stored in tdb->last_error on every
+  return path.
+*/
+enum TDB_ERROR tdb_transaction_start(struct tdb_context *tdb)
+{
+	enum TDB_ERROR ecode;
+
+	tdb->stats.transactions++;
+	/* some sanity checks */
+	if (tdb->read_only || (tdb->flags & TDB_INTERNAL)) {
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+						    TDB_LOG_USE_ERROR,
+						    "tdb_transaction_start:"
+						    " cannot start a"
+						    " transaction on a "
+						    "read-only or internal db");
+	}
+
+	/* cope with nested tdb_transaction_start() calls */
+	if (tdb->transaction != NULL) {
+		if (!(tdb->flags & TDB_ALLOW_NESTING)) {
+			return tdb->last_error
+				= tdb_logerr(tdb, TDB_ERR_IO,
+					     TDB_LOG_USE_ERROR,
+					     "tdb_transaction_start:"
+					     " already inside transaction");
+		}
+		tdb->transaction->nesting++;
+		tdb->stats.transaction_nest++;
+		/* record success like every other return path does, so a
+		   stale error is not left in last_error */
+		return tdb->last_error = TDB_SUCCESS;
+	}
+
+	if (tdb_has_hash_locks(tdb)) {
+		/* the caller must not have any locks when starting a
+		   transaction as otherwise we'll be screwed by lack
+		   of nested locks in POSIX */
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_LOCK,
+						    TDB_LOG_USE_ERROR,
+						    "tdb_transaction_start:"
+						    " cannot start a"
+						    " transaction with locks"
+						    " held");
+	}
+
+	/* zeroed: no blocks, no nesting, not prepared, magic_offset 0 */
+	tdb->transaction = calloc(1, sizeof(*tdb->transaction));
+	if (tdb->transaction == NULL) {
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM,
+						    TDB_LOG_ERROR,
+						    "tdb_transaction_start:"
+						    " cannot allocate");
+	}
+
+	/* get the transaction write lock. This is a blocking lock. As
+	   discussed with Volker, there are a number of ways we could
+	   make this async, which we will probably do in the future */
+	ecode = tdb_transaction_lock(tdb, F_WRLCK);
+	if (ecode != TDB_SUCCESS) {
+		SAFE_FREE(tdb->transaction->blocks);
+		SAFE_FREE(tdb->transaction);
+		return tdb->last_error = ecode;
+	}
+
+	/* get a read lock over entire file. This is upgraded to a write
+	   lock during the commit */
+	ecode = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true);
+	if (ecode != TDB_SUCCESS) {
+		goto fail_allrecord_lock;
+	}
+
+	/* make sure we know about any file expansions already done by
+	   anyone else (probe call: the result is deliberately ignored) */
+	tdb->methods->oob(tdb, tdb->file->map_size + 1, true);
+	tdb->transaction->old_map_size = tdb->file->map_size;
+
+	/* finally hook the io methods, replacing them with
+	   transaction specific methods */
+	tdb->transaction->io_methods = tdb->methods;
+	tdb->methods = &transaction_methods;
+	return tdb->last_error = TDB_SUCCESS;
+
+fail_allrecord_lock:
+	tdb_transaction_unlock(tdb, F_WRLCK);
+	SAFE_FREE(tdb->transaction->blocks);
+	SAFE_FREE(tdb->transaction);
+	return tdb->last_error = ecode;
+}
+
+
+/*
+  Public entry point for cancelling the current transaction: counts the
+  cancel in the statistics, then hands over to _tdb_transaction_cancel().
+*/
+void tdb_transaction_cancel(struct tdb_context *tdb)
+{
+	tdb->stats.transaction_cancel += 1;
+
+	_tdb_transaction_cancel(tdb);
+}
+
+/*
+  work out how much space the linearised recovery data will consume (worst case)
+
+  Every copied block that starts below old_map_size is counted as one
+  (offset, length) header plus the whole block (or the valid part of the
+  last block).  The result does not include the tdb_recovery_record
+  header.
+*/
+static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
+{
+	const struct tdb_transaction *trans = tdb->transaction;
+	tdb_len_t recovery_size = 0;
+	size_t i;
+
+	for (i = 0; i < trans->num_blocks; i++) {
+		/* do the multiplication in tdb_off_t: as plain int,
+		   i * PAGESIZE overflows once the offset reaches 2GB */
+		if ((tdb_off_t)i * PAGESIZE >= trans->old_map_size) {
+			/* blocks past the old end of file need no recovery */
+			break;
+		}
+		if (trans->blocks[i] == NULL) {
+			continue;
+		}
+		recovery_size += 2*sizeof(tdb_off_t);
+		if (i == trans->num_blocks-1) {
+			recovery_size += trans->last_block_size;
+		} else {
+			recovery_size += PAGESIZE;
+		}
+	}
+
+	return recovery_size;
+}
+
+/*
+  Look up the existing recovery area.  *recovery_offset receives the
+  value of the header's recovery field; when that is non-zero the record
+  header is read into *rec through the given methods.  When there is no
+  area, or its magic is neither the valid nor the invalid recovery
+  magic, rec->max_len is set to 0 (and, for the bad magic case,
+  *recovery_offset to 0).
+*/
+static enum TDB_ERROR tdb_recovery_area(struct tdb_context *tdb,
+					const struct tdb_methods *methods,
+					tdb_off_t *recovery_offset,
+					struct tdb_recovery_record *rec)
+{
+	enum TDB_ERROR ecode;
+	tdb_off_t head;
+
+	head = tdb_read_off(tdb, offsetof(struct tdb_header, recovery));
+	*recovery_offset = head;
+	if (TDB_OFF_IS_ERR(head)) {
+		return head;
+	}
+
+	if (head == 0) {
+		rec->max_len = 0;
+		return TDB_SUCCESS;
+	}
+
+	ecode = methods->tread(tdb, head, rec, sizeof(*rec));
+	if (ecode != TDB_SUCCESS) {
+		return ecode;
+	}
+	tdb_convert(tdb, rec, sizeof(*rec));
+
+	/* ignore invalid recovery regions: can happen in crash */
+	if (!(rec->magic == TDB_RECOVERY_MAGIC
+	      || rec->magic == TDB_RECOVERY_INVALID_MAGIC)) {
+		*recovery_offset = 0;
+		rec->max_len = 0;
+	}
+	return TDB_SUCCESS;
+}
+
+/* Length of the common prefix of new and old, looking at no more than
+   length bytes. */
+static unsigned int same(const unsigned char *new,
+			 const unsigned char *old,
+			 unsigned int length)
+{
+	unsigned int matched = 0;
+
+	while (matched < length && new[matched] == old[matched]) {
+		matched++;
+	}
+	return matched;
+}
+
+/* Returns the length of the leading region of new/old that should be
+   treated as "different": the scan stops at the end of the first run of
+   at least min_same equal bytes that is followed by a differing byte,
+   or at the end of the buffers.  *samelen receives the length of the
+   equal run that follows the returned region (0 if the trailing run is
+   shorter than min_same). */
+static unsigned int different(const unsigned char *new,
+			      const unsigned char *old,
+			      unsigned int length,
+			      unsigned int min_same,
+			      unsigned int *samelen)
+{
+	unsigned int pos;
+	unsigned int run = 0;
+
+	for (pos = 0; pos < length; pos++) {
+		if (new[pos] == old[pos]) {
+			run++;
+			continue;
+		}
+		if (run >= min_same) {
+			/* a long enough equal run ended just before pos */
+			*samelen = run;
+			return pos - run;
+		}
+		run = 0;
+	}
+
+	/* a short equal run at the end is not worth splitting off */
+	if (run < min_same) {
+		run = 0;
+	}
+	*samelen = run;
+	return length - run;
+}
+
+/* Allocates recovery blob, without tdb_recovery_record at head set up.
+ *
+ * After room for the record header, the blob holds a sequence of
+ * (offset, length, bytes) entries. The bytes are the current file
+ * contents (read through the original io methods) of the ranges below
+ * old_map_size where a transaction block differs from the file.
+ * *len is set to the total size of those entries, header excluded.
+ *
+ * Returns the malloc'ed blob, or a TDB_ERR_PTR() encoded error. */
+static struct tdb_recovery_record *alloc_recovery(struct tdb_context *tdb,
+ tdb_len_t *len)
+{
+ struct tdb_recovery_record *rec;
+ size_t i;
+ enum TDB_ERROR ecode;
+ unsigned char *p;
+ const struct tdb_methods *old_methods = tdb->methods;
+
+ rec = malloc(sizeof(*rec) + tdb_recovery_size(tdb));
+ if (!rec) {
+ tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+ "transaction_setup_recovery:"
+ " cannot allocate");
+ return TDB_ERR_PTR(TDB_ERR_OOM);
+ }
+
+ /* We temporarily revert to the old I/O methods, so we can use
+ * tdb_access_read */
+ tdb->methods = tdb->transaction->io_methods;
+
+ /* build the recovery data into a single blob to allow us to do a single
+ large write, which should be more efficient */
+ p = (unsigned char *)(rec + 1);
+ for (i=0;i<tdb->transaction->num_blocks;i++) {
+ tdb_off_t offset;
+ tdb_len_t length;
+ unsigned int off;
+ const unsigned char *buffer;
+
+ if (tdb->transaction->blocks[i] == NULL) {
+ continue;
+ }
+
+ offset = i * PAGESIZE;
+ length = PAGESIZE;
+ if (i == tdb->transaction->num_blocks-1) {
+ length = tdb->transaction->last_block_size;
+ }
+
+ if (offset >= tdb->transaction->old_map_size) { /* block lies entirely in the newly added part of the file */
+ continue;
+ }
+
+ if (offset + length > tdb->file->map_size) {
+ ecode = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+ "tdb_transaction_setup_recovery:"
+ " transaction data over new region"
+ " boundary");
+ goto fail;
+ }
+ if (offset + length > tdb->transaction->old_map_size) {
+ /* Short read at EOF. */
+ length = tdb->transaction->old_map_size - offset;
+ }
+ buffer = tdb_access_read(tdb, offset, length, false);
+ if (TDB_PTR_IS_ERR(buffer)) {
+ ecode = TDB_PTR_ERR(buffer);
+ goto fail;
+ }
+
+ /* Skip over anything the same at the start. */
+ off = same(tdb->transaction->blocks[i], buffer, length);
+ offset += off;
+
+ while (off < length) {
+ tdb_len_t len;
+ unsigned int samelen;
+
+ len = different(tdb->transaction->blocks[i] + off,
+ buffer + off, length - off,
+ sizeof(offset) + sizeof(len) + 1,
+ &samelen); /* min_same is one byte more than an entry header */
+
+ memcpy(p, &offset, sizeof(offset));
+ memcpy(p + sizeof(offset), &len, sizeof(len));
+ tdb_convert(tdb, p, sizeof(offset) + sizeof(len));
+ p += sizeof(offset) + sizeof(len);
+ memcpy(p, buffer + off, len); /* the bytes saved come from the file, not from the transaction block */
+ p += len;
+ off += len + samelen;
+ offset += len + samelen;
+ }
+ tdb_access_release(tdb, buffer);
+ }
+
+ *len = p - (unsigned char *)(rec + 1);
+ tdb->methods = old_methods;
+ return rec;
+
+fail:
+ free(rec);
+ tdb->methods = old_methods;
+ return TDB_ERR_PTR(ecode);
+}
+/*
+ Create a new recovery area at the current end of the (transaction's
+ view of the) file and point the header's recovery field at it.
+
+ rec->max_len is set to the capacity of the new area: rec_length plus
+ 50%, sized so that header + area is a multiple of PAGESIZE. The real
+ file is expanded through the original io methods, and old_map_size is
+ then updated to the new file size.
+
+ Returns the offset of the new area, or an error value (test with
+ TDB_OFF_IS_ERR()).
+*/
+static tdb_off_t create_recovery_area(struct tdb_context *tdb,
+ tdb_len_t rec_length,
+ struct tdb_recovery_record *rec)
+{
+ tdb_off_t off, recovery_off;
+ tdb_len_t addition;
+ enum TDB_ERROR ecode;
+ const struct tdb_methods *methods = tdb->transaction->io_methods;
+
+ /* round up to a multiple of page size. Overallocate, since each
+ * such allocation forces us to expand the file. */
+ rec->max_len
+ = (((sizeof(*rec) + rec_length + rec_length / 2)
+ + PAGESIZE-1) & ~(PAGESIZE-1))
+ - sizeof(*rec);
+ off = tdb->file->map_size;
+
+ /* Restore ->map_size before calling underlying expand_file.
+ Also so that we don't try to expand the file again in the
+ transaction commit, which would destroy the recovery
+ area. The addition covers both the expansion done inside the
+ transaction and the new recovery area */
+ addition = (tdb->file->map_size - tdb->transaction->old_map_size) +
+ sizeof(*rec) + rec->max_len;
+ tdb->file->map_size = tdb->transaction->old_map_size;
+ tdb->stats.transaction_expand_file++;
+ ecode = methods->expand_file(tdb, addition);
+ if (ecode != TDB_SUCCESS) {
+ return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+ "tdb_recovery_allocate:"
+ " failed to create recovery area");
+ }
+
+ /* we have to reset the old map size so that we don't try to
+ expand the file again in the transaction commit, which
+ would destroy the recovery area */
+ tdb->transaction->old_map_size = tdb->file->map_size;
+
+ /* write the recovery header offset and sync - we can sync without a race here
+ as the magic ptr in the recovery record has not been set.
+ NOTE(review): no sync call is visible in this function - confirm
+ where the sync mentioned here happens */
+ recovery_off = off;
+ tdb_convert(tdb, &recovery_off, sizeof(recovery_off));
+ ecode = methods->twrite(tdb, offsetof(struct tdb_header, recovery),
+ &recovery_off, sizeof(tdb_off_t));
+ if (ecode != TDB_SUCCESS) {
+ return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+ "tdb_recovery_allocate:"
+ " failed to write recovery head");
+ }
+ transaction_write_existing(tdb, offsetof(struct tdb_header, recovery),
+ &recovery_off,
+ sizeof(tdb_off_t)); /* keep any transaction copy of the header block in step with the file */
+ return off;
+}
+
+/*
+  setup the recovery data that will be used on a crash during commit
+
+  Builds the recovery blob (the current file contents of every range
+  the transaction changes), makes sure the file has a recovery area
+  large enough for it, writes header + data there with the "invalid"
+  magic, syncs, and only then writes the valid magic and syncs again.
+  On the way tdb->transaction->magic_offset is set to the file offset
+  of the magic field.
+
+  Returns TDB_SUCCESS or the error of the failing step.
+*/
+static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb)
+{
+	tdb_len_t recovery_size = 0;
+	tdb_len_t total_size;
+	tdb_off_t recovery_off = 0;
+	tdb_off_t old_map_size = tdb->transaction->old_map_size;
+	struct tdb_recovery_record *recovery;
+	const struct tdb_methods *methods = tdb->transaction->io_methods;
+	uint64_t magic;
+	enum TDB_ERROR ecode;
+
+	recovery = alloc_recovery(tdb, &recovery_size);
+	if (TDB_PTR_IS_ERR(recovery))
+		return TDB_PTR_ERR(recovery);
+
+	/* reads the existing area's header (if any) into the head of
+	   the blob, which gives us its max_len */
+	ecode = tdb_recovery_area(tdb, methods, &recovery_off, recovery);
+	if (ecode) {
+		free(recovery);
+		return ecode;
+	}
+
+	if (recovery->max_len < recovery_size) {
+		/* Not large enough. Free up old recovery area. */
+		if (recovery_off) {
+			tdb->stats.frees++;
+			ecode = add_free_record(tdb, recovery_off,
+						sizeof(*recovery)
+						+ recovery->max_len,
+						TDB_LOCK_WAIT, true);
+			free(recovery);
+			if (ecode != TDB_SUCCESS) {
+				return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+						  "tdb_recovery_allocate:"
+						  " failed to free previous"
+						  " recovery area");
+			}
+
+			/* Refresh recovery after add_free_record above. */
+			recovery = alloc_recovery(tdb, &recovery_size);
+			if (TDB_PTR_IS_ERR(recovery))
+				return TDB_PTR_ERR(recovery);
+		}
+
+		recovery_off = create_recovery_area(tdb, recovery_size,
+						    recovery);
+		if (TDB_OFF_IS_ERR(recovery_off)) {
+			free(recovery);
+			return recovery_off;
+		}
+	}
+
+	/* Now we know size, convert rec header. */
+	recovery->magic = TDB_RECOVERY_INVALID_MAGIC;
+	recovery->len = recovery_size;
+	recovery->eof = old_map_size;
+	tdb_convert(tdb, recovery, sizeof(*recovery));
+
+	/* recovery_size counts only the data that follows the header
+	   (and recovery reads rec.len bytes starting after the header),
+	   so the blob to put on disk is header + data */
+	total_size = sizeof(*recovery) + recovery_size;
+
+	/* write the recovery data to the recovery area */
+	ecode = methods->twrite(tdb, recovery_off, recovery, total_size);
+	if (ecode != TDB_SUCCESS) {
+		free(recovery);
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_setup_recovery:"
+				  " failed to write recovery data");
+	}
+	transaction_write_existing(tdb, recovery_off, recovery, total_size);
+
+	free(recovery);
+
+	/* as we don't have ordered writes, we have to sync the recovery
+	   data before we update the magic to indicate that the recovery
+	   data is present */
+	ecode = transaction_sync(tdb, recovery_off, total_size);
+	if (ecode != TDB_SUCCESS)
+		return ecode;
+
+	magic = TDB_RECOVERY_MAGIC;
+	tdb_convert(tdb, &magic, sizeof(magic));
+
+	tdb->transaction->magic_offset
+		= recovery_off + offsetof(struct tdb_recovery_record, magic);
+
+	ecode = methods->twrite(tdb, tdb->transaction->magic_offset,
+				&magic, sizeof(magic));
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_setup_recovery:"
+				  " failed to write recovery magic");
+	}
+	transaction_write_existing(tdb, tdb->transaction->magic_offset,
+				   &magic, sizeof(magic));
+
+	/* ensure the recovery magic marker is on disk */
+	return transaction_sync(tdb, tdb->transaction->magic_offset,
+				sizeof(magic));
+}
+/*
+ First phase of a commit.
+
+ Fails (after cancelling the transaction) if it was already prepared
+ or has a pending error. Nested or empty transactions succeed without
+ doing anything. Otherwise: upgrade the allrecord lock, take the open
+ lock, write the recovery data (unless TDB_NOSYNC), mark the
+ transaction prepared and expand the real file to the transaction's
+ map_size.
+
+ On failure of one of those later steps the error is returned without
+ cancelling the transaction here.
+*/
+static enum TDB_ERROR _tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{
+ const struct tdb_methods *methods;
+ enum TDB_ERROR ecode;
+
+ if (tdb->transaction == NULL) {
+ return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+ "tdb_transaction_prepare_commit:"
+ " no transaction");
+ }
+
+ if (tdb->transaction->prepared) {
+ _tdb_transaction_cancel(tdb);
+ return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_USE_ERROR,
+ "tdb_transaction_prepare_commit:"
+ " transaction already prepared");
+ }
+
+ if (tdb->transaction->transaction_error) {
+ _tdb_transaction_cancel(tdb);
+ return tdb_logerr(tdb, TDB_ERR_EINVAL, TDB_LOG_ERROR,
+ "tdb_transaction_prepare_commit:"
+ " transaction error pending");
+ }
+
+
+ if (tdb->transaction->nesting != 0) { /* only the outermost level does the real work */
+ return TDB_SUCCESS;
+ }
+
+ /* check for a null transaction */
+ if (tdb->transaction->blocks == NULL) {
+ return TDB_SUCCESS;
+ }
+
+ methods = tdb->transaction->io_methods;
+
+ /* upgrade the main transaction lock region to a write lock */
+ ecode = tdb_allrecord_upgrade(tdb);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* get the open lock - this prevents new users attaching to the database
+ during the commit */
+ ecode = tdb_lock_open(tdb, F_WRLCK, TDB_LOCK_WAIT|TDB_LOCK_NOCHECK);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+
+ /* Since we have whole db locked, we don't need the expansion lock. */
+ if (!(tdb->flags & TDB_NOSYNC)) {
+ /* Writes the recovery record to the file and sets
+ * tdb->transaction->magic_offset. */
+ ecode = transaction_setup_recovery(tdb);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ }
+
+ tdb->transaction->prepared = true;
+
+ /* expand the file to the new size if needed */
+ if (tdb->file->map_size != tdb->transaction->old_map_size) {
+ tdb_len_t add;
+
+ add = tdb->file->map_size - tdb->transaction->old_map_size;
+ /* Restore original map size for tdb_expand_file */
+ tdb->file->map_size = tdb->transaction->old_map_size;
+ ecode = methods->expand_file(tdb, add);
+ if (ecode != TDB_SUCCESS) {
+ return ecode;
+ }
+ }
+
+ /* Keep the open lock until the actual commit */
+ return TDB_SUCCESS;
+}
+
+/*
+  Public wrapper: run the prepare phase of a commit for the current
+  transaction and hand back its result.
+*/
+enum TDB_ERROR tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{
+	enum TDB_ERROR ecode = _tdb_transaction_prepare_commit(tdb);
+
+	return ecode;
+}
+
+/*
+  commit the current transaction
+
+  For a nested transaction this only drops one nesting level.  An empty
+  transaction is simply cancelled.  Otherwise the prepare phase is run
+  if it has not been run yet, every copied block is written to the file
+  through the original io methods, the file is synced and the
+  transaction is torn down.
+
+  Returns TDB_SUCCESS or an error code; the result is also stored in
+  tdb->last_error.
+*/
+enum TDB_ERROR tdb_transaction_commit(struct tdb_context *tdb)
+{
+	const struct tdb_methods *methods;
+	size_t i;
+	enum TDB_ERROR ecode;
+
+	if (tdb->transaction == NULL) {
+		return tdb->last_error = tdb_logerr(tdb, TDB_ERR_EINVAL,
+						    TDB_LOG_USE_ERROR,
+						    "tdb_transaction_commit:"
+						    " no transaction");
+	}
+
+	tdb_trace(tdb, "tdb_transaction_commit");
+
+	if (tdb->transaction->nesting != 0) {
+		tdb->transaction->nesting--;
+		return tdb->last_error = TDB_SUCCESS;
+	}
+
+	/* check for a null transaction */
+	if (tdb->transaction->blocks == NULL) {
+		_tdb_transaction_cancel(tdb);
+		return tdb->last_error = TDB_SUCCESS;
+	}
+
+	if (!tdb->transaction->prepared) {
+		ecode = _tdb_transaction_prepare_commit(tdb);
+		if (ecode != TDB_SUCCESS) {
+			_tdb_transaction_cancel(tdb);
+			return tdb->last_error = ecode;
+		}
+	}
+
+	methods = tdb->transaction->io_methods;
+
+	/* perform all the writes */
+	for (i = 0; i < tdb->transaction->num_blocks; i++) {
+		tdb_off_t offset;
+		tdb_len_t length;
+
+		if (tdb->transaction->blocks[i] == NULL) {
+			continue;
+		}
+
+		/* do the multiplication in tdb_off_t: as plain int,
+		   i * PAGESIZE overflows once the offset reaches 2GB */
+		offset = (tdb_off_t)i * PAGESIZE;
+		length = PAGESIZE;
+		if (i == tdb->transaction->num_blocks-1) {
+			length = tdb->transaction->last_block_size;
+		}
+
+		ecode = methods->twrite(tdb, offset,
+					tdb->transaction->blocks[i], length);
+		if (ecode != TDB_SUCCESS) {
+			/* we've overwritten part of the data and
+			   possibly expanded the file, so we need to
+			   run the crash recovery code */
+			tdb->methods = methods;
+			tdb_transaction_recover(tdb);
+
+			_tdb_transaction_cancel(tdb);
+
+			return tdb->last_error = ecode;
+		}
+		SAFE_FREE(tdb->transaction->blocks[i]);
+	}
+
+	SAFE_FREE(tdb->transaction->blocks);
+	tdb->transaction->num_blocks = 0;
+
+	/* ensure the new data is on disk.
+	   NOTE(review): on failure this returns with the transaction
+	   still in place (it is not cancelled here) - confirm that this
+	   is intended */
+	ecode = transaction_sync(tdb, 0, tdb->file->map_size);
+	if (ecode != TDB_SUCCESS) {
+		return tdb->last_error = ecode;
+	}
+
+	/*
+	  TODO: maybe write to some dummy hdr field, or write to magic
+	  offset without mmap, before the last sync, instead of the
+	  utime() call
+	*/
+
+	/* on some systems (like Linux 2.6.x) changes via mmap/msync
+	   don't change the mtime of the file, this means the file may
+	   not be backed up (as tdb rounding to block sizes means that
+	   file size changes are quite rare too). The following forces
+	   mtime changes when a transaction completes */
+#if HAVE_UTIME
+	utime(tdb->name, NULL);
+#endif
+
+	/* use a transaction cancel to free memory and remove the
+	   transaction locks: it "restores" map_size, too. */
+	tdb->transaction->old_map_size = tdb->file->map_size;
+	_tdb_transaction_cancel(tdb);
+
+	return tdb->last_error = TDB_SUCCESS;
+}
+
+
+/*
+  Recover from an aborted transaction by replaying the recovery area
+  over the file.  Must be called with exclusive database write access
+  already established (including the open lock to prevent new
+  processes attaching).
+
+  Returns TDB_SUCCESS if there was nothing to recover (no recovery
+  area has been allocated, or its magic is not TDB_RECOVERY_MAGIC) or
+  if the recovery completed; otherwise the error is logged via
+  tdb_logerr() and its result is returned.
+*/
+enum TDB_ERROR tdb_transaction_recover(struct tdb_context *tdb)
+{
+	tdb_off_t recovery_head, recovery_eof;
+	unsigned char *data, *p;
+	struct tdb_recovery_record rec;
+	enum TDB_ERROR ecode;
+
+	/* find the recovery area */
+	recovery_head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
+	if (TDB_OFF_IS_ERR(recovery_head)) {
+		return tdb_logerr(tdb, recovery_head, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to read recovery head");
+	}
+
+	if (recovery_head == 0) {
+		/* we have never allocated a recovery record */
+		return TDB_SUCCESS;
+	}
+
+	/* read the recovery record */
+	ecode = tdb_read_convert(tdb, recovery_head, &rec, sizeof(rec));
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to read recovery record");
+	}
+
+	if (rec.magic != TDB_RECOVERY_MAGIC) {
+		/* there is no valid recovery data */
+		return TDB_SUCCESS;
+	}
+
+	if (tdb->read_only) {
+		return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " attempt to recover read only database");
+	}
+
+	recovery_eof = rec.eof;
+
+	data = (unsigned char *)malloc(rec.len);
+	if (data == NULL) {
+		return tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to allocate recovery data");
+	}
+
+	/* read the full recovery data */
+	ecode = tdb->methods->tread(tdb, recovery_head + sizeof(rec), data,
+				    rec.len);
+	if (ecode != TDB_SUCCESS) {
+		/* don't leak the blob on the error path */
+		free(data);
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to read recovery data");
+	}
+
+	/* recover the file data: the blob is a sequence of
+	   (offset, length, <length> bytes of old data) entries */
+	p = data;
+	while (p+sizeof(tdb_off_t)+sizeof(tdb_len_t) < data + rec.len) {
+		tdb_off_t ofs;
+		tdb_len_t len;
+		tdb_convert(tdb, p, sizeof(ofs) + sizeof(len));
+		memcpy(&ofs, p, sizeof(ofs));
+		memcpy(&len, p + sizeof(ofs), sizeof(len));
+		p += sizeof(ofs) + sizeof(len);
+
+		/* the loop condition guarantees p is still inside the
+		   blob here; refuse an entry whose payload would run
+		   past the end of what we read, rather than handing
+		   out-of-bounds memory to twrite */
+		if (len > (tdb_len_t)(data + rec.len - p)) {
+			free(data);
+			return tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
+					  "tdb_transaction_recover:"
+					  " corrupt recovery data");
+		}
+
+		ecode = tdb->methods->twrite(tdb, ofs, p, len);
+		if (ecode != TDB_SUCCESS) {
+			free(data);
+			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+					  "tdb_transaction_recover:"
+					  " failed to recover %zu bytes"
+					  " at offset %zu",
+					  (size_t)len, (size_t)ofs);
+		}
+		p += len;
+	}
+
+	free(data);
+
+	/* make sure the restored data is synced before the recovery
+	   magic is invalidated below */
+	ecode = transaction_sync(tdb, 0, tdb->file->map_size);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to sync recovery");
+	}
+
+	/* if the recovery area is after the recovered eof then remove it */
+	if (recovery_eof <= recovery_head) {
+		ecode = tdb_write_off(tdb, offsetof(struct tdb_header,
+						    recovery),
+				      0);
+		if (ecode != TDB_SUCCESS) {
+			return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+					  "tdb_transaction_recover:"
+					  " failed to remove recovery head");
+		}
+	}
+
+	/* remove the recovery magic */
+	ecode = tdb_write_off(tdb,
+			      recovery_head
+			      + offsetof(struct tdb_recovery_record, magic),
+			      TDB_RECOVERY_INVALID_MAGIC);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to remove recovery magic");
+	}
+
+	ecode = transaction_sync(tdb, 0, recovery_eof);
+	if (ecode != TDB_SUCCESS) {
+		return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
+				  "tdb_transaction_recover:"
+				  " failed to sync2 recovery");
+	}
+
+	tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
+		   "tdb_transaction_recover: recovered %zu byte database",
+		   (size_t)recovery_eof);
+
+	/* all done */
+	return TDB_SUCCESS;
+}
+
+/*
+  Report whether the database has a pending recovery record.
+  Returns the error from reading the header offset or the record if
+  either read fails, false if no recovery area was ever allocated,
+  and otherwise whether the record carries TDB_RECOVERY_MAGIC.
+*/
+tdb_bool_err tdb_needs_recovery(struct tdb_context *tdb)
+{
+	struct tdb_recovery_record record;
+	enum TDB_ERROR err;
+	tdb_off_t head;
+
+	/* locate the recovery area through the header */
+	head = tdb_read_off(tdb, offsetof(struct tdb_header,recovery));
+	if (TDB_OFF_IS_ERR(head)) {
+		return head;
+	}
+
+	/* a zero head means no recovery record was ever allocated */
+	if (head == 0) {
+		return false;
+	}
+
+	/* fetch the record itself and inspect its magic */
+	err = tdb_read_convert(tdb, head, &record, sizeof(record));
+	if (err != TDB_SUCCESS) {
+		return err;
+	}
+
+	return (record.magic == TDB_RECOVERY_MAGIC);
+}
diff --git a/lib/tdb2/traverse.c b/lib/tdb2/traverse.c
new file mode 100644
index 00000000000..179e095142a
--- /dev/null
+++ b/lib/tdb2/traverse.c
@@ -0,0 +1,99 @@
+ /*
+ Trivial Database 2: traverse function.
+ Copyright (C) Rusty Russell 2010
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 3 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "private.h"
+#include <ccan/likely/likely.h>
+
+/*
+  Walk every record via first_in_hash()/next_in_hash(), calling fn
+  (if non-NULL) with the key, the data and the caller's pointer p.
+  A non-zero return from fn stops the walk early.  Returns the number
+  of records visited (including the one that stopped the walk), or
+  the error code if the walk ended with anything other than
+  TDB_ERR_NOEXIST.  tdb->last_error is updated to match.
+*/
+int64_t tdb_traverse_(struct tdb_context *tdb,
+		      int (*fn)(struct tdb_context *,
+				TDB_DATA, TDB_DATA, void *),
+		      void *p)
+{
+	struct traverse_info tinfo;
+	struct tdb_data k, d;
+	enum TDB_ERROR ecode;
+	int64_t visited = 0;
+
+	k.dptr = NULL;
+	ecode = first_in_hash(tdb, &tinfo, &k, &d.dsize);
+	while (ecode == TDB_SUCCESS) {
+		int stop;
+
+		/* the data is located directly after the key in the
+		   buffer handed back through k */
+		d.dptr = k.dptr + k.dsize;
+		visited++;
+
+		stop = (fn && fn(tdb, k, d, p));
+		free(k.dptr);
+		if (stop) {
+			tdb->last_error = TDB_SUCCESS;
+			return visited;
+		}
+
+		ecode = next_in_hash(tdb, &tinfo, &k, &d.dsize);
+	}
+
+	/* TDB_ERR_NOEXIST is how the iterators report end-of-database */
+	if (ecode == TDB_ERR_NOEXIST) {
+		tdb->last_error = TDB_SUCCESS;
+		return visited;
+	}
+	return tdb->last_error = ecode;
+}
+
+/*
+  Fetch the first key via first_in_hash() into *key; the result is
+  stored in tdb->last_error and returned.
+*/
+enum TDB_ERROR tdb_firstkey(struct tdb_context *tdb, struct tdb_data *key)
+{
+	struct traverse_info tinfo;
+
+	/* no data length is requested, hence the NULL */
+	tdb->last_error = first_in_hash(tdb, &tinfo, key, NULL);
+	return tdb->last_error;
+}
+
+/*
+  Advance *key to the key following it.  The incoming key->dptr is
+  always freed.  We look the key up (taking the hash lock), drop the
+  lock, then let next_in_hash() lock again: not very efficient.  We
+  could keep last key & tinfo cached.
+*/
+enum TDB_ERROR tdb_nextkey(struct tdb_context *tdb, struct tdb_data *key)
+{
+	struct tdb_used_record rec;
+	struct traverse_info tinfo;
+	struct hash_info h;
+	tdb_off_t found;
+
+	found = find_and_lock(tdb, *key, F_RDLCK, &h, &rec, &tinfo);
+	tinfo.prev = found;
+
+	/* the old key buffer is released whether or not the lookup worked */
+	free(key->dptr);
+
+	if (TDB_OFF_IS_ERR(tinfo.prev)) {
+		tdb->last_error = tinfo.prev;
+		return tdb->last_error;
+	}
+
+	tdb_unlock_hashes(tdb, h.hlock_start, h.hlock_range, F_RDLCK);
+
+	tdb->last_error = next_in_hash(tdb, &tinfo, key, NULL);
+	return tdb->last_error;
+}
+
+/*
+  Traverse callback for tdb_wipe_all(): delete the visited key and
+  record the outcome in *ecode.  Returns non-zero when the delete
+  did not succeed.  The data argument is unused.
+*/
+static int wipe_one(struct tdb_context *tdb,
+		    TDB_DATA key, TDB_DATA data, enum TDB_ERROR *ecode)
+{
+	enum TDB_ERROR outcome = tdb_delete(tdb, key);
+
+	*ecode = outcome;
+	return (outcome != TDB_SUCCESS);
+}
+
+/*
+  Delete every record: take the all-record write lock, traverse the
+  database deleting each key with wipe_one(), then drop the lock.
+  Returns (and stores in tdb->last_error) the lock error, the
+  traverse error, or the result of the last delete attempted.
+*/
+enum TDB_ERROR tdb_wipe_all(struct tdb_context *tdb)
+{
+	enum TDB_ERROR ecode;
+	int64_t visited;
+
+	ecode = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+	if (ecode == TDB_SUCCESS) {
+		/* FIXME: Be smarter. */
+		visited = tdb_traverse(tdb, wipe_one, &ecode);
+		if (visited < 0) {
+			/* a negative count is the traverse's own error code */
+			ecode = visited;
+		}
+		tdb_allrecord_unlock(tdb, F_WRLCK);
+	}
+
+	tdb->last_error = ecode;
+	return tdb->last_error;
+}