Create the dbm package from PEP 3108. #2881.

author: Georg Brandl <georg@python.org> 2008-05-26 10:29:35 +0000
committer: Georg Brandl <georg@python.org> 2008-05-26 10:29:35 +0000
commit: 7775a1f3aa8166b22293d63b5a7715bbcfa35054 (patch)
tree: 14340ef392976deb444b666f90c6f7efed9b8b62 /Lib/dbm
parent: 6487a3aa1026c275985d1ae0f7f7bcdde43611d2 (diff)
download: cpython-7775a1f3aa8166b22293d63b5a7715bbcfa35054.tar.gz
5 files changed, 471 insertions, 0 deletions
diff --git a/Lib/dbm/__init__.py b/Lib/dbm/__init__.py
new file mode 100644
index 0000000000..9fdd4145cc
--- /dev/null
+++ b/Lib/dbm/__init__.py
@@ -0,0 +1,198 @@
+"""Generic interface to all dbm clones.
+
+Use
+
+        import dbm
+        d = dbm.open(file, 'w', 0o666)
+
+The returned object is a dbm.bsd, dbm.gnu, dbm.ndbm or dbm.dumb
+object, dependent on the type of database being opened (determined by
+the whichdb function) in the case of an existing dbm.  If the dbm does
+not exist and the create or new flag ('c' or 'n') was specified, the
+dbm type will be determined by the availability of the modules (tested
+in the above order).
+
+It has the following interface (key and data are strings):
+
+        d[key] = data   # store data at key (may override data at
+                        # existing key)
+        data = d[key]   # retrieve data at key (raise KeyError if no
+                        # such key)
+        del d[key]      # delete data stored at key (raises KeyError
+                        # if no such key)
+        flag = key in d # true if the key exists
+        list = d.keys() # return a list of all existing keys (slow!)
+
+Future versions may change the order in which implementations are
+tested for existence, add interfaces to other dbm-like
+implementations.
+
+The open function has an optional second argument.  This can be 'r',
+for read-only access, 'w', for read-write access of an existing
+database, 'c' for read-write access to a new or existing database, and
+'n' for read-write access to a new database.  The default is 'r'.
+
+Note: 'r' and 'w' fail if the database doesn't exist; 'c' creates it
+only if it doesn't exist; and 'n' always creates a new database.
+"""
+
+__all__ = ['open', 'whichdb', 'error', 'errors']
+
+import io
+import os
+import struct
+import sys
+
+
+class error(Exception):
+    pass
+
+_names = ['dbm.bsd', 'dbm.gnu', 'dbm.ndbm', 'dbm.dumb']
+_errors = [error]
+_defaultmod = None
+_modules = {}
+
+for _name in _names:
+    try:
+        _mod = __import__(_name, fromlist=['open'])
+    except ImportError:
+        continue
+    if not _defaultmod:
+        _defaultmod = _mod
+    _modules[_name] = _mod
+    _errors.append(_mod.error)
+
+if not _defaultmod:
+    raise ImportError("no dbm clone found; tried %s" % _names)
+
+error = tuple(_errors)
+
+
+def open(file, flag = 'r', mode = 0o666):
+    # guess the type of an existing database
+    result = whichdb(file)
+    if result is None:
+        # db doesn't exist
+        if 'c' in flag or 'n' in flag:
+            # file doesn't exist and the new flag was used so use default type
+            mod = _defaultmod
+        else:
+            raise error("need 'c' or 'n' flag to open new db")
+    elif result == "":
+        # db type cannot be determined
+        raise error("db type could not be determined")
+    else:
+        mod = _modules[result]
+    return mod.open(file, flag, mode)
+
+
+try:
+    from dbm import ndbm
+    _dbmerror = ndbm.error
+except ImportError:
+    ndbm = None
+    # just some sort of valid exception which might be raised in the ndbm test
+    _dbmerror = IOError
+
+def whichdb(filename):
+    """Guess which db package to use to open a db file.
+
+    Return values:
+
+    - None if the database file can't be read;
+    - empty string if the file can be read but can't be recognized
+    - the name of the dbm submodule (e.g. "ndbm" or "gnu") if recognized.
+
+    Importing the given module may still fail, and opening the
+    database using that module may still fail.
+    """
+
+    # Check for ndbm first -- this has a .pag and a .dir file
+    try:
+        f = io.open(filename + ".pag", "rb")
+        f.close()
+        # dbm linked with gdbm on OS/2 doesn't have .dir file
+        if not (ndbm.library == "GNU gdbm" and sys.platform == "os2emx"):
+            f = io.open(filename + ".dir", "rb")
+            f.close()
+        return "dbm.ndbm"
+    except IOError:
+        # some dbm emulations based on Berkeley DB generate a .db file
+        # some do not, but they should be caught by the bsd checks
+        try:
+            f = io.open(filename + ".db", "rb")
+            f.close()
+            # guarantee we can actually open the file using dbm
+            # kind of overkill, but since we are dealing with emulations
+            # it seems like a prudent step
+            if ndbm is not None:
+                d = ndbm.open(filename)
+                d.close()
+                return "dbm.ndbm"
+        except (IOError, _dbmerror):
+            pass
+
+    # Check for dumbdbm next -- this has a .dir and a .dat file
+    try:
+        # First check for presence of files
+        os.stat(filename + ".dat")
+        size = os.stat(filename + ".dir").st_size
+        # dumbdbm files with no keys are empty
+        if size == 0:
+            return "dbm.dumb"
+        f = io.open(filename + ".dir", "rb")
+        try:
+            if f.read(1) in (b"'", b'"'):
+                return "dbm.dumb"
+        finally:
+            f.close()
+    except (OSError, IOError):
+        pass
+
+    # See if the file exists, return None if not
+    try:
+        f = io.open(filename, "rb")
+    except IOError:
+        return None
+
+    # Read the start of the file -- the magic number
+    s16 = f.read(16)
+    f.close()
+    s = s16[0:4]
+
+    # Return "" if not at least 4 bytes
+    if len(s) != 4:
+        return ""
+
+    # Convert to 4-byte int in native byte order -- return "" if impossible
+    try:
+        (magic,) = struct.unpack("=l", s)
+    except struct.error:
+        return ""
+
+    # Check for GNU dbm
+    if magic == 0x13579ace:
+        return "dbm.gnu"
+
+    ## Check for old Berkeley db hash file format v2
+    #if magic in (0x00061561, 0x61150600):
+    #    return "bsddb185" # not supported anymore
+
+    # Later versions of Berkeley db hash file have a 12-byte pad in
+    # front of the file type
+    try:
+        (magic,) = struct.unpack("=l", s16[-4:])
+    except struct.error:
+        return ""
+
+    # Check for BSD hash
+    if magic in (0x00061561, 0x61150600):
+        return "dbm.bsd"
+
+    # Unknown
+    return ""
+
+
+if __name__ == "__main__":
+    for filename in sys.argv[1:]:
+        print(whichdb(filename) or "UNKNOWN", filename)
diff --git a/Lib/dbm/bsd.py b/Lib/dbm/bsd.py
new file mode 100644
index 0000000000..8353f50376
--- /dev/null
+++ b/Lib/dbm/bsd.py
@@ -0,0 +1,10 @@
+"""Provide a (g)dbm-compatible interface to bsddb.hashopen."""
+
+import bsddb
+
+__all__ = ["error", "open"]
+
+error = bsddb.error
+
+def open(file, flag = 'r', mode=0o666):
+    return bsddb.hashopen(file, flag, mode)
diff --git a/Lib/dbm/dumb.py b/Lib/dbm/dumb.py
new file mode 100644
index 0000000000..76f4a631bc
--- /dev/null
+++ b/Lib/dbm/dumb.py
@@ -0,0 +1,257 @@
+"""A dumb and slow but simple dbm clone.
+
+For database spam, spam.dir contains the index (a text file),
+spam.bak *may* contain a backup of the index (also a text file),
+while spam.dat contains the data (a binary file).
+
+XXX TO DO:
+
+- seems to contain a bug when updating...
+
+- reclaim free space (currently, space once occupied by deleted or expanded
+items is never reused)
+
+- support concurrent access (currently, if two processes take turns making
+updates, they can mess up the index)
+
+- support efficient access to large databases (currently, the whole index
+is read when the database is opened, and some updates rewrite the whole index)
+
+- support opening for read-only (flag = 'm')
+
+"""
+
+import io as _io
+import os as _os
+import collections
+
+__all__ = ["error", "open"]
+
+_BLOCKSIZE = 512
+
+error = IOError
+
+class _Database(collections.MutableMapping):
+
+    # The on-disk directory and data files can remain in mutually
+    # inconsistent states for an arbitrarily long time (see comments
+    # at the end of __setitem__).  This is only repaired when _commit()
+    # gets called.  One place _commit() gets called is from __del__(),
+    # and if that occurs at program shutdown time, module globals may
+    # already have gotten rebound to None.  Since it's crucial that
+    # _commit() finish successfully, we can't ignore shutdown races
+    # here, and _commit() must not reference any globals.
+    _os = _os       # for _commit()
+    _io = _io       # for _commit()
+
+    def __init__(self, filebasename, mode):
+        self._mode = mode
+
+        # The directory file is a text file.  Each line looks like
+        #    "%r, (%d, %d)\n" % (key, pos, siz)
+        # where key is the string key, pos is the offset into the dat
+        # file of the associated value's first byte, and siz is the number
+        # of bytes in the associated value.
+        self._dirfile = filebasename + '.dir'
+
+        # The data file is a binary file pointed into by the directory
+        # file, and holds the values associated with keys.  Each value
+        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
+        # binary 8-bit string value.
+        self._datfile = filebasename + '.dat'
+        self._bakfile = filebasename + '.bak'
+
+        # The index is an in-memory dict, mirroring the directory file.
+        self._index = None  # maps keys to (pos, siz) pairs
+
+        # Mod by Jack: create data file if needed
+        try:
+            f = _io.open(self._datfile, 'r')
+        except IOError:
+            f = _io.open(self._datfile, 'w')
+            self._chmod(self._datfile)
+        f.close()
+        self._update()
+
+    # Read directory file into the in-memory index dict.
+    def _update(self):
+        self._index = {}
+        try:
+            f = _io.open(self._dirfile, 'r')
+        except IOError:
+            pass
+        else:
+            for line in f:
+                line = line.rstrip()
+                key, pos_and_siz_pair = eval(line)
+                self._index[key] = pos_and_siz_pair
+            f.close()
+
+    # Write the index dict to the directory file.  The original directory
+    # file (if any) is renamed with a .bak extension first.  If a .bak
+    # file currently exists, it's deleted.
+    def _commit(self):
+        # CAUTION:  It's vital that _commit() succeed, and _commit() can
+        # be called from __del__().  Therefore we must never reference a
+        # global in this routine.
+        if self._index is None:
+            return  # nothing to do
+
+        try:
+            self._os.unlink(self._bakfile)
+        except self._os.error:
+            pass
+
+        try:
+            self._os.rename(self._dirfile, self._bakfile)
+        except self._os.error:
+            pass
+
+        f = self._io.open(self._dirfile, 'w')
+        self._chmod(self._dirfile)
+        for key, pos_and_siz_pair in self._index.items():
+            f.write("%r, %r\n" % (key, pos_and_siz_pair))
+        f.close()
+
+    sync = _commit
+
+    def __getitem__(self, key):
+        key = key.decode("latin-1")
+        pos, siz = self._index[key]     # may raise KeyError
+        f = _io.open(self._datfile, 'rb')
+        f.seek(pos)
+        dat = f.read(siz)
+        f.close()
+        return dat
+
+    # Append val to the data file, starting at a _BLOCKSIZE-aligned
+    # offset.  The data file is first padded with NUL bytes (if needed)
+    # to get to an aligned offset.  Return pair
+    #     (starting offset of val, len(val))
+    def _addval(self, val):
+        f = _io.open(self._datfile, 'rb+')
+        f.seek(0, 2)
+        pos = int(f.tell())
+        npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
+        f.write(b'\0'*(npos-pos))
+        pos = npos
+        f.write(val)
+        f.close()
+        return (pos, len(val))
+
+    # Write val to the data file, starting at offset pos.  The caller
+    # is responsible for ensuring that there's enough room starting at
+    # pos to hold val, without overwriting some other value.  Return
+    # pair (pos, len(val)).
+    def _setval(self, pos, val):
+        f = _io.open(self._datfile, 'rb+')
+        f.seek(pos)
+        f.write(val)
+        f.close()
+        return (pos, len(val))
+
+    # key is a new key whose associated value starts in the data file
+    # at offset pos and with length siz.  Add an index record to
+    # the in-memory index dict, and append one to the directory file.
+    def _addkey(self, key, pos_and_siz_pair):
+        self._index[key] = pos_and_siz_pair
+        f = _io.open(self._dirfile, 'a')
+        self._chmod(self._dirfile)
+        f.write("%r, %r\n" % (key, pos_and_siz_pair))
+        f.close()
+
+    def __setitem__(self, key, val):
+        if not isinstance(key, bytes):
+            raise TypeError("keys must be bytes")
+        key = key.decode("latin-1") # hashable bytes
+        if not isinstance(val, (bytes, bytearray)):
+            raise TypeError("values must be byte strings")
+        if key not in self._index:
+            self._addkey(key, self._addval(val))
+        else:
+            # See whether the new value is small enough to fit in the
+            # (padded) space currently occupied by the old value.
+            pos, siz = self._index[key]
+            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
+            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
+            if newblocks <= oldblocks:
+                self._index[key] = self._setval(pos, val)
+            else:
+                # The new value doesn't fit in the (padded) space used
+                # by the old value.  The blocks used by the old value are
+                # forever lost.
+                self._index[key] = self._addval(val)
+
+            # Note that _index may be out of synch with the directory
+            # file now:  _setval() and _addval() don't update the directory
+            # file.  This also means that the on-disk directory and data
+            # files are in a mutually inconsistent state, and they'll
+            # remain that way until _commit() is called.  Note that this
+            # is a disaster (for the database) if the program crashes
+            # (so that _commit() never gets called).
+
+    def __delitem__(self, key):
+        key = key.decode("latin-1")
+        # The blocks used by the associated value are lost.
+        del self._index[key]
+        # XXX It's unclear why we do a _commit() here (the code always
+        # XXX has, so I'm not changing it).  _setitem__ doesn't try to
+        # XXX keep the directory file in synch.  Why should we?  Or
+        # XXX why shouldn't __setitem__?
+        self._commit()
+
+    def keys(self):
+        return [key.encode("latin-1") for key in self._index.keys()]
+
+    def items(self):
+        return [(key.encode("latin-1"), self[key.encode("latin-1")])
+                for key in self._index.keys()]
+
+    def __contains__(self, key):
+        key = key.decode("latin-1")
+        return key in self._index
+
+    def iterkeys(self):
+        return iter(self._index.keys())
+    __iter__ = iterkeys
+
+    def __len__(self):
+        return len(self._index)
+
+    def close(self):
+        self._commit()
+        self._index = self._datfile = self._dirfile = self._bakfile = None
+
+    __del__ = close
+
+    def _chmod (self, file):
+        if hasattr(self._os, 'chmod'):
+            self._os.chmod(file, self._mode)
+
+
+def open(file, flag=None, mode=0o666):
+    """Open the database file, filename, and return corresponding object.
+
+    The flag argument, used to control how the database is opened in the
+    other DBM implementations, is ignored in the dbm.dumb module; the
+    database is always opened for update, and will be created if it does
+    not exist.
+
+    The optional mode argument is the UNIX mode of the file, used only when
+    the database has to be created.  It defaults to octal code 0o666 (and
+    will be modified by the prevailing umask).
+
+    """
+    # flag argument is currently ignored
+
+    # Modify mode depending on the umask
+    try:
+        um = _os.umask(0)
+        _os.umask(um)
+    except AttributeError:
+        pass
+    else:
+        # Turn off any bits that are set in the umask
+        mode = mode & (~um)
+
+    return _Database(file, mode)
diff --git a/Lib/dbm/gnu.py b/Lib/dbm/gnu.py
new file mode 100644
index 0000000000..b07a1defff
--- /dev/null
+++ b/Lib/dbm/gnu.py
@@ -0,0 +1,3 @@
+"""Provide the _gdbm module as a dbm submodule."""
+
+from _gdbm import *
diff --git a/Lib/dbm/ndbm.py b/Lib/dbm/ndbm.py
new file mode 100644
index 0000000000..23056a29ef
--- /dev/null
+++ b/Lib/dbm/ndbm.py
@@ -0,0 +1,3 @@
+"""Provide the _dbm module as a dbm submodule."""
+
+from _dbm import *
author	Georg Brandl <georg@python.org>	2008-05-26 10:29:35 +0000
committer	Georg Brandl <georg@python.org>	2008-05-26 10:29:35 +0000
commit	7775a1f3aa8166b22293d63b5a7715bbcfa35054 (patch)
tree	14340ef392976deb444b666f90c6f7efed9b8b62 /Lib/dbm
parent	6487a3aa1026c275985d1ae0f7f7bcdde43611d2 (diff)
download	cpython-7775a1f3aa8166b22293d63b5a7715bbcfa35054.tar.gz