summaryrefslogtreecommitdiff
path: root/happybase/table.py
diff options
context:
space:
mode:
authorWouter Bolsterlee <uws@xs4all.nl>2013-05-02 23:37:08 +0200
committerWouter Bolsterlee <uws@xs4all.nl>2013-05-02 23:37:08 +0200
commit92715a2e2422f878831a828cf70a8d57bce10863 (patch)
treea90f9a8b01836cde65146151135d02a9c8898014 /happybase/table.py
parent45e201f2f61855a409b9934f4f4aa0c161452005 (diff)
downloadhappybase-92715a2e2422f878831a828cf70a8d57bce10863.tar.gz
Split code into separate modules
Split the monolithic .api module into separate modules implementing the connection, table, and batch parts. The public API is still the same, since all functionality is available from the main 'happybase' namespace, so this is just an internal cleanup.
Diffstat (limited to 'happybase/table.py')
-rw-r--r--happybase/table.py482
1 files changed, 482 insertions, 0 deletions
diff --git a/happybase/table.py b/happybase/table.py
new file mode 100644
index 0000000..ae80dd7
--- /dev/null
+++ b/happybase/table.py
@@ -0,0 +1,482 @@
+"""
+HappyBase table module.
+"""
+
import logging
from numbers import Integral
from operator import attrgetter
from struct import Struct

from .hbase.ttypes import TScan
from .util import thrift_type_to_dict, str_increment
from .batch import Batch
+
logger = logging.getLogger(__name__)

# Cell converters: given an object with `value` (and `timestamp`)
# attributes, return either the bare value or a `(value, timestamp)` tuple.
make_cell = attrgetter('value')
make_cell_timestamp = attrgetter('value', 'timestamp')

# Packs an integer as 8 bytes: big-endian ('>') signed 64-bit ('q').
pack_i64 = Struct('>q').pack


def make_row(cell_map, include_timestamp):
    """Make a row dict for a cell mapping like ttypes.TRowResult.columns.

    :param dict cell_map: mapping of column names to cell objects; each cell
                          must expose a `value` attribute, and a `timestamp`
                          attribute when `include_timestamp` is true
    :param bool include_timestamp: whether cells are converted to
                                   `(value, timestamp)` tuples instead of
                                   plain values

    :return: Mapping of column names to converted cells
    :rtype: dict
    """
    # Pick the converter once, outside the loop. A conditional expression is
    # used rather than the fragile `cond and a or b` idiom.
    cellfn = make_cell_timestamp if include_timestamp else make_cell

    # `.items()` is available on both Python 2 and Python 3 mappings, whereas
    # `.iteritems()` only exists on Python 2 dicts.
    return dict((cn, cellfn(cell)) for cn, cell in cell_map.items())
+
+
class Table(object):
    """HBase table abstraction class.

    This class cannot be instantiated directly; use :py:meth:`Connection.table`
    instead.
    """
    def __init__(self, name, connection):
        self.name = name
        self.connection = connection
        # Keep a direct reference to the client exposed by the connection,
        # since every operation below goes through it.
        self.client = connection.client

    def __repr__(self):
        return '<%s.%s name=%r>' % (
            __name__,
            self.__class__.__name__,
            self.name,
        )

    #
    # Argument validation (internal use)
    #

    @staticmethod
    def _check_columns(columns):
        """Raise TypeError unless `columns` is None, a tuple, or a list."""
        if columns is not None and not isinstance(columns, (tuple, list)):
            raise TypeError("'columns' must be a tuple or list")

    @staticmethod
    def _check_timestamp(timestamp):
        """Raise TypeError unless `timestamp` is an integral number."""
        # Integral (rather than int) also accepts Python 2 `long` values,
        # which is what large timestamps become when `int` is 32 bits wide.
        if not isinstance(timestamp, Integral):
            raise TypeError("'timestamp' must be an integer")

    #
    # Table metadata
    #

    def families(self):
        """Retrieve the column families for this table.

        :return: Mapping from column family name to settings dict
        :rtype: dict
        """
        descriptors = self.client.getColumnDescriptors(self.name)
        families = dict()
        for name, descriptor in descriptors.items():
            name = name[:-1]  # drop trailing ':'
            families[name] = thrift_type_to_dict(descriptor)
        return families

    def _column_family_names(self):
        """Retrieve the column family names for this table (internal use)"""
        # Always hand back a real list (the names are later passed on as
        # a `columns` argument), not a dict view.
        return list(self.client.getColumnDescriptors(self.name))

    def regions(self):
        """Retrieve the regions for this table.

        :return: regions for this table
        :rtype: list of dicts
        """
        regions = self.client.getTableRegions(self.name)
        # A list comprehension keeps the documented `list` return type on
        # Python versions where map() is lazy.
        return [thrift_type_to_dict(region) for region in regions]

    #
    # Data retrieval
    #

    def row(self, row, columns=None, timestamp=None, include_timestamp=False):
        """Retrieve a single row of data.

        This method retrieves the row with the row key specified in the `row`
        argument and returns the columns and values for this row as
        a dictionary.

        The `row` argument is the row key of the row. If the `columns` argument
        is specified, only the values for these columns will be returned
        instead of all available columns. The `columns` argument should be
        a list or tuple containing strings. Each name can be a column family,
        such as `cf1` or `cf1:` (the trailing colon is not required), or
        a column family with a qualifier, such as `cf1:col1`.

        If specified, the `timestamp` argument specifies the maximum version
        that results may have. The `include_timestamp` argument specifies
        whether cells are returned as single values or as `(value, timestamp)`
        tuples.

        :param str row: the row key
        :param list_or_tuple columns: list of columns (optional)
        :param int timestamp: timestamp (optional)
        :param bool include_timestamp: whether timestamps are returned

        :return: Mapping of columns (both qualifier and family) to values;
                 an empty dict if no row was returned
        :rtype: dict

        :raises TypeError: if `columns` is not a list or tuple, or if
                           `timestamp` is not an integer
        """
        self._check_columns(columns)

        if timestamp is None:
            rows = self.client.getRowWithColumns(self.name, row, columns)
        else:
            self._check_timestamp(timestamp)
            rows = self.client.getRowWithColumnsTs(self.name, row, columns,
                                                   timestamp)

        if not rows:
            return {}

        return make_row(rows[0].columns, include_timestamp)

    def rows(self, rows, columns=None, timestamp=None,
             include_timestamp=False):
        """Retrieve multiple rows of data.

        This method retrieves the rows with the row keys specified in the
        `rows` argument, which should be a list (or tuple) of row keys. The
        return value is a list of `(row_key, row_dict)` tuples.

        The `columns`, `timestamp` and `include_timestamp` arguments behave
        exactly the same as for :py:meth:`row`.

        :param list rows: list of row keys
        :param list_or_tuple columns: list of columns (optional)
        :param int timestamp: timestamp (optional)
        :param bool include_timestamp: whether timestamps are returned

        :return: List of `(row_key, row_dict)` tuples, where each `row_dict`
                 maps columns to values
        :rtype: list of tuples

        :raises TypeError: if `columns` is not a list or tuple, or if
                           `timestamp` is not an integer
        """
        self._check_columns(columns)

        if not rows:
            # Avoid round-trip if the result is empty anyway. Return an
            # empty list (not a dict) so that the return type is the same
            # as for non-empty results.
            return []

        if timestamp is None:
            results = self.client.getRowsWithColumns(self.name, rows, columns)
        else:
            self._check_timestamp(timestamp)

            # Work-around a bug in the HBase Thrift server where the
            # timestamp is only applied if columns are specified, at
            # the cost of an extra round-trip.
            if columns is None:
                columns = self._column_family_names()

            results = self.client.getRowsWithColumnsTs(self.name, rows,
                                                       columns, timestamp)

        return [(r.row, make_row(r.columns, include_timestamp))
                for r in results]

    def cells(self, row, column, versions=None, timestamp=None,
              include_timestamp=False):
        """Retrieve multiple versions of a single cell from the table.

        This method retrieves multiple versions of a cell (if any).

        The `versions` argument defines how many cell versions to
        retrieve at most.

        The `timestamp` and `include_timestamp` arguments behave exactly the
        same as for :py:meth:`row`.

        :param str row: the row key
        :param str column: the column name
        :param int versions: the maximum number of versions to retrieve
        :param int timestamp: timestamp (optional)
        :param bool include_timestamp: whether timestamps are returned

        :return: cell values
        :rtype: list of values

        :raises TypeError: if `versions` or `timestamp` is not an integer
        :raises ValueError: if `versions` is smaller than 1
        """
        if versions is None:
            versions = (2 ** 31) - 1  # Thrift type is i32
        elif not isinstance(versions, Integral):
            raise TypeError("'versions' parameter must be a number or None")
        elif versions < 1:
            raise ValueError(
                "'versions' parameter must be at least 1 (or None)")

        if timestamp is None:
            cells = self.client.getVer(self.name, row, column, versions)
        else:
            self._check_timestamp(timestamp)
            cells = self.client.getVerTs(self.name, row, column, timestamp,
                                         versions)

        # Build a real list so the documented return type holds even where
        # map() would return a lazy iterator.
        cellfn = make_cell_timestamp if include_timestamp else make_cell
        return [cellfn(cell) for cell in cells]

    def scan(self, row_start=None, row_stop=None, row_prefix=None,
             columns=None, filter=None, timestamp=None,
             include_timestamp=False, batch_size=1000, limit=None):
        """Create a scanner for data in the table.

        This method returns an iterable that can be used for looping over the
        matching rows. Scanners can be created in two ways:

        * The `row_start` and `row_stop` arguments specify the row keys where
          the scanner should start and stop. It does not matter whether the
          table contains any rows with the specified keys: the first row after
          `row_start` will be the first result, and the last row before
          `row_stop` will be the last result. Note that the start of the range
          is inclusive, while the end is exclusive.

          Both `row_start` and `row_stop` can be `None` to specify the start
          and the end of the table respectively. If both are omitted, a full
          table scan is done. Note that this usually results in severe
          performance problems.

        * Alternatively, if `row_prefix` is specified, only rows with row keys
          matching the prefix will be returned. If given, `row_start` and
          `row_stop` cannot be used.

        The `columns`, `timestamp` and `include_timestamp` arguments behave
        exactly the same as for :py:meth:`row`.

        The `filter` argument may be a filter string that will be applied at
        the server by the region servers.

        If `limit` is given, at most `limit` results will be returned.

        The `batch_size` argument specifies how many results should be
        retrieved per batch when retrieving results from the scanner. Only set
        this to a low value (or even 1) if your data is large, since a low
        batch size results in added round-trips to the server.

        Since this method is a generator, the arguments are only validated
        and the scanner is only opened once iteration starts.

        **Compatibility note:** The `filter` argument is only available when
        using HBase 0.92 (or up). In HBase 0.90 compatibility mode, specifying
        a `filter` raises an exception.

        :param str row_start: the row key to start at (inclusive)
        :param str row_stop: the row key to stop at (exclusive)
        :param str row_prefix: a prefix of the row key that must match
        :param list_or_tuple columns: list of columns (optional)
        :param str filter: a filter string (optional)
        :param int timestamp: timestamp (optional)
        :param bool include_timestamp: whether timestamps are returned
        :param int batch_size: batch size for retrieving results
        :param int limit: maximum number of rows to return (optional)

        :return: generator yielding the rows matching the scan
        :rtype: iterable of `(row_key, row_data)` tuples

        :raises ValueError: if `batch_size` or `limit` is smaller than 1
        :raises TypeError: if `row_prefix` is combined with `row_start` or
                           `row_stop`
        :raises NotImplementedError: if `filter` is given in HBase 0.90
                                     compatibility mode
        """
        if batch_size < 1:
            raise ValueError("'batch_size' must be >= 1")

        if limit is not None and limit < 1:
            raise ValueError("'limit' must be >= 1")

        if row_prefix is not None:
            if row_start is not None or row_stop is not None:
                raise TypeError(
                    "'row_prefix' cannot be combined with "
                    "'row_start' or 'row_stop'")

            # A prefix scan is a range scan from the prefix itself up to
            # (but excluding) the incremented prefix.
            row_start = row_prefix
            row_stop = str_increment(row_prefix)

        if row_start is None:
            row_start = ''

        client = self.client
        if self.connection.compat == '0.90':
            # The scannerOpenWithScan() Thrift function is not
            # available, so work around it as much as possible with the
            # other scannerOpen*() Thrift functions

            if filter is not None:
                raise NotImplementedError(
                    "'filter' is not supported in HBase 0.90")

            if row_stop is None:
                if timestamp is None:
                    scan_id = client.scannerOpen(
                        self.name, row_start, columns)
                else:
                    scan_id = client.scannerOpenTs(
                        self.name, row_start, columns, timestamp)
            else:
                if timestamp is None:
                    scan_id = client.scannerOpenWithStop(
                        self.name, row_start, row_stop, columns)
                else:
                    scan_id = client.scannerOpenWithStopTs(
                        self.name, row_start, row_stop, columns, timestamp)

        else:
            # The scan's caching size is set to the batch_size, so that
            # the HTable on the Java side retrieves rows from the region
            # servers in the same chunk sizes that it sends out over
            # Thrift.
            scan = TScan(
                startRow=row_start,
                stopRow=row_stop,
                timestamp=timestamp,
                columns=columns,
                caching=batch_size,
                filterString=filter,
            )
            scan_id = client.scannerOpenWithScan(self.name, scan)

        logger.debug("Opened scanner (id=%d) on '%s'", scan_id, self.name)

        n_returned = n_fetched = 0
        try:
            while True:
                # Never ask for more rows than are still allowed by `limit`
                if limit is None:
                    how_many = batch_size
                else:
                    how_many = min(batch_size, limit - n_returned)

                if how_many == 1:
                    items = client.scannerGet(scan_id)
                else:
                    items = client.scannerGetList(scan_id, how_many)

                n_fetched += len(items)

                # enumerate() continues counting from the previous batch,
                # so `n_returned` is the running total of yielded rows.
                for n_returned, item in enumerate(items, n_returned + 1):
                    yield item.row, make_row(item.columns, include_timestamp)
                    if limit is not None and n_returned == limit:
                        return

                # Avoid round-trip when exhausted
                if len(items) < how_many:
                    break
        finally:
            # Runs on exhaustion, on early return, and when the generator is
            # closed or an error propagates, so the scanner is not left open.
            client.scannerClose(scan_id)
            logger.debug(
                "Closed scanner (id=%d) on '%s' (%d returned, %d fetched)",
                scan_id, self.name, n_returned, n_fetched)

    #
    # Data manipulation
    #

    def put(self, row, data, timestamp=None):
        """Store data in the table.

        This method stores the data in the `data` argument for the row
        specified by `row`. The `data` argument is a dictionary that maps
        columns to values. Column names must include a family and qualifier
        part, e.g. `cf:col`, though the qualifier part may be the empty
        string, e.g. `cf:`. The `timestamp` argument is optional.

        Note that, in many situations, :py:meth:`batch()` is a more appropriate
        method to manipulate data.

        :param str row: the row key
        :param dict data: the data to store
        :param int timestamp: timestamp (optional)
        """
        with self.batch(timestamp=timestamp) as batch:
            batch.put(row, data)

    def delete(self, row, columns=None, timestamp=None):
        """Delete data from the table.

        This method deletes all columns for the row specified by `row`, or only
        some columns if the `columns` argument is specified.

        Note that, in many situations, :py:meth:`batch()` is a more appropriate
        method to manipulate data.

        :param str row: the row key
        :param list_or_tuple columns: list of columns (optional)
        :param int timestamp: timestamp (optional)
        """
        if columns is None:
            # Whole-row deletes go straight to the client
            if timestamp is None:
                self.client.deleteAllRow(self.name, row)
            else:
                self.client.deleteAllRowTs(self.name, row, timestamp)
        else:
            # Column-level deletes are delegated to a one-off batch
            with self.batch(timestamp=timestamp) as batch:
                batch.delete(row, columns)

    def batch(self, timestamp=None, batch_size=None, transaction=False):
        """Create a new batch operation for this table.

        This method returns a new :py:class:`Batch` instance that can be used
        for mass data manipulation. The `timestamp` argument applies to all
        puts and deletes on the batch.

        If given, the `batch_size` argument specifies the maximum batch size
        after which the batch should send the mutations to the server. By
        default this is unbounded.

        The `transaction` argument specifies whether the returned
        :py:class:`Batch` instance should act in a transaction-like manner when
        used as context manager in a ``with`` block of code. The `transaction`
        flag cannot be used in combination with `batch_size`.

        :param bool transaction: whether this batch should behave like
                                 a transaction (only useful when used as a
                                 context manager)
        :param int batch_size: batch size (optional)
        :param int timestamp: timestamp (optional)

        :return: Batch instance
        :rtype: :py:class:`Batch`
        """
        # Pass the arguments explicitly instead of forwarding locals(), so
        # that a local variable added later cannot leak into the call.
        return Batch(
            table=self,
            timestamp=timestamp,
            batch_size=batch_size,
            transaction=transaction,
        )

    #
    # Atomic counters
    #

    def counter_get(self, row, column):
        """Retrieve the current value of a counter column.

        This method retrieves the current value of a counter column. If the
        counter column does not exist, this function initialises it to `0`.

        Note that application code should *never* store an incremented or
        decremented counter value directly; use the atomic
        :py:meth:`Table.counter_inc` and :py:meth:`Table.counter_dec` methods
        for that.

        :param str row: the row key
        :param str column: the column name

        :return: counter value
        :rtype: int
        """
        # Don't query directly, but increment with value=0 so that the counter
        # is correctly initialised if it didn't exist yet.
        return self.counter_inc(row, column, value=0)

    def counter_set(self, row, column, value=0):
        """Set a counter column to a specific value.

        This method stores a 64-bit signed integer value in the specified
        column.

        Note that application code should *never* store an incremented or
        decremented counter value directly; use the atomic
        :py:meth:`Table.counter_inc` and :py:meth:`Table.counter_dec` methods
        for that.

        :param str row: the row key
        :param str column: the column name
        :param int value: the counter value to set
        """
        self.put(row, {column: pack_i64(value)})

    def counter_inc(self, row, column, value=1):
        """Atomically increment (or decrement) a counter column.

        This method atomically increments or decrements a counter column in the
        row specified by `row`. The `value` argument specifies how much the
        counter should be incremented (for positive values) or decremented (for
        negative values). If the counter column did not exist, it is
        automatically initialised to 0 before incrementing it.

        :param str row: the row key
        :param str column: the column name
        :param int value: the amount to increment or decrement by (optional)

        :return: counter value after incrementing
        :rtype: int
        """
        return self.client.atomicIncrement(self.name, row, column, value)

    def counter_dec(self, row, column, value=1):
        """Atomically decrement (or increment) a counter column.

        This method is a shortcut for calling :py:meth:`Table.counter_inc` with
        the value negated.

        :param str row: the row key
        :param str column: the column name
        :param int value: the amount to decrement or increment by (optional)

        :return: counter value after decrementing
        :rtype: int
        """
        return self.counter_inc(row, column, -value)