From 106dcf070574c5be9bdeba90c689b09e3fbed71b Mon Sep 17 00:00:00 2001
From: Wouter Bolsterlee
Date: Tue, 25 Feb 2014 21:08:21 +0100
Subject: No longer confuse batching/caching; add Table.scan(scan_batching=...)

For details, see the comments added in this commit, and issues #54 and #56.
---
 NEWS.rst           |  8 ++++++++
 happybase/table.py | 45 ++++++++++++++++++++++++++++++++++-----------
 tests/test_api.py  |  7 +++++++
 3 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/NEWS.rst b/NEWS.rst
index 0f6e7d8..907b40e 100644
--- a/NEWS.rst
+++ b/NEWS.rst
@@ -18,6 +18,14 @@ Release date: *not yet released*
   Python 2.6 a separate ``ordereddict`` package has to be installed from PyPI.
   (`issue #39 <https://github.com/wbolster/happybase/issues/39>`_)
 
+* The `batch_size` argument to :py:meth:`Table.scan` is no longer propagated to
+  `Scan.setBatching()` at the Java side (inside the Thrift server). To influence
+  `Scan.setBatching()` (which may split rows into partial rows), a new
+  `scan_batching` argument to :py:meth:`Table.scan` has been added. See `issue
+  #54 <https://github.com/wbolster/happybase/issues/54>`_, `issue #56
+  <https://github.com/wbolster/happybase/issues/56>`_, and the HBase docs for
+  `Scan.setBatching()` for more details.
+
 
 HappyBase 0.7
 -------------
diff --git a/happybase/table.py b/happybase/table.py
index c8f2a75..76351dc 100644
--- a/happybase/table.py
+++ b/happybase/table.py
@@ -214,8 +214,8 @@ class Table(object):
 
     def scan(self, row_start=None, row_stop=None, row_prefix=None,
              columns=None, filter=None, timestamp=None,
-             include_timestamp=False, batch_size=1000, limit=None,
-             sorted_columns=False):
+             include_timestamp=False, batch_size=1000, scan_batching=None,
+             limit=None, sorted_columns=False):
         """Create a scanner for data in the table.
 
         This method returns an iterable that can be used for looping over the
@@ -245,15 +245,22 @@ class Table(object):
 
         If `limit` is given, at most `limit` results will be returned.
 
-        If `sorted_columns` is `True`, the columns in the rows returned
-        by this scanner will be retrieved in sorted order, and the data
-        will be stored in `OrderedDict` instances.
-
         The `batch_size` argument specifies how many results should be
         retrieved per batch when retrieving results from the scanner. Only set
         this to a low value (or even 1) if your data is large, since a low
         batch size results in added round-trips to the server.
 
+        The optional `scan_batching` is for advanced usage only; it
+        translates to `Scan.setBatching()` at the Java side (inside the
+        Thrift server). By setting this value, rows may be split into
+        partial rows, so result rows may be incomplete, and the number
+        of results returned by the scanner may no longer correspond to
+        the number of rows matched by the scan.
+
+        If `sorted_columns` is `True`, the columns in the rows returned
+        by this scanner will be retrieved in sorted order, and the data
+        will be stored in `OrderedDict` instances.
+
         **Compatibility notes:**
 
         * The `filter` argument is only available when using HBase 0.92
@@ -274,6 +281,7 @@ class Table(object):
         :param int timestamp: timestamp (optional)
         :param bool include_timestamp: whether timestamps are returned
         :param int batch_size: batch size for retrieving results
+        :param bool scan_batching: server-side scan batching (optional)
         :param int limit: max number of rows to return
         :param bool sorted_columns: whether to return sorted columns
 
@@ -327,10 +335,25 @@ class Table(object):
                     self.name, row_start, row_stop, columns, timestamp, {})
 
         else:
-            # The scan's caching size is set to the batch_size, so that
-            # the HTable on the Java side retrieves rows from the region
-            # servers in the same chunk sizes that it sends out over
-            # Thrift.
+            # XXX: The "batch_size" can be slightly confusing to those
+            # familiar with the HBase Java API:
+            #
+            # * TScan.caching (Thrift API) translates to
+            #   Scan.setCaching() (Java API)
+            #
+            # * TScan.batchSize (Thrift API) translates to
+            #   Scan.setBatching() (Java API)
+            #
+            # However, we set Scan.setCaching() to what is called
+            # batch_size in the HappyBase API, so that the HTable on the
+            # Java side (inside the Thrift server) retrieves rows from
+            # the region servers in the same chunk sizes that it sends
+            # out again to Python (over Thrift). This cannot be tweaked
+            # (by design).
+            #
+            # The Scan.setBatching() value (Java API), which possibly
+            # cuts rows into multiple partial rows, can be set using the
+            # slightly strange name scan_batching.
             scan = TScan(
                 startRow=row_start,
                 stopRow=row_stop,
@@ -338,7 +361,7 @@ class Table(object):
                 columns=columns,
                 caching=batch_size,
                 filterString=filter,
-                batchSize=batch_size,
+                batchSize=scan_batching,
                 sortColumns=sorted_columns,
             )
             scan_id = self.connection.client.scannerOpenWithScan(
diff --git a/tests/test_api.py b/tests/test_api.py
index 2d22717..292ee3f 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -446,6 +446,13 @@ def test_scan_sorting():
             row.items())
 
 
+def test_scan_filter_and_batch_size():
+    # See issue #54 and #56
+    filter = "SingleColumnValueFilter ('cf1', 'qual1', =, 'binary:val1')"
+    for k, v in table.scan(filter=filter):
+        print v
+
+
 def test_delete():
     row_key = 'row-test-delete'
     data = {'cf1:col1': 'v1',
-- 
cgit v1.2.1
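
To see the distinction this patch draws in actual use, here is a minimal
sketch contrasting the two scan arguments. It is illustrative only and not
part of the commit; it assumes a reachable Thrift server on localhost and a
pre-existing table named 'mytable' with column family 'cf1' (placeholder
names, not from the commit):

    import happybase

    connection = happybase.Connection('localhost')  # assumed Thrift server host
    table = connection.table('mytable')             # hypothetical table name

    # batch_size maps to Scan.setCaching() on the Java side: it controls
    # how many complete rows each Thrift round-trip fetches. Rows are
    # never split, so one result always equals one matched row.
    for key, data in table.scan(columns=['cf1'], batch_size=10):
        print key, data

    # scan_batching maps to Scan.setBatching() on the Java side: rows
    # containing many cells may come back as multiple partial rows, so
    # the number of results can exceed the number of rows matched.
    for key, data in table.scan(columns=['cf1'], scan_batching=100):
        print key, data

Per the comment added in table.py, batch_size tunes only round-trip chunk
sizes and is safe to adjust freely, while scan_batching changes the shape of
the results and is for advanced usage only.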