summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWouter Bolsterlee <uws@xs4all.nl>2014-02-25 21:08:21 +0100
committerWouter Bolsterlee <uws@xs4all.nl>2014-02-25 21:08:21 +0100
commit106dcf070574c5be9bdeba90c689b09e3fbed71b (patch)
tree5a73f162f4f2fd3c08634dbe8014e1f3a7f323ec
parentda109ab7b3c8a6ede332bc0d99e3e71cbecf5154 (diff)
downloadhappybase-106dcf070574c5be9bdeba90c689b09e3fbed71b.tar.gz
No longer confuse batching/caching; add Table.scan(scan_batching=...)
For details, see the comments added in this commit, and issues #54 and issue #56.
-rw-r--r--NEWS.rst8
-rw-r--r--happybase/table.py45
-rw-r--r--tests/test_api.py7
3 files changed, 49 insertions, 11 deletions
diff --git a/NEWS.rst b/NEWS.rst
index 0f6e7d8..907b40e 100644
--- a/NEWS.rst
+++ b/NEWS.rst
@@ -18,6 +18,14 @@ Release date: *not yet released*
Python 2.6 a separate ```ordereddict``` package has to be installed from PyPI.
(`issue #39 <https://github.com/wbolster/happybase/issues/39>`_)
+* The `batch_size` argument to :py:meth:`Table.scan` is no longer propagated to
+ `Scan.setBatching()` at the Java side (inside the Thrift server). To influence
+ the `Scan.setBatching()` (which may split rows into partial rows) a new
+ `scan_batching` argument to :py:meth:`Table.scan` has been added. See `issue
+ #54 <https://github.com/wbolster/happybase/issues/54>`_, `issue #56
+ <https://github.com/wbolster/happybase/issues/56>`_, and the HBase docs for
+ `Scan.setBatching()` for more details.
+
HappyBase 0.7
-------------
diff --git a/happybase/table.py b/happybase/table.py
index c8f2a75..76351dc 100644
--- a/happybase/table.py
+++ b/happybase/table.py
@@ -214,8 +214,8 @@ class Table(object):
def scan(self, row_start=None, row_stop=None, row_prefix=None,
columns=None, filter=None, timestamp=None,
- include_timestamp=False, batch_size=1000, limit=None,
- sorted_columns=False):
+ include_timestamp=False, batch_size=1000, scan_batching=None,
+ limit=None, sorted_columns=False):
"""Create a scanner for data in the table.
This method returns an iterable that can be used for looping over the
@@ -245,15 +245,22 @@ class Table(object):
If `limit` is given, at most `limit` results will be returned.
- If `sorted_columns` is `True`, the columns in the rows returned
- by this scanner will be retrieved in sorted order, and the data
- will be stored in `OrderedDict` instances.
-
The `batch_size` argument specifies how many results should be
retrieved per batch when retrieving results from the scanner. Only set
this to a low value (or even 1) if your data is large, since a low
batch size results in added round-trips to the server.
+ The optional `scan_batching` is for advanced usage only; it
+ translates to `Scan.setBatching()` at the Java side (inside the
+ Thrift server). By setting this value rows may be split into
+ partial rows, so result rows may be incomplete, and the number
+ of results returned by the scanner may no longer correspond to
+ the number of rows matched by the scan.
+
+ If `sorted_columns` is `True`, the columns in the rows returned
+ by this scanner will be retrieved in sorted order, and the data
+ will be stored in `OrderedDict` instances.
+
**Compatibility notes:**
* The `filter` argument is only available when using HBase 0.92
@@ -274,6 +281,7 @@ class Table(object):
:param int timestamp: timestamp (optional)
:param bool include_timestamp: whether timestamps are returned
:param int batch_size: batch size for retrieving results
+ :param bool scan_batching: server-side scan batching (optional)
:param int limit: max number of rows to return
:param bool sorted_columns: whether to return sorted columns
@@ -327,10 +335,25 @@ class Table(object):
self.name, row_start, row_stop, columns, timestamp, {})
else:
- # The scan's caching size is set to the batch_size, so that
- # the HTable on the Java side retrieves rows from the region
- # servers in the same chunk sizes that it sends out over
- # Thrift.
+ # XXX: The "batch_size" can be slightly confusing to those
+ # familiar with the HBase Java API:
+ #
+ # * TScan.caching (Thrift API) translates to
+ # Scan.setCaching() (Java API)
+ #
+ # * TScan.batchSize (Thrift API) translates to
+ # Scan.setBatching (Java API) .
+ #
+ # However, we set Scan.setCaching() to what is called
+ # batch_size in the HappyBase API, so that the HTable on the
+ # Java side (inside the Thrift server) retrieves rows from
+ # the region servers in the same chunk sizes that it sends
+ # out again to Python (over Thrift). This cannot be tweaked
+ # (by design).
+ #
+ # The Scan.setBatching() value (Java API), which possibly
+ # cuts rows into multiple partial rows, can be set using the
+ # slightly strange name scan_batching.
scan = TScan(
startRow=row_start,
stopRow=row_stop,
@@ -338,7 +361,7 @@ class Table(object):
columns=columns,
caching=batch_size,
filterString=filter,
- batchSize=batch_size,
+ batchSize=scan_batching,
sortColumns=sorted_columns,
)
scan_id = self.connection.client.scannerOpenWithScan(
diff --git a/tests/test_api.py b/tests/test_api.py
index 2d22717..292ee3f 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -446,6 +446,13 @@ def test_scan_sorting():
row.items())
+def test_scan_filter_and_batch_size():
+ # See issue #54 and #56
+ filter = "SingleColumnValueFilter ('cf1', 'qual1', =, 'binary:val1')"
+ for k, v in table.scan(filter=filter):
+ print v
+
+
def test_delete():
row_key = 'row-test-delete'
data = {'cf1:col1': 'v1',