No longer confuse batching/caching; add Table.scan(scan_batching=...)

For details, see the comments added in this commit, and issues #54 and #56.
author: Wouter Bolsterlee <uws@xs4all.nl> 2014-02-25 21:08:21 +0100
committer: Wouter Bolsterlee <uws@xs4all.nl> 2014-02-25 21:08:21 +0100
commit: 106dcf070574c5be9bdeba90c689b09e3fbed71b (patch)
tree: 5a73f162f4f2fd3c08634dbe8014e1f3a7f323ec /happybase
parent: da109ab7b3c8a6ede332bc0d99e3e71cbecf5154 (diff)
download: happybase-106dcf070574c5be9bdeba90c689b09e3fbed71b.tar.gz
1 files changed, 34 insertions, 11 deletions
diff --git a/happybase/table.py b/happybase/table.py
index c8f2a75..76351dc 100644
--- a/happybase/table.py
+++ b/happybase/table.py
@@ -214,8 +214,8 @@ class Table(object):
 
     def scan(self, row_start=None, row_stop=None, row_prefix=None,
              columns=None, filter=None, timestamp=None,
-             include_timestamp=False, batch_size=1000, limit=None,
-             sorted_columns=False):
+             include_timestamp=False, batch_size=1000, scan_batching=None,
+             limit=None, sorted_columns=False):
         """Create a scanner for data in the table.
 
         This method returns an iterable that can be used for looping over the
@@ -245,15 +245,22 @@ class Table(object):
 
         If `limit` is given, at most `limit` results will be returned.
 
-        If `sorted_columns` is `True`, the columns in the rows returned
-        by this scanner will be retrieved in sorted order, and the data
-        will be stored in `OrderedDict` instances.
-
         The `batch_size` argument specifies how many results should be
         retrieved per batch when retrieving results from the scanner. Only set
         this to a low value (or even 1) if your data is large, since a low
         batch size results in added round-trips to the server.
 
+        The optional `scan_batching` is for advanced usage only; it
+        translates to `Scan.setBatching()` at the Java side (inside the
+        Thrift server). By setting this value rows may be split into
+        partial rows, so result rows may be incomplete, and the number
+        of results returned by the scanner may no longer correspond to
+        the number of rows matched by the scan.
+
+        If `sorted_columns` is `True`, the columns in the rows returned
+        by this scanner will be retrieved in sorted order, and the data
+        will be stored in `OrderedDict` instances.
+
         **Compatibility notes:**
 
         * The `filter` argument is only available when using HBase 0.92
@@ -274,6 +281,7 @@ class Table(object):
         :param int timestamp: timestamp (optional)
         :param bool include_timestamp: whether timestamps are returned
         :param int batch_size: batch size for retrieving resuls
+        :param int scan_batching: server-side scan batching (optional)
         :param int limit: max number of rows to return
         :param bool sorted_columns: whether to return sorted columns
 
@@ -327,10 +335,25 @@ class Table(object):
                         self.name, row_start, row_stop, columns, timestamp, {})
 
         else:
-            # The scan's caching size is set to the batch_size, so that
-            # the HTable on the Java side retrieves rows from the region
-            # servers in the same chunk sizes that it sends out over
-            # Thrift.
+            # XXX: The "batch_size" can be slightly confusing to those
+            # familiar with the HBase Java API:
+            #
+            # * TScan.caching (Thrift API) translates to
+            #   Scan.setCaching() (Java API)
+            #
+            # * TScan.batchSize (Thrift API) translates to
+            #   Scan.setBatching() (Java API)
+            #
+            # However, we set Scan.setCaching() to what is called
+            # batch_size in the HappyBase API, so that the HTable on the
+            # Java side (inside the Thrift server) retrieves rows from
+            # the region servers in the same chunk sizes that it sends
+            # out again to Python (over Thrift). This cannot be tweaked
+            # (by design).
+            #
+            # The Scan.setBatching() value (Java API), which possibly
+            # cuts rows into multiple partial rows, can be set using the
+            # slightly strange name scan_batching.
             scan = TScan(
                 startRow=row_start,
                 stopRow=row_stop,
@@ -338,7 +361,7 @@ class Table(object):
                 columns=columns,
                 caching=batch_size,
                 filterString=filter,
-                batchSize=batch_size,
+                batchSize=scan_batching,
                 sortColumns=sorted_columns,
             )
             scan_id = self.connection.client.scannerOpenWithScan(
author	Wouter Bolsterlee <uws@xs4all.nl>	2014-02-25 21:08:21 +0100
committer	Wouter Bolsterlee <uws@xs4all.nl>	2014-02-25 21:08:21 +0100
commit	106dcf070574c5be9bdeba90c689b09e3fbed71b (patch)
tree	5a73f162f4f2fd3c08634dbe8014e1f3a7f323ec /happybase
parent	da109ab7b3c8a6ede332bc0d99e3e71cbecf5154 (diff)
download	happybase-106dcf070574c5be9bdeba90c689b09e3fbed71b.tar.gz