summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDana Powers <dana.powers@gmail.com>2018-02-05 16:25:18 -0800
committerGitHub <noreply@github.com>2018-02-05 16:25:18 -0800
commit441aeb864519d2f574650e24a327423308adca03 (patch)
tree8f834926048f56755bbd146b3e97a1370aa6afb4
parent618c5051493693c1305aa9f08e8a0583d5fcf0e3 (diff)
downloadkafka-python-441aeb864519d2f574650e24a327423308adca03.tar.gz
Avoid consuming duplicate compressed messages from mid-batch (#1367)
-rw-r--r--kafka/consumer/fetcher.py13
-rw-r--r--test/test_fetcher.py40
2 files changed, 51 insertions, 2 deletions
diff --git a/kafka/consumer/fetcher.py b/kafka/consumer/fetcher.py
index f9fcb37..c9bbb97 100644
--- a/kafka/consumer/fetcher.py
+++ b/kafka/consumer/fetcher.py
@@ -835,12 +835,21 @@ class Fetcher(six.Iterator):
return parsed_records
- class PartitionRecords(six.Iterator):
+ class PartitionRecords(object):
def __init__(self, fetch_offset, tp, messages):
self.fetch_offset = fetch_offset
self.topic_partition = tp
self.messages = messages
- self.message_idx = 0
+ # When fetching an offset that is in the middle of a
+ # compressed batch, we will get all messages in the batch.
+ # But we want to start 'take' at the fetch_offset
+ for i, msg in enumerate(messages):
+ if msg.offset == fetch_offset:
+ self.message_idx = i
+ break
+ else:
+ self.message_idx = 0
+ self.messages = None
# For truthiness evaluation we need to define __len__ or __nonzero__
def __len__(self):
diff --git a/test/test_fetcher.py b/test/test_fetcher.py
index 429071a..4547222 100644
--- a/test/test_fetcher.py
+++ b/test/test_fetcher.py
@@ -498,3 +498,43 @@ def test__parse_fetched_data__out_of_range(fetcher, topic, mocker):
partition_record = fetcher._parse_fetched_data(completed_fetch)
assert partition_record is None
assert fetcher._subscriptions.assignment[tp].awaiting_reset is True
+
+
+def test_partition_records_offset():
+    """Test that compressed messagesets are handled correctly
+ when fetch offset is in the middle of the message list
+ """
+ batch_start = 120
+ batch_end = 130
+ fetch_offset = 123
+ tp = TopicPartition('foo', 0)
+ messages = [ConsumerRecord(tp.topic, tp.partition, i,
+ None, None, 'key', 'value', 'checksum', 0, 0)
+ for i in range(batch_start, batch_end)]
+ records = Fetcher.PartitionRecords(fetch_offset, None, messages)
+ assert len(records) > 0
+ msgs = records.take(1)
+ assert msgs[0].offset == 123
+ assert records.fetch_offset == 124
+ msgs = records.take(2)
+ assert len(msgs) == 2
+ assert len(records) > 0
+ records.discard()
+ assert len(records) == 0
+
+
+def test_partition_records_empty():
+ records = Fetcher.PartitionRecords(0, None, [])
+ assert len(records) == 0
+
+
+def test_partition_records_no_fetch_offset():
+ batch_start = 0
+ batch_end = 100
+ fetch_offset = 123
+ tp = TopicPartition('foo', 0)
+ messages = [ConsumerRecord(tp.topic, tp.partition, i,
+ None, None, 'key', 'value', 'checksum', 0, 0)
+ for i in range(batch_start, batch_end)]
+ records = Fetcher.PartitionRecords(fetch_offset, None, messages)
+ assert len(records) == 0