From a537eeec64581f8f51b55b0cc68f4267155337ca Mon Sep 17 00:00:00 2001 From: Dana Powers Date: Sat, 7 Oct 2017 13:57:08 -0700 Subject: KAFKA-3977: Defer fetch parsing for space efficiency, and to raise exceptions to user --- kafka/consumer/fetcher.py | 491 ++++++++++++++++++++++------------------------ 1 file changed, 230 insertions(+), 261 deletions(-) diff --git a/kafka/consumer/fetcher.py b/kafka/consumer/fetcher.py index f552038..ebf6099 100644 --- a/kafka/consumer/fetcher.py +++ b/kafka/consumer/fetcher.py @@ -28,6 +28,11 @@ ConsumerRecord = collections.namedtuple("ConsumerRecord", "key", "value", "checksum", "serialized_key_size", "serialized_value_size"]) +CompletedFetch = collections.namedtuple("CompletedFetch", + ["topic_partition", "fetched_offset", "response_version", + "partition_data", "metric_aggregator"]) + + class NoOffsetForPartitionError(Errors.KafkaError): pass @@ -104,18 +109,15 @@ class Fetcher(six.Iterator): self._client = client self._subscriptions = subscriptions - self._records = collections.deque() # (offset, topic_partition, messages) - self._unauthorized_topics = set() - self._offset_out_of_range_partitions = dict() # {topic_partition: offset} - self._record_too_large_partitions = dict() # {topic_partition: offset} + self._completed_fetches = collections.deque() # Unparsed responses + self._next_partition_records = None # Holds a single PartitionRecords until fully consumed self._iterator = None self._fetch_futures = collections.deque() self._sensors = FetchManagerMetrics(metrics, self.config['metric_group_prefix']) def send_fetches(self): - """Send FetchRequests asynchronously for all assigned partitions. - - Note: noop if there are unconsumed records internal to the fetcher + """Send FetchRequests for all assigned partitions that do not already have + an in-flight fetch or pending fetch data. Returns: List of Futures: each future resolves to a FetchResponse @@ -125,7 +127,6 @@ class Fetcher(six.Iterator): if self._client.ready(node_id): log.debug("Sending FetchRequest to node %s", node_id) future = self._client.send(node_id, request) - future.error_on_callbacks=True future.add_callback(self._handle_fetch_response, request, time.time()) future.add_errback(log.error, 'Fetch to node %s failed: %s', node_id) futures.append(future) @@ -285,67 +286,6 @@ class Fetcher(six.Iterator): raise Errors.KafkaTimeoutError( "Failed to get offsets by timestamps in %s ms" % timeout_ms) - def _raise_if_offset_out_of_range(self): - """Check FetchResponses for offset out of range. - - Raises: - OffsetOutOfRangeError: if any partition from previous FetchResponse - contains OffsetOutOfRangeError and the default_reset_policy is - None - """ - if not self._offset_out_of_range_partitions: - return - - current_out_of_range_partitions = {} - - # filter only the fetchable partitions - for partition, offset in six.iteritems(self._offset_out_of_range_partitions): - if not self._subscriptions.is_fetchable(partition): - log.debug("Ignoring fetched records for %s since it is no" - " longer fetchable", partition) - continue - position = self._subscriptions.assignment[partition].position - # ignore partition if the current position != offset in FetchResponse - # e.g. 
after seek() - if position is not None and offset == position: - current_out_of_range_partitions[partition] = position - - self._offset_out_of_range_partitions.clear() - if current_out_of_range_partitions: - raise Errors.OffsetOutOfRangeError(current_out_of_range_partitions) - - def _raise_if_unauthorized_topics(self): - """Check FetchResponses for topic authorization failures. - - Raises: - TopicAuthorizationFailedError - """ - if self._unauthorized_topics: - topics = set(self._unauthorized_topics) - self._unauthorized_topics.clear() - raise Errors.TopicAuthorizationFailedError(topics) - - def _raise_if_record_too_large(self): - """Check FetchResponses for messages larger than the max per partition. - - Raises: - RecordTooLargeError: if there is a message larger than fetch size - """ - if not self._record_too_large_partitions: - return - - copied_record_too_large_partitions = dict(self._record_too_large_partitions) - self._record_too_large_partitions.clear() - - raise RecordTooLargeError( - "There are some messages at [Partition=Offset]: %s " - " whose size is larger than the fetch size %s" - " and hence cannot be ever returned." - " Increase the fetch size, or decrease the maximum message" - " size the broker will allow.", - copied_record_too_large_partitions, - self.config['max_partition_fetch_bytes']) - def fetched_records(self, max_records=None): """Returns previously fetched records and updates consumed offsets. @@ -375,22 +315,25 @@ class Fetcher(six.Iterator): if self._subscriptions.needs_partition_assignment: return {}, False - self._raise_if_offset_out_of_range() - self._raise_if_unauthorized_topics() - self._raise_if_record_too_large() - drained = collections.defaultdict(list) - partial = bool(self._records and max_records) - while self._records and max_records > 0: - part = self._records.popleft() - max_records -= self._append(drained, part, max_records) - if part.has_more(): - self._records.appendleft(part) + records_remaining = max_records + + while records_remaining > 0: + if not self._next_partition_records: + if not self._completed_fetches: + break + completion = self._completed_fetches.popleft() + self._next_partition_records = self._parse_fetched_data(completion) else: - partial &= False - return dict(drained), partial + records_remaining -= self._append(drained, + self._next_partition_records, + records_remaining) + return dict(drained), bool(self._completed_fetches) def _append(self, drained, part, max_records): + if not part: + return 0 + tp = part.topic_partition fetch_offset = part.fetch_offset if not self._subscriptions.is_assigned(tp): @@ -409,9 +352,8 @@ class Fetcher(six.Iterator): " %s since it is no longer fetchable", tp) elif fetch_offset == position: + # we are ensured to have at least one record since we already checked for emptiness part_records = part.take(max_records) - if not part_records: - return 0 next_offset = part_records[-1].offset + 1 log.log(0, "Returning fetched records at offset %d for assigned" @@ -444,93 +386,67 @@ class Fetcher(six.Iterator): if self._subscriptions.needs_partition_assignment: raise StopIteration('Subscription needs partition assignment') - while self._records: + while self._next_partition_records or self._completed_fetches: - # Check on each iteration since this is a generator - self._raise_if_offset_out_of_range() - self._raise_if_unauthorized_topics() - self._raise_if_record_too_large() + if not self._next_partition_records: + completion = self._completed_fetches.popleft() + self._next_partition_records = 
self._parse_fetched_data(completion) + continue # Send additional FetchRequests when the internal queue is low # this should enable moderate pipelining - if len(self._records) <= self.config['iterator_refetch_records']: + if len(self._completed_fetches) <= self.config['iterator_refetch_records']: self.send_fetches() - part = self._records.popleft() - - tp = part.topic_partition - fetch_offset = part.fetch_offset - if not self._subscriptions.is_assigned(tp): - # this can happen when a rebalance happened before - # fetched records are returned - log.debug("Not returning fetched records for partition %s" - " since it is no longer assigned", tp) - continue - - # note that the position should always be available - # as long as the partition is still assigned - position = self._subscriptions.assignment[tp].position - if not self._subscriptions.is_fetchable(tp): - # this can happen when a partition is paused before - # fetched records are returned - log.debug("Not returning fetched records for assigned partition" - " %s since it is no longer fetchable", tp) - - elif fetch_offset == position: - log.log(0, "Returning fetched records at offset %d for assigned" - " partition %s", position, tp) - - # We can ignore any prior signal to drop pending message sets - # because we are starting from a fresh one where fetch_offset == position - # i.e., the user seek()'d to this position - self._subscriptions.assignment[tp].drop_pending_message_set = False - - for msg in part.messages: - - # Because we are in a generator, it is possible for - # subscription state to change between yield calls - # so we need to re-check on each loop - # this should catch assignment changes, pauses - # and resets via seek_to_beginning / seek_to_end - if not self._subscriptions.is_fetchable(tp): - log.debug("Not returning fetched records for partition %s" - " since it is no longer fetchable", tp) - break - - # If there is a seek during message iteration, - # we should stop unpacking this message set and - # wait for a new fetch response that aligns with the - # new seek position - elif self._subscriptions.assignment[tp].drop_pending_message_set: - log.debug("Skipping remainder of message set for partition %s", tp) - self._subscriptions.assignment[tp].drop_pending_message_set = False - break - - # Compressed messagesets may include earlier messages - elif msg.offset < self._subscriptions.assignment[tp].position: - log.debug("Skipping message offset: %s (expecting %s)", - msg.offset, - self._subscriptions.assignment[tp].position) - continue + tp = self._next_partition_records.topic_partition - self._subscriptions.assignment[tp].position = msg.offset + 1 - yield msg + for msg in self._next_partition_records.take(sys.maxint): - else: - # these records aren't next in line based on the last consumed - # position, ignore them they must be from an obsolete request - log.debug("Ignoring fetched records for %s at offset %s since" - " the current position is %d", tp, part.fetch_offset, - position) + # Because we are in a generator, it is possible for + # subscription state to change between yield calls + # so we need to re-check on each loop + # this should catch assignment changes, pauses + # and resets via seek_to_beginning / seek_to_end + if not self._subscriptions.is_fetchable(tp): + log.debug("Not returning fetched records for partition %s" + " since it is no longer fetchable", tp) + self._next_partition_records = None + break + + # If there is a seek during message iteration, + # we should stop unpacking this message set and + # wait 
for a new fetch response that aligns with the + # new seek position + elif self._subscriptions.assignment[tp].drop_pending_message_set: + log.debug("Skipping remainder of message set for partition %s", tp) + self._subscriptions.assignment[tp].drop_pending_message_set = False + self._next_partition_records = None + break + + # Compressed messagesets may include earlier messages + elif msg.offset < self._subscriptions.assignment[tp].position: + log.debug("Skipping message offset: %s (expecting %s)", + msg.offset, + self._subscriptions.assignment[tp].position) + continue + + self._subscriptions.assignment[tp].position = msg.offset + 1 + yield msg + + self._next_partition_records = None def _unpack_message_set(self, tp, messages): try: for offset, size, msg in messages: if self.config['check_crcs'] and not msg.validate_crc(): raise Errors.InvalidMessageError(msg) - elif msg.is_compressed(): - # If relative offset is used, we need to decompress the entire message first to compute - # the absolute offset. + + if not msg.is_compressed(): + yield self._parse_record(tp, offset, msg.timestamp, msg) + + else: + # If relative offset is used, we need to decompress the entire message first + # to compute the absolute offset. inner_mset = msg.decompress() # There should only ever be a single layer of compression @@ -569,31 +485,7 @@ class Fetcher(six.Iterator): if absolute_base_offset >= 0: inner_offset += absolute_base_offset - - key = self._deserialize( - self.config['key_deserializer'], - tp.topic, inner_msg.key) - value = self._deserialize( - self.config['value_deserializer'], - tp.topic, inner_msg.value) - yield ConsumerRecord(tp.topic, tp.partition, inner_offset, - inner_timestamp, msg.timestamp_type, - key, value, inner_msg.crc, - len(inner_msg.key) if inner_msg.key is not None else -1, - len(inner_msg.value) if inner_msg.value is not None else -1) - - else: - key = self._deserialize( - self.config['key_deserializer'], - tp.topic, msg.key) - value = self._deserialize( - self.config['value_deserializer'], - tp.topic, msg.value) - yield ConsumerRecord(tp.topic, tp.partition, offset, - msg.timestamp, msg.timestamp_type, - key, value, msg.crc, - len(msg.key) if msg.key is not None else -1, - len(msg.value) if msg.value is not None else -1) + yield self._parse_record(tp, inner_offset, inner_timestamp, inner_msg) # If unpacking raises StopIteration, it is erroneously # caught by the generator. 
We want all exceptions to be raised @@ -608,6 +500,15 @@ class Fetcher(six.Iterator): log.exception('AssertionError raised unpacking messageset: %s', e) raise + def _parse_record(self, tp, offset, timestamp, msg): + key = self._deserialize(self.config['key_deserializer'], tp.topic, msg.key) + value = self._deserialize(self.config['value_deserializer'], tp.topic, msg.value) + return ConsumerRecord(tp.topic, tp.partition, offset, + timestamp, msg.timestamp_type, + key, value, msg.crc, + len(msg.key) if msg.key is not None else -1, + len(msg.value) if msg.value is not None else -1) + def __iter__(self): # pylint: disable=non-iterator-returned return self @@ -764,8 +665,11 @@ class Fetcher(six.Iterator): def _fetchable_partitions(self): fetchable = self._subscriptions.fetchable_partitions() - pending = set([part.topic_partition for part in self._records]) - return fetchable.difference(pending) + if self._next_partition_records: + fetchable.remove(self._next_partition_records.topic_partition) + for fetch in self._completed_fetches: + fetchable.remove(fetch.topic_partition) + return fetchable def _create_fetch_requests(self): """Create fetch requests for all assigned partitions, grouped by node. @@ -835,93 +739,126 @@ class Fetcher(six.Iterator): def _handle_fetch_response(self, request, send_time, response): """The callback for fetch completion""" - total_bytes = 0 - total_count = 0 - recv_time = time.time() - fetch_offsets = {} for topic, partitions in request.topics: - for partition, offset, _ in partitions: + for partition_data in partitions: + partition, offset = partition_data[:2] fetch_offsets[TopicPartition(topic, partition)] = offset + partitions = set([TopicPartition(topic, partition_data[0]) + for topic, partitions in response.topics + for partition_data in partitions]) + metric_aggregator = FetchResponseMetricAggregator(self._sensors, partitions) + # randomized ordering should improve balance for short-lived consumers random.shuffle(response.topics) for topic, partitions in response.topics: random.shuffle(partitions) - for partition, error_code, highwater, messages in partitions: - tp = TopicPartition(topic, partition) - error_type = Errors.for_code(error_code) - if not self._subscriptions.is_fetchable(tp): - # this can happen when a rebalance happened or a partition - # consumption paused while fetch is still in-flight - log.debug("Ignoring fetched records for partition %s" - " since it is no longer fetchable", tp) + for partition_data in partitions: + tp = TopicPartition(topic, partition_data[0]) + completed_fetch = CompletedFetch( + tp, fetch_offsets[tp], + response.API_VERSION, + partition_data[1:], + metric_aggregator + ) + self._completed_fetches.append(completed_fetch) - elif error_type is Errors.NoError: - self._subscriptions.assignment[tp].highwater = highwater - - # we are interested in this fetch only if the beginning - # offset (of the *request*) matches the current consumed position - # Note that the *response* may return a messageset that starts - # earlier (e.g., compressed messages) or later (e.g., compacted topic) - fetch_offset = fetch_offsets[tp] - position = self._subscriptions.assignment[tp].position - if position is None or position != fetch_offset: - log.debug("Discarding fetch response for partition %s" - " since its offset %d does not match the" - " expected offset %d", tp, fetch_offset, - position) - continue + if response.API_VERSION >= 1: + self._sensors.fetch_throttle_time_sensor.record(response.throttle_time_ms) + 
self._sensors.fetch_latency.record((time.time() - send_time) * 1000) - num_bytes = 0 - partial = None - if messages and isinstance(messages[-1][-1], PartialMessage): - partial = messages.pop() - - if messages: - log.debug("Adding fetched record for partition %s with" - " offset %d to buffered record list", tp, - position) - unpacked = list(self._unpack_message_set(tp, messages)) - self._records.append(self.PartitionRecords(fetch_offset, tp, unpacked)) - last_offset, _, _ = messages[-1] - self._sensors.records_fetch_lag.record(highwater - last_offset) - num_bytes = sum(msg[1] for msg in messages) - elif partial: - # we did not read a single message from a non-empty - # buffer because that message's size is larger than - # fetch size, in this case record this exception - self._record_too_large_partitions[tp] = fetch_offset - - self._sensors.record_topic_fetch_metrics(topic, num_bytes, len(messages)) - total_bytes += num_bytes - total_count += len(messages) - elif error_type in (Errors.NotLeaderForPartitionError, - Errors.UnknownTopicOrPartitionError): - self._client.cluster.request_update() - elif error_type is Errors.OffsetOutOfRangeError: - fetch_offset = fetch_offsets[tp] + def _parse_fetched_data(self, completed_fetch): + tp = completed_fetch.topic_partition + partition = completed_fetch.partition_data + fetch_offset = completed_fetch.fetched_offset + num_bytes = 0 + records_count = 0 + parsed_records = None + + error_code, highwater = completed_fetch.partition_data[:2] + error_type = Errors.for_code(error_code) + messages = completed_fetch.partition_data[-1] + + try: + if not self._subscriptions.is_fetchable(tp): + # this can happen when a rebalance happened or a partition + # consumption paused while fetch is still in-flight + log.debug("Ignoring fetched records for partition %s" + " since it is no longer fetchable", tp) + + elif error_type is Errors.NoError: + self._subscriptions.assignment[tp].highwater = highwater + + # we are interested in this fetch only if the beginning + # offset (of the *request*) matches the current consumed position + # Note that the *response* may return a messageset that starts + # earlier (e.g., compressed messages) or later (e.g., compacted topic) + position = self._subscriptions.assignment[tp].position + if position is None or position != fetch_offset: + log.debug("Discarding fetch response for partition %s" + " since its offset %d does not match the" + " expected offset %d", tp, fetch_offset, + position) + return None + + partial = None + if messages and isinstance(messages[-1][-1], PartialMessage): + partial = messages.pop() + + if messages: + log.debug("Adding fetched record for partition %s with" + " offset %d to buffered record list", tp, + position) + unpacked = list(self._unpack_message_set(tp, messages)) + parsed_records = self.PartitionRecords(fetch_offset, tp, unpacked) + last_offset, _, _ = messages[-1] + self._sensors.records_fetch_lag.record(highwater - last_offset) + num_bytes = sum(msg[1] for msg in messages) + records_count = len(messages) + elif partial: + # we did not read a single message from a non-empty + # buffer because that message's size is larger than + # fetch size, in this case record this exception + record_too_large_partitions = {tp: fetch_offset} + raise RecordTooLargeError( + "There are some messages at [Partition=Offset]: %s " + " whose size is larger than the fetch size %s" + " and hence cannot be ever returned." + " Increase the fetch size, or decrease the maximum message" + " size the broker will allow." 
% ( + record_too_large_partitions, + self.config['max_partition_fetch_bytes']), + record_too_large_partitions) + self._sensors.record_topic_fetch_metrics(tp.topic, num_bytes, records_count) + + elif error_type in (Errors.NotLeaderForPartitionError, + Errors.UnknownTopicOrPartitionError): + self._client.cluster.request_update() + elif error_type is Errors.OffsetOutOfRangeError: + position = self._subscriptions.assignment[tp].position + if position is None or position != fetch_offset: + log.debug("Discarding stale fetch response for partition %s" + " since the fetched offset %d does not match the" + " current offset %d", tp, fetch_offset, position) + elif self._subscriptions.has_default_offset_reset_policy(): log.info("Fetch offset %s is out of range for topic-partition %s", fetch_offset, tp) - if self._subscriptions.has_default_offset_reset_policy(): - self._subscriptions.need_offset_reset(tp) - log.info("Resetting offset for topic-partition %s", tp) - else: - self._offset_out_of_range_partitions[tp] = fetch_offset - elif error_type is Errors.TopicAuthorizationFailedError: - log.warn("Not authorized to read from topic %s.", tp.topic) - self._unauthorized_topics.add(tp.topic) - elif error_type is Errors.UnknownError: - log.warn("Unknown error fetching data for topic-partition %s", tp) + self._subscriptions.need_offset_reset(tp) else: - raise error_type('Unexpected error while fetching data') + raise Errors.OffsetOutOfRangeError({tp: fetch_offset}) - # Because we are currently decompressing messages lazily, the sensors here - # will get compressed bytes / message set stats when compression is enabled - self._sensors.bytes_fetched.record(total_bytes) - self._sensors.records_fetched.record(total_count) - if response.API_VERSION >= 1: - self._sensors.fetch_throttle_time_sensor.record(response.throttle_time_ms) - self._sensors.fetch_latency.record((recv_time - send_time) * 1000) + elif error_type is Errors.TopicAuthorizationFailedError: + log.warn("Not authorized to read from topic %s.", tp.topic) + raise Errors.TopicAuthorizationFailedError(set(tp.topic)) + elif error_type is Errors.UnknownError: + log.warn("Unknown error fetching data for topic-partition %s", tp) + else: + raise error_type('Unexpected error while fetching data') + + finally: + completed_fetch.metric_aggregator.record(tp, num_bytes, records_count) + + return parsed_records class PartitionRecords(object): def __init__(self, fetch_offset, tp, messages): @@ -935,21 +872,53 @@ class Fetcher(six.Iterator): if msg.offset == fetch_offset: self.message_idx = i + # For truthiness evaluation we need to define __len__ or __nonzero__ + def __len__(self): + if self.messages is None or self.message_idx >= len(self.messages): + return 0 + return len(self.messages) - self.message_idx + def discard(self): self.messages = None def take(self, n): - if not self.has_more(): + if not len(self): return [] next_idx = self.message_idx + n res = self.messages[self.message_idx:next_idx] self.message_idx = next_idx - if self.has_more(): + if len(self) > 0: self.fetch_offset = self.messages[self.message_idx].offset return res - def has_more(self): - return self.messages and self.message_idx < len(self.messages) + +class FetchResponseMetricAggregator(object): + """ + Since we parse the message data for each partition from each fetch + response lazily, fetch-level metrics need to be aggregated as the messages + from each partition are parsed. This class is used to facilitate this + incremental aggregation. 
+ """ + def __init__(self, sensors, partitions): + self.sensors = sensors + self.unrecorded_partitions = partitions + self.total_bytes = 0 + self.total_records = 0 + + def record(self, partition, num_bytes, num_records): + """ + After each partition is parsed, we update the current metric totals + with the total bytes and number of records parsed. After all partitions + have reported, we write the metric. + """ + self.unrecorded_partitions.remove(partition) + self.total_bytes += num_bytes + self.total_records += num_records + + # once all expected partitions from the fetch have reported in, record the metrics + if not self.unrecorded_partitions: + self.sensors.bytes_fetched.record(self.total_bytes) + self.sensors.records_fetched.record(self.total_records) class FetchManagerMetrics(object): -- cgit v1.2.1 From 645129b00f63eab6368c5e9aca137463b63c0c9d Mon Sep 17 00:00:00 2001 From: Dana Powers Date: Sat, 7 Oct 2017 23:31:33 -0700 Subject: Add tests for Fetcher.fetched_records and _handle_fetch_response --- test/test_fetcher.py | 132 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 118 insertions(+), 14 deletions(-) diff --git a/test/test_fetcher.py b/test/test_fetcher.py index 86d154f..3bf26db 100644 --- a/test/test_fetcher.py +++ b/test/test_fetcher.py @@ -3,14 +3,18 @@ from __future__ import absolute_import import pytest -import itertools from collections import OrderedDict +import itertools +import time from kafka.client_async import KafkaClient -from kafka.consumer.fetcher import ConsumerRecord, Fetcher, NoOffsetForPartitionError +from kafka.consumer.fetcher import ( + CompletedFetch, ConsumerRecord, Fetcher, NoOffsetForPartitionError +) from kafka.consumer.subscription_state import SubscriptionState from kafka.metrics import Metrics -from kafka.protocol.fetch import FetchRequest +from kafka.protocol.fetch import FetchRequest, FetchResponse +from kafka.protocol.message import Message from kafka.protocol.offset import OffsetResponse from kafka.structs import TopicPartition from kafka.future import Future @@ -31,28 +35,33 @@ def subscription_state(): @pytest.fixture -def fetcher(client, subscription_state): - subscription_state.subscribe(topics=['foobar']) - assignment = [TopicPartition('foobar', i) for i in range(3)] +def topic(): + return 'foobar' + + +@pytest.fixture +def fetcher(client, subscription_state, topic): + subscription_state.subscribe(topics=[topic]) + assignment = [TopicPartition(topic, i) for i in range(3)] subscription_state.assign_from_subscribed(assignment) for tp in assignment: subscription_state.seek(tp, 0) return Fetcher(client, subscription_state, Metrics()) -def test_send_fetches(fetcher, mocker): +def test_send_fetches(fetcher, topic, mocker): fetch_requests = [ FetchRequest[0]( -1, fetcher.config['fetch_max_wait_ms'], fetcher.config['fetch_min_bytes'], - [('foobar', [ + [(topic, [ (0, 0, fetcher.config['max_partition_fetch_bytes']), (1, 0, fetcher.config['max_partition_fetch_bytes']), ])]), FetchRequest[0]( -1, fetcher.config['fetch_max_wait_ms'], fetcher.config['fetch_min_bytes'], - [('foobar', [ + [(topic, [ (2, 0, fetcher.config['max_partition_fetch_bytes']), ])]) ] @@ -80,9 +89,9 @@ def test_create_fetch_requests(fetcher, mocker, api_version, fetch_version): assert all([isinstance(r, FetchRequest[fetch_version]) for r in requests]) -def test_update_fetch_positions(fetcher, mocker): +def test_update_fetch_positions(fetcher, topic, mocker): mocker.patch.object(fetcher, '_reset_offset') - partition = TopicPartition('foobar', 0) + partition = 
TopicPartition(topic, 0) # unassigned partition fetcher.update_fetch_positions([TopicPartition('fizzbuzz', 0)]) @@ -296,12 +305,107 @@ def test_partition_records_offset(): None, None, 'key', 'value', 'checksum', 0, 0) for i in range(batch_start, batch_end)] records = Fetcher.PartitionRecords(fetch_offset, None, messages) - assert records.has_more() + assert len(records) > 0 msgs = records.take(1) assert msgs[0].offset == 123 assert records.fetch_offset == 124 msgs = records.take(2) assert len(msgs) == 2 - assert records.has_more() + assert len(records) > 0 records.discard() - assert not records.has_more() + assert len(records) == 0 + + +def test_fetched_records(fetcher, topic, mocker): + tp = TopicPartition(topic, 0) + msgs = [] + for i in range(10): + msg = Message(b'foo') + msgs.append((i, -1, msg)) + completed_fetch = CompletedFetch( + tp, 0, 0, [0, 100, msgs], + mocker.MagicMock() + ) + fetcher._completed_fetches.append(completed_fetch) + fetcher.config['check_crcs'] = False + records, partial = fetcher.fetched_records() + assert tp in records + assert len(records[tp]) == len(msgs) + assert all(map(lambda x: isinstance(x, ConsumerRecord), records[tp])) + assert partial is False + + +@pytest.mark.parametrize(("fetch_request", "fetch_response", "num_partitions"), [ + ( + FetchRequest[0]( + -1, 100, 100, + [('foo', [(0, 0, 1000),])]), + FetchResponse[0]( + [("foo", [(0, 0, 1000, [(0, Message(b'abc', magic=0)._encode_self()),])]),]), + 1, + ), + ( + FetchRequest[1]( + -1, 100, 100, + [('foo', [(0, 0, 1000), (1, 0, 1000),])]), + FetchResponse[1]( + 0, + [("foo", [ + (0, 0, 1000, [(0, Message(b'abc', magic=0)._encode_self()),]), + (1, 0, 1000, [(0, Message(b'abc', magic=0)._encode_self()),]), + ]),]), + 2, + ), + ( + FetchRequest[2]( + -1, 100, 100, + [('foo', [(0, 0, 1000),])]), + FetchResponse[2]( + 0, [("foo", [(0, 0, 1000, [(0, Message(b'abc', magic=1)._encode_self()),])]),]), + 1, + ), + ( + FetchRequest[3]( + -1, 100, 100, 10000, + [('foo', [(0, 0, 1000),])]), + FetchResponse[3]( + 0, [("foo", [(0, 0, 1000, [(0, b'xxx'),])]),]), + 1, + ), + ( + FetchRequest[4]( + -1, 100, 100, 10000, 0, + [('foo', [(0, 0, 1000),])]), + FetchResponse[4]( + 0, [("foo", [(0, 0, 1000, 0, [], [(0, b'xxx'),])]),]), + 1, + ), + ( + # This may only be used in broker-broker api calls + FetchRequest[5]( + -1, 100, 100, 10000, 0, + [('foo', [(0, 0, 1000),])]), + FetchResponse[5]( + 0, [("foo", [(0, 0, 1000, 0, 0, [], [(0, b'xxx'),])]),]), + 1, + ), +]) +def test__handle_fetch_response(fetcher, fetch_request, fetch_response, num_partitions): + fetcher._handle_fetch_response(fetch_request, time.time(), fetch_response) + assert len(fetcher._completed_fetches) == num_partitions + + +def test__unpack_message_set(): + pass + + +def test__parse_record(): + pass + + +def test_message_generator(): + pass + + +def test__parse_fetched_data(): + pass -- cgit v1.2.1 From a7d8ae5411cc74b119ca91a8ff84ddc68cd47c93 Mon Sep 17 00:00:00 2001 From: Dana Powers Date: Sun, 8 Oct 2017 01:15:34 -0700 Subject: Avoid sys.maxint; not supported on py3 --- kafka/consumer/fetcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kafka/consumer/fetcher.py b/kafka/consumer/fetcher.py index ebf6099..d059a10 100644 --- a/kafka/consumer/fetcher.py +++ b/kafka/consumer/fetcher.py @@ -400,7 +400,7 @@ class Fetcher(six.Iterator): tp = self._next_partition_records.topic_partition - for msg in self._next_partition_records.take(sys.maxint): + for msg in self._next_partition_records.take(): # Because we are in a generator, it is 
possible for # subscription state to change between yield calls @@ -881,9 +881,11 @@ class Fetcher(six.Iterator): def discard(self): self.messages = None - def take(self, n): + def take(self, n=None): if not len(self): return [] + if n is None or n > len(self): + n = len(self) next_idx = self.message_idx + n res = self.messages[self.message_idx:next_idx] self.message_idx = next_idx -- cgit v1.2.1 From bc573e3d63a687903a9be2e1b3da2f943a7208e1 Mon Sep 17 00:00:00 2001 From: Dana Powers Date: Sun, 8 Oct 2017 10:50:06 -0700 Subject: More tests --- kafka/consumer/fetcher.py | 5 ++ test/test_fetcher.py | 210 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 200 insertions(+), 15 deletions(-) diff --git a/kafka/consumer/fetcher.py b/kafka/consumer/fetcher.py index d059a10..c4fa546 100644 --- a/kafka/consumer/fetcher.py +++ b/kafka/consumer/fetcher.py @@ -400,6 +400,11 @@ class Fetcher(six.Iterator): tp = self._next_partition_records.topic_partition + # We can ignore any prior signal to drop pending message sets + # because we are starting from a fresh one where fetch_offset == position + # i.e., the user seek()'d to this position + self._subscriptions.assignment[tp].drop_pending_message_set = False + for msg in self._next_partition_records.take(): # Because we are in a generator, it is possible for diff --git a/test/test_fetcher.py b/test/test_fetcher.py index 3bf26db..5da597c 100644 --- a/test/test_fetcher.py +++ b/test/test_fetcher.py @@ -8,6 +8,7 @@ import itertools import time from kafka.client_async import KafkaClient +from kafka.codec import gzip_encode from kafka.consumer.fetcher import ( CompletedFetch, ConsumerRecord, Fetcher, NoOffsetForPartitionError ) @@ -16,11 +17,12 @@ from kafka.metrics import Metrics from kafka.protocol.fetch import FetchRequest, FetchResponse from kafka.protocol.message import Message from kafka.protocol.offset import OffsetResponse +from kafka.protocol.types import Int64, Int32 from kafka.structs import TopicPartition from kafka.future import Future from kafka.errors import ( StaleMetadata, LeaderNotAvailableError, NotLeaderForPartitionError, - UnknownTopicOrPartitionError + UnknownTopicOrPartitionError, OffsetOutOfRangeError ) @@ -294,7 +296,7 @@ def test__handle_offset_response(fetcher, mocker): def test_partition_records_offset(): - """Test that compressed messagesets are handle correctly + """Test that compressed messagesets are handled correctly when fetch offset is in the middle of the message list """ batch_start = 120 @@ -317,6 +319,7 @@ def test_partition_records_offset(): def test_fetched_records(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False tp = TopicPartition(topic, 0) msgs = [] for i in range(10): @@ -327,7 +330,6 @@ def test_fetched_records(fetcher, topic, mocker): mocker.MagicMock() ) fetcher._completed_fetches.append(completed_fetch) - fetcher.config['check_crcs'] = False records, partial = fetcher.fetched_records() assert tp in records assert len(records[tp]) == len(msgs) @@ -341,7 +343,7 @@ def test_fetched_records(fetcher, topic, mocker): -1, 100, 100, [('foo', [(0, 0, 1000),])]), FetchResponse[0]( - [("foo", [(0, 0, 1000, [(0, Message(b'abc', magic=0)._encode_self()),])]),]), + [("foo", [(0, 0, 1000, [(0, b'xxx'),])]),]), 1, ), ( @@ -351,8 +353,8 @@ def test_fetched_records(fetcher, topic, mocker): FetchResponse[1]( 0, [("foo", [ - (0, 0, 1000, [(0, Message(b'abc', magic=0)._encode_self()),]), - (1, 0, 1000, [(0, Message(b'abc', magic=0)._encode_self()),]), + (0, 0, 1000, [(0, b'xxx'),]), + (1, 0, 1000, [(0, 
b'xxx'),]), ]),]), 2, ), @@ -361,7 +363,7 @@ def test_fetched_records(fetcher, topic, mocker): -1, 100, 100, [('foo', [(0, 0, 1000),])]), FetchResponse[2]( - 0, [("foo", [(0, 0, 1000, [(0, Message(b'abc', magic=1)._encode_self()),])]),]), + 0, [("foo", [(0, 0, 1000, [(0, b'xxx'),])]),]), 1, ), ( @@ -395,17 +397,195 @@ def test__handle_fetch_response(fetcher, fetch_request, fetch_response, num_part assert len(fetcher._completed_fetches) == num_partitions -def test__unpack_message_set(): - pass +def test__unpack_message_set(fetcher): + fetcher.config['check_crcs'] = False + tp = TopicPartition('foo', 0) + messages = [ + (0, None, Message(b'a')), + (1, None, Message(b'b')), + (2, None, Message(b'c')) + ] + records = list(fetcher._unpack_message_set(tp, messages)) + assert len(records) == 3 + assert all(map(lambda x: isinstance(x, ConsumerRecord), records)) + assert records[0].value == b'a' + assert records[1].value == b'b' + assert records[2].value == b'c' + assert records[0].offset == 0 + assert records[1].offset == 1 + assert records[2].offset == 2 + + +def test__unpack_message_set_compressed_v0(fetcher): + fetcher.config['check_crcs'] = False + tp = TopicPartition('foo', 0) + messages = [ + (0, None, Message(b'a')), + (1, None, Message(b'b')), + (2, None, Message(b'c')), + ] + message_bytes = [] + for offset, _, m in messages: + encoded = m.encode() + message_bytes.append(Int64.encode(offset) + Int32.encode(len(encoded)) + encoded) + compressed_bytes = gzip_encode(b''.join(message_bytes)) + compressed_base_offset = 0 + compressed_msgs = [ + (compressed_base_offset, None, + Message(compressed_bytes, + magic=0, + attributes=Message.CODEC_GZIP)) + ] + records = list(fetcher._unpack_message_set(tp, compressed_msgs)) + assert len(records) == 3 + assert all(map(lambda x: isinstance(x, ConsumerRecord), records)) + assert records[0].value == b'a' + assert records[1].value == b'b' + assert records[2].value == b'c' + assert records[0].offset == 0 + assert records[1].offset == 1 + assert records[2].offset == 2 + + +def test__unpack_message_set_compressed_v1(fetcher): + fetcher.config['check_crcs'] = False + tp = TopicPartition('foo', 0) + messages = [ + (0, None, Message(b'a')), + (1, None, Message(b'b')), + (2, None, Message(b'c')), + ] + message_bytes = [] + for offset, _, m in messages: + encoded = m.encode() + message_bytes.append(Int64.encode(offset) + Int32.encode(len(encoded)) + encoded) + compressed_bytes = gzip_encode(b''.join(message_bytes)) + compressed_base_offset = 10 + compressed_msgs = [ + (compressed_base_offset, None, + Message(compressed_bytes, + magic=1, + attributes=Message.CODEC_GZIP)) + ] + records = list(fetcher._unpack_message_set(tp, compressed_msgs)) + assert len(records) == 3 + assert all(map(lambda x: isinstance(x, ConsumerRecord), records)) + assert records[0].value == b'a' + assert records[1].value == b'b' + assert records[2].value == b'c' + assert records[0].offset == 8 + assert records[1].offset == 9 + assert records[2].offset == 10 + + +def test__parse_record(fetcher): + tp = TopicPartition('foo', 0) + record = fetcher._parse_record(tp, 123, 456, Message(b'abc')) + assert record.topic == 'foo' + assert record.partition == 0 + assert record.offset == 123 + assert record.timestamp == 456 + assert record.value == b'abc' + assert record.key is None -def test__parse_record(): - pass +def test__message_generator(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + msgs = [] + for i in range(10): + msg = Message(b'foo') + 
msgs.append((i, -1, msg)) + completed_fetch = CompletedFetch( + tp, 0, 0, [0, 100, msgs], + mocker.MagicMock() + ) + fetcher._completed_fetches.append(completed_fetch) + for i in range(10): + msg = next(fetcher) + assert isinstance(msg, ConsumerRecord) + assert msg.offset == i + assert msg.value == b'foo' -def test_message_generator(): - pass +def test__parse_fetched_data(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + msgs = [] + for i in range(10): + msg = Message(b'foo') + msgs.append((i, -1, msg)) + completed_fetch = CompletedFetch( + tp, 0, 0, [0, 100, msgs], + mocker.MagicMock() + ) + partition_record = fetcher._parse_fetched_data(completed_fetch) + assert isinstance(partition_record, fetcher.PartitionRecords) + assert len(partition_record) == 10 -def test__parse_fetched_data(): - pass +def test__parse_fetched_data__paused(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + msgs = [] + for i in range(10): + msg = Message(b'foo') + msgs.append((i, -1, msg)) + completed_fetch = CompletedFetch( + tp, 0, 0, [0, 100, msgs], + mocker.MagicMock() + ) + fetcher._subscriptions.pause(tp) + partition_record = fetcher._parse_fetched_data(completed_fetch) + assert partition_record is None + + +def test__parse_fetched_data__stale_offset(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + msgs = [] + for i in range(10): + msg = Message(b'foo') + msgs.append((i, -1, msg)) + completed_fetch = CompletedFetch( + tp, 10, 0, [0, 100, msgs], + mocker.MagicMock() + ) + partition_record = fetcher._parse_fetched_data(completed_fetch) + assert partition_record is None + + +def test__parse_fetched_data__not_leader(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + completed_fetch = CompletedFetch( + tp, 0, 0, [NotLeaderForPartitionError.errno, -1, None], + mocker.MagicMock() + ) + partition_record = fetcher._parse_fetched_data(completed_fetch) + assert partition_record is None + fetcher._client.cluster.request_update.assert_called_with() + + +def test__parse_fetched_data__unknown_tp(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + completed_fetch = CompletedFetch( + tp, 0, 0, [UnknownTopicOrPartitionError.errno, -1, None], + mocker.MagicMock() + ) + partition_record = fetcher._parse_fetched_data(completed_fetch) + assert partition_record is None + fetcher._client.cluster.request_update.assert_called_with() + + +def test__parse_fetched_data__out_of_range(fetcher, topic, mocker): + fetcher.config['check_crcs'] = False + tp = TopicPartition(topic, 0) + completed_fetch = CompletedFetch( + tp, 0, 0, [OffsetOutOfRangeError.errno, -1, None], + mocker.MagicMock() + ) + partition_record = fetcher._parse_fetched_data(completed_fetch) + assert partition_record is None + assert fetcher._subscriptions.assignment[tp].awaiting_reset is True -- cgit v1.2.1
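
As a standalone illustration of the pattern this patch series introduces — queue raw fetch responses unparsed, parse one partition at a time when records are actually requested, raise per-partition errors at that point, and aggregate fetch-level metrics only once every partition from a response has reported — the sketch below mirrors the flow in simplified form. It is illustrative only: RawFetch, MetricAggregator, and drain are hypothetical names rather than the kafka-python API, and details such as partially consumed partitions and subscription/fetchability checks are omitted.

import collections

# Hypothetical stand-ins for CompletedFetch and FetchResponseMetricAggregator.
RawFetch = collections.namedtuple(
    "RawFetch", ["partition", "error", "messages", "aggregator"])


class MetricAggregator:
    """Record a single fetch-level total only after every partition from
    the response has been parsed (or skipped)."""
    def __init__(self, partitions):
        self.unrecorded = set(partitions)
        self.total_records = 0

    def record(self, partition, num_records):
        self.unrecorded.discard(partition)
        self.total_records += num_records
        if not self.unrecorded:
            print("fetch complete, total records:", self.total_records)


def drain(completed, max_records):
    """Parse queued responses lazily; per-partition errors are raised here,
    at consumption time, rather than being stashed for a later check."""
    drained = collections.defaultdict(list)
    remaining = max_records
    while remaining > 0 and completed:
        fetch = completed.popleft()
        taken = 0
        try:
            if fetch.error is not None:
                raise fetch.error
            batch = fetch.messages[:remaining]
            taken = len(batch)
            drained[fetch.partition].extend(batch)
            remaining -= taken
        finally:
            # Metrics are aggregated even when a partition errors out,
            # matching the try/finally in _parse_fetched_data above.
            fetch.aggregator.record(fetch.partition, taken)
    return dict(drained)


if __name__ == "__main__":
    agg = MetricAggregator({"foo-0", "foo-1"})
    queue = collections.deque([
        RawFetch("foo-0", None, ["a", "b", "c"], agg),
        RawFetch("foo-1", None, ["d"], agg),
    ])
    print(drain(queue, 10))

The point of raising inside the drain loop, as the patch does in _parse_fetched_data, is that errors such as OffsetOutOfRangeError or TopicAuthorizationFailedError reach the user on the poll that would have consumed that partition, instead of being accumulated and re-checked by separate _raise_if_* helpers on later calls.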