author     Pete Zaitcev <zaitcev@kotori.zaitcev.us>   2021-05-21 21:06:49 -0500
committer  Pete Zaitcev <zaitcev@kotori.zaitcev.us>   2021-07-20 22:12:35 -0500
commit     92aef484b6ff09d11fdab953e993f1754a578d69 (patch)
tree       5077317eb93252d3f6963652dd568077c709f4da
parent     d739c3eba5b12ddf79ac597cc4ce80201306e72b (diff)
download   swift-92aef484b6ff09d11fdab953e993f1754a578d69.tar.gz
Dark Data Watcher: switch to agreement across the whole ring
Before, we required all servers that are up and reachable to agree that an
object is dark before we considered it as such and triggered an action. But
now that someone wanted to run with action=delete, a concern was raised that
a momentary network split could easily make the watcher start deleting
objects.

To guard against that, we now require that all servers in the whole ring
agree that the object is dark. As a side effect, if one of the container
servers is down, the watcher is effectively disabled. But that seems like a
better choice than deleting something by mistake.

Change-Id: I07fa2d39817bd34f7873731990e12ab991e14a6b
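In short, the new rule is: any error from a container server makes the
verdict "unknown" (no action is taken), any listing that contains the object
clears it, and only a clean sweep of "not listed" replies from every node in
the ring marks the object as dark. A minimal, self-contained sketch of that
decision rule follows; the classify() helper and its string reply values are
hypothetical stand-ins for the real loop over direct_get_container() in
get_info_1() below.

class ContainerError(Exception):
    """Verdict is unknown: at least one container server could not answer."""


def classify(replies):
    """Classify one object from per-node container replies.

    replies has one entry per container node in the ring: 'listed' if the
    node returned the object, 'missing' if it returned an empty listing,
    'error' if the request failed or timed out.
    """
    errors = 0
    for reply in replies:
        if reply == 'error':
            errors += 1            # this node cannot vote either way
            continue
        if reply == 'listed':
            return 'ok'            # one positive listing clears the object
    if errors:
        raise ContainerError()     # the whole ring did not agree; do not act
    return 'dark'                  # every node answered and none knows it


# The same permutations the new test below walks through:
assert classify(['listed', 'missing']) == 'ok'
assert classify(['missing', 'missing']) == 'dark'
try:
    classify(['missing', 'error'])     # unknown, because one server was down
except ContainerError:
    pass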
-rw-r--r--   swift/obj/watchers/dark_data.py    33
-rw-r--r--   test/unit/obj/test_auditor.py     117
2 files changed, 133 insertions, 17 deletions
diff --git a/swift/obj/watchers/dark_data.py b/swift/obj/watchers/dark_data.py
index 83b95ee54..01b3dc807 100644
--- a/swift/obj/watchers/dark_data.py
+++ b/swift/obj/watchers/dark_data.py
@@ -34,9 +34,13 @@ not have any particular bugs that trigger creation of dark data. So,
 this is an exercise in writing watchers, with a plausible function.
 
 When enabled, Dark Data watcher definitely drags down the cluster's overall
-performance. Of course, the load increase can be
-mitigated as usual, but at the expense of the total time taken by
-the pass of auditor.
+performance. Of course, the load increase can be mitigated as usual,
+but at the expense of the total time taken by the pass of auditor.
+
+Because the watcher only deems an object dark when all container
+servers agree, it will silently fail to detect anything if even one
+of the container servers in the ring is down or unreachable. This is
+done in the interest of operators who run with action=delete.
 
 Finally, keep in mind that Dark Data watcher needs the container
 ring to operate, but runs on an object node. This can come up if
@@ -152,7 +156,7 @@ def get_info_1(container_ring, obj_path, logger):
     # to a crawl (if this plugin is enabled).
     random.shuffle(container_nodes)
 
-    dark_flag = 0
+    err_flag = 0
     for node in container_nodes:
         try:
             headers, objs = direct_get_container(
@@ -160,17 +164,12 @@ def get_info_1(container_ring, obj_path, logger):
                 prefix=obj_name, limit=1)
         except (ClientException, Timeout):
             # Something is wrong with that server, treat as an error.
+            err_flag += 1
             continue
-        if not objs or objs[0]['name'] != obj_name:
-            dark_flag += 1
-            continue
-        return objs[0]
-
-    # We do not ask for a quorum of container servers to know the object.
-    # Even if 1 server knows the object, we return with the info above.
-    # So, we only end here when all servers either have no record of the
-    # object or error out. In such case, even one non-error server means
-    # that the object is dark.
-    if dark_flag:
-        return None
-    raise ContainerError()
+        if objs and objs[0]['name'] == obj_name:
+            return objs[0]
+
+    # We only report the object as dark if all known servers agree that it is.
+    if err_flag:
+        raise ContainerError()
+    return None
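The behaviour change above matters mostly to operators who configure the
watcher with action=delete, as mentioned in the commit message. For context,
a sketch of how the dark data watcher is typically enabled in
object-server.conf is shown below; the section and option names follow the
watcher's own documentation, but treat the exact values (especially
grace_age) as illustrative rather than prescriptive.

[object-auditor]
watchers = swift#dark_data

[object-auditor:watcher:swift#dark_data]
# 'log' only reports dark objects; 'delete' is the policy that motivated
# requiring agreement from the whole container ring before acting.
action = delete
# leave recently updated objects alone for a week (value in seconds)
grace_age = 604800

With a configuration like this, an auditor pass (for example, one started
with swift-object-auditor object-server.conf once) loads the watcher and
applies the chosen action only to objects the whole container ring agrees
are dark.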
diff --git a/test/unit/obj/test_auditor.py b/test/unit/obj/test_auditor.py
index 2c0bc0c48..20f2b8685 100644
--- a/test/unit/obj/test_auditor.py
+++ b/test/unit/obj/test_auditor.py
@@ -38,6 +38,7 @@ from swift.obj.diskfile import (
     DiskFile, write_metadata, invalidate_hash, get_data_dir,
     DiskFileManager, ECDiskFileManager, AuditLocation, clear_auditor_status,
     get_auditor_status, HASH_FILE, HASH_INVALIDATIONS_FILE)
+from swift.common.exceptions import ClientException
 from swift.common.utils import (
     mkdirs, normalize_timestamp, Timestamp, readconf, md5, PrefixLoggerAdapter)
 from swift.common.storage_policy import (
@@ -90,6 +91,26 @@ class FakeRing1(object):
         return (1, [node1])
 
 
+class FakeRing2(object):
+
+    def __init__(self, swift_dir, ring_name=None):
+        return
+
+    def get_nodes(self, *args, **kwargs):
+        nodes = []
+        for x in [1, 2]:
+            nodes.append({'ip': '10.0.0.%s' % x,
+                          'replication_ip': '10.0.0.%s' % x,
+                          'port': 6200 + x,
+                          'replication_port': 6200 + x,
+                          'device': 'sda',
+                          'zone': x % 3,
+                          'region': x % 2,
+                          'id': x,
+                          'handoff_index': 1})
+        return (1, nodes)
+
+
 class TestAuditorBase(unittest.TestCase):
 
     def setUp(self):
@@ -1776,6 +1797,7 @@ class TestAuditWatchers(TestAuditorBase):
             log_lines)
         self.logger.clear()
+
         # with grace_age=0 the DARK object will be older than
        # grace_age so will be logged
         ret_config = {'test_watcher1': {'action': 'log', 'grace_age': '0'}}
@@ -1817,6 +1839,101 @@
         self.assertEqual(0, watcher.grace_age)
         self.assertEqual('log', watcher.dark_data_policy)
 
+    def test_dark_data_agreement(self):
+
+        # The dark data watcher only sees an object as dark if all container
+        # servers in the ring reply without an error and return an empty
+        # listing. So, we have the following permutations for an object:
+        #
+        # Container Servers          Result
+        # CS1         CS2
+        # Listed      Listed         Good - the baseline result
+        # Listed      Error          Good
+        # Listed      Not listed     Good
+        # Error       Error          Unknown - the baseline failure
+        # Not listed  Error          Unknown
+        # Not listed  Not listed     Dark - the only such result!
+        #
+        scenario = [
+            {'cr': ['L', 'L'], 'res': 'G'},
+            {'cr': ['L', 'E'], 'res': 'G'},
+            {'cr': ['L', 'N'], 'res': 'G'},
+            {'cr': ['E', 'E'], 'res': 'U'},
+            {'cr': ['N', 'E'], 'res': 'U'},
+            {'cr': ['N', 'N'], 'res': 'D'}]
+
+        conf = self.conf.copy()
+        conf['watchers'] = 'test_watcher1'
+        conf['__file__'] = '/etc/swift/swift.conf'
+        ret_config = {'test_watcher1': {'action': 'log', 'grace_age': '0'}}
+        with mock.patch('swift.obj.auditor.parse_prefixed_conf',
+                        return_value=ret_config), \
+                mock.patch('swift.obj.auditor.load_pkg_resource',
+                           side_effect=[DarkDataWatcher]):
+            my_auditor = auditor.ObjectAuditor(conf, logger=self.logger)
+
+        for cur in scenario:
+
+            def fake_direct_get_container(node, part, account, container,
+                                          prefix=None, limit=None):
+                self.assertEqual(part, 1)
+                self.assertEqual(limit, 1)
+
+                reply_type = cur['cr'][int(node['id']) - 1]
+
+                if reply_type == 'E':
+                    raise ClientException("Emulated container server error")
+
+                if reply_type == 'N':
+                    return {}, []
+
+                entry = {'bytes': 30968411,
+                         'hash': '60303f4122966fe5925f045eb52d1129',
+                         'name': '%s' % prefix,
+                         'content_type': 'video/mp4',
+                         'last_modified': '2017-08-15T03:30:57.693210'}
+                return {}, [entry]
+
+            self.logger.clear()
+
+            namespace = 'swift.obj.watchers.dark_data.'
+            with mock.patch(namespace + 'Ring', FakeRing2), \
+                    mock.patch(namespace + 'direct_get_container',
+                               fake_direct_get_container):
+                my_auditor.run_audit(mode='once')
+
+            # We inherit a common setUp with 3 objects, so 3 everywhere.
+            if cur['res'] == 'U':
+                unk_exp, ok_exp, dark_exp = 3, 0, 0
+            elif cur['res'] == 'G':
+                unk_exp, ok_exp, dark_exp = 0, 3, 0
+            else:
+                unk_exp, ok_exp, dark_exp = 0, 0, 3
+
+            log_lines = self.logger.get_lines_for_level('info')
+            for line in log_lines:
+
+                if not line.startswith('[audit-watcher test_watcher1] total'):
+                    continue
+                words = line.split()
+                if not (words[3] == 'unknown' and
+                        words[5] == 'ok' and
+                        words[7] == 'dark'):
+                    self.fail('Syntax error in %r' % (line,))
+
+                try:
+                    unk_cnt = int(words[4])
+                    ok_cnt = int(words[6])
+                    dark_cnt = int(words[8])
+                except ValueError:
+                    self.fail('Bad value in %r' % (line,))
+
+                if unk_cnt != unk_exp or ok_cnt != ok_exp or dark_cnt != dark_exp:
+                    fmt = 'Expected unknown %d ok %d dark %d, got %r, for nodes %r'
+                    msg = fmt % (unk_exp, ok_exp, dark_exp,
+                                 ' '.join(words[3:]), cur['cr'])
+                    self.fail(msg=msg)
+
 
 if __name__ == '__main__':
     unittest.main()
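The permutation test added above can be run on its own. Assuming a
development checkout where pytest is available (the project normally drives
its unit tests through tox), an invocation along these lines should work:

    pytest test/unit/obj/test_auditor.py::TestAuditWatchers::test_dark_data_agreement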