summaryrefslogtreecommitdiff
path: root/chromium/components/federated_learning
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/components/federated_learning')
-rw-r--r--chromium/components/federated_learning/BUILD.gn37
-rw-r--r--chromium/components/federated_learning/DEPS3
-rw-r--r--chromium/components/federated_learning/OWNERS5
-rw-r--r--chromium/components/federated_learning/README.md7
-rw-r--r--chromium/components/federated_learning/floc_blocklist_service.cc88
-rw-r--r--chromium/components/federated_learning/floc_blocklist_service.h59
-rw-r--r--chromium/components/federated_learning/floc_blocklist_service_unittest.cc178
-rw-r--r--chromium/components/federated_learning/floc_constants.cc19
-rw-r--r--chromium/components/federated_learning/floc_constants.h27
-rw-r--r--chromium/components/federated_learning/floc_id.cc64
-rw-r--r--chromium/components/federated_learning/floc_id.h46
-rw-r--r--chromium/components/federated_learning/proto/BUILD.gn9
-rw-r--r--chromium/components/federated_learning/proto/blocklist.proto13
-rw-r--r--chromium/components/federated_learning/sim_hash.cc93
-rw-r--r--chromium/components/federated_learning/sim_hash.h44
-rw-r--r--chromium/components/federated_learning/sim_hash_unittest.cc203
16 files changed, 895 insertions, 0 deletions
diff --git a/chromium/components/federated_learning/BUILD.gn b/chromium/components/federated_learning/BUILD.gn
new file mode 100644
index 00000000000..406236fdb41
--- /dev/null
+++ b/chromium/components/federated_learning/BUILD.gn
@@ -0,0 +1,37 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+static_library("federated_learning") {
+ sources = [
+ "floc_blocklist_service.cc",
+ "floc_blocklist_service.h",
+ "floc_constants.cc",
+ "floc_constants.h",
+ "floc_id.cc",
+ "floc_id.h",
+ "sim_hash.cc",
+ "sim_hash.h",
+ ]
+
+ public_deps = [
+ "//components/federated_learning/proto:federated_learning",
+ "//third_party/protobuf:protobuf_lite",
+ ]
+
+ deps = [ "//base" ]
+}
+
+source_set("unit_tests") {
+ testonly = true
+ sources = [
+ "floc_blocklist_service_unittest.cc",
+ "sim_hash_unittest.cc",
+ ]
+ deps = [
+ ":federated_learning",
+ "//base",
+ "//base/test:test_support",
+ "//testing/gtest",
+ ]
+}
diff --git a/chromium/components/federated_learning/DEPS b/chromium/components/federated_learning/DEPS
new file mode 100644
index 00000000000..97db67c0df7
--- /dev/null
+++ b/chromium/components/federated_learning/DEPS
@@ -0,0 +1,3 @@
+include_rules = [
+ "+third_party/protobuf",
+]
diff --git a/chromium/components/federated_learning/OWNERS b/chromium/components/federated_learning/OWNERS
new file mode 100644
index 00000000000..073e83c96fa
--- /dev/null
+++ b/chromium/components/federated_learning/OWNERS
@@ -0,0 +1,5 @@
+jkarlin@chromium.org
+yaoxia@chromium.org
+
+# TEAM: privacy-sandbox-dev@chromium.org
+# TODO(yaoxia): Add a crbug component for FloC.
diff --git a/chromium/components/federated_learning/README.md b/chromium/components/federated_learning/README.md
new file mode 100644
index 00000000000..63c2c190e73
--- /dev/null
+++ b/chromium/components/federated_learning/README.md
@@ -0,0 +1,7 @@
+# Federated Learning Component
+The FLoC (Federated Learning of Cohorts) project aims to group users based on
+their browsing habits in a privacy preserving way, that would allow
+interest-based advertising without exposing a user’s browsing history.
+
+This component contains data types (e.g. FlocID) that other layers could access
+(e.g. chrome/browser, services/network).
diff --git a/chromium/components/federated_learning/floc_blocklist_service.cc b/chromium/components/federated_learning/floc_blocklist_service.cc
new file mode 100644
index 00000000000..5b0134a242b
--- /dev/null
+++ b/chromium/components/federated_learning/floc_blocklist_service.cc
@@ -0,0 +1,88 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/federated_learning/floc_blocklist_service.h"
+
+#include <utility>
+
+#include "base/files/file.h"
+#include "base/files/file_util.h"
+#include "base/task/post_task.h"
+#include "base/task_runner_util.h"
+#include "components/federated_learning/floc_constants.h"
+#include "components/federated_learning/proto/blocklist.pb.h"
+#include "third_party/protobuf/src/google/protobuf/io/zero_copy_stream_impl_lite.h"
+
+namespace federated_learning {
+
+namespace {
+
+class CopyingFileInputStream : public google::protobuf::io::CopyingInputStream {
+ public:
+ explicit CopyingFileInputStream(base::File file) : file_(std::move(file)) {}
+
+ CopyingFileInputStream(const CopyingFileInputStream&) = delete;
+ CopyingFileInputStream& operator=(const CopyingFileInputStream&) = delete;
+
+ ~CopyingFileInputStream() override = default;
+
+ // google::protobuf::io::CopyingInputStream:
+ int Read(void* buffer, int size) override {
+ return file_.ReadAtCurrentPosNoBestEffort(static_cast<char*>(buffer), size);
+ }
+
+ private:
+ base::File file_;
+};
+
+FlocBlocklistService::LoadedBlocklist LoadBlockListOnBackgroundThread(
+ const base::FilePath& file_path) {
+ base::File blocklist_file(file_path,
+ base::File::FLAG_OPEN | base::File::FLAG_READ);
+ if (!blocklist_file.IsValid())
+ return FlocBlocklistService::LoadedBlocklist();
+
+ CopyingFileInputStream copying_stream(std::move(blocklist_file));
+ google::protobuf::io::CopyingInputStreamAdaptor zero_copy_stream_adaptor(
+ &copying_stream);
+
+ proto::Blocklist blocklist_proto;
+ if (!blocklist_proto.ParseFromZeroCopyStream(&zero_copy_stream_adaptor))
+ return FlocBlocklistService::LoadedBlocklist();
+
+ std::unordered_set<uint64_t> blocklist;
+ for (uint64_t entry : blocklist_proto.entries())
+ blocklist.insert(entry);
+
+ return blocklist;
+}
+
+} // namespace
+
+FlocBlocklistService::FlocBlocklistService()
+ : background_task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
+ {base::MayBlock(), base::TaskPriority::BEST_EFFORT,
+ base::TaskShutdownBehavior::SKIP_ON_SHUTDOWN})) {}
+
+FlocBlocklistService::~FlocBlocklistService() = default;
+
+void FlocBlocklistService::OnBlocklistFileReady(
+ const base::FilePath& file_path) {
+ base::PostTaskAndReplyWithResult(
+ background_task_runner_.get(), FROM_HERE,
+ base::BindOnce(&LoadBlockListOnBackgroundThread, file_path),
+ base::BindOnce(&FlocBlocklistService::OnBlocklistLoadResult,
+ AsWeakPtr()));
+}
+
+void FlocBlocklistService::SetBackgroundTaskRunnerForTesting(
+ scoped_refptr<base::SequencedTaskRunner> background_task_runner) {
+ background_task_runner_ = background_task_runner;
+}
+
+void FlocBlocklistService::OnBlocklistLoadResult(LoadedBlocklist blocklist) {
+ loaded_blocklist_ = std::move(blocklist);
+}
+
+} // namespace federated_learning
diff --git a/chromium/components/federated_learning/floc_blocklist_service.h b/chromium/components/federated_learning/floc_blocklist_service.h
new file mode 100644
index 00000000000..c575775a256
--- /dev/null
+++ b/chromium/components/federated_learning/floc_blocklist_service.h
@@ -0,0 +1,59 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_FEDERATED_LEARNING_FLOC_BLOCKLIST_SERVICE_H_
+#define COMPONENTS_FEDERATED_LEARNING_FLOC_BLOCKLIST_SERVICE_H_
+
+#include <stdint.h>
+
+#include <unordered_set>
+
+#include "base/memory/scoped_refptr.h"
+#include "base/memory/weak_ptr.h"
+#include "base/optional.h"
+
+namespace base {
+class FilePath;
+class SequencedTaskRunner;
+} // namespace base
+
+namespace federated_learning {
+
+// Responsible for loading the blocklist of flocs that are downloaded through
+// the component updater.
+//
+// File reading and parsing is posted to |background_task_runner_|.
+class FlocBlocklistService
+ : public base::SupportsWeakPtr<FlocBlocklistService> {
+ public:
+ using LoadedBlocklist = base::Optional<std::unordered_set<uint64_t>>;
+
+ FlocBlocklistService();
+ virtual ~FlocBlocklistService();
+
+ FlocBlocklistService(const FlocBlocklistService&) = delete;
+ FlocBlocklistService& operator=(const FlocBlocklistService&) = delete;
+
+ // Virtual for testing.
+ virtual void OnBlocklistFileReady(const base::FilePath& file_path);
+
+ void SetBackgroundTaskRunnerForTesting(
+ scoped_refptr<base::SequencedTaskRunner> background_task_runner);
+
+ protected:
+ // Virtual for testing.
+ virtual void OnBlocklistLoadResult(LoadedBlocklist blocklist);
+
+ private:
+ friend class MockFlocBlocklistService;
+
+ // Runner for tasks that do not influence user experience.
+ scoped_refptr<base::SequencedTaskRunner> background_task_runner_;
+
+ LoadedBlocklist loaded_blocklist_;
+};
+
+} // namespace federated_learning
+
+#endif // COMPONENTS_FEDERATED_LEARNING_FLOC_BLOCKLIST_SERVICE_H_
diff --git a/chromium/components/federated_learning/floc_blocklist_service_unittest.cc b/chromium/components/federated_learning/floc_blocklist_service_unittest.cc
new file mode 100644
index 00000000000..77742221c12
--- /dev/null
+++ b/chromium/components/federated_learning/floc_blocklist_service_unittest.cc
@@ -0,0 +1,178 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/federated_learning/floc_blocklist_service.h"
+
+#include <string>
+#include <vector>
+
+#include "base/files/file_util.h"
+#include "base/files/scoped_temp_dir.h"
+#include "base/run_loop.h"
+#include "base/strings/string_number_conversions.h"
+#include "base/test/task_environment.h"
+#include "base/test/test_simple_task_runner.h"
+#include "components/federated_learning/proto/blocklist.pb.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace federated_learning {
+
+// The purpose of this class is to expose the loaded_blocklist_ member and to
+// allow monitoring the OnBlocklistLoadResult method calls.
+class MockFlocBlocklistService : public FlocBlocklistService {
+ public:
+ using FlocBlocklistService::FlocBlocklistService;
+
+ void OnBlocklistLoadResult(LoadedBlocklist blocklist) override {
+ FlocBlocklistService::OnBlocklistLoadResult(std::move(blocklist));
+
+ ++load_result_count_;
+
+ if (load_result_count_ == expected_load_result_count_)
+ run_loop_.Quit();
+ }
+
+ const LoadedBlocklist& loaded_blocklist() { return loaded_blocklist_; }
+
+ void WaitForExpectedLoadResultCount(size_t expected_load_result_count) {
+ DCHECK(!run_loop_.running());
+ if (expected_load_result_count_ >= expected_load_result_count)
+ return;
+
+ expected_load_result_count_ = expected_load_result_count;
+ run_loop_.Run();
+ }
+
+ private:
+ size_t load_result_count_ = 0;
+ size_t expected_load_result_count_ = 0;
+ base::RunLoop run_loop_;
+};
+
+class FlocBlocklistServiceTest : public ::testing::Test {
+ public:
+ FlocBlocklistServiceTest()
+ : background_task_runner_(
+ base::MakeRefCounted<base::TestSimpleTaskRunner>()) {}
+
+ FlocBlocklistServiceTest(const FlocBlocklistServiceTest&) = delete;
+ FlocBlocklistServiceTest& operator=(const FlocBlocklistServiceTest&) = delete;
+
+ protected:
+ void SetUp() override {
+ service_ = std::make_unique<MockFlocBlocklistService>();
+ service_->SetBackgroundTaskRunnerForTesting(background_task_runner_);
+ }
+
+ base::FilePath GetUniqueTemporaryPath() {
+ CHECK(scoped_temp_dir_.IsValid() || scoped_temp_dir_.CreateUniqueTempDir());
+ return scoped_temp_dir_.GetPath().AppendASCII(
+ base::NumberToString(next_unique_file_suffix_++));
+ }
+
+ base::FilePath CreateTestBlocklistProtoFile(
+ const std::vector<uint64_t>& blocklist) {
+ base::FilePath file_path = GetUniqueTemporaryPath();
+
+ federated_learning::proto::Blocklist blocklist_proto;
+ for (uint64_t n : blocklist)
+ blocklist_proto.add_entries(n);
+
+ std::string contents;
+ CHECK(blocklist_proto.SerializeToString(&contents));
+
+ CHECK_EQ(static_cast<int>(contents.size()),
+ base::WriteFile(file_path, contents.data(),
+ static_cast<int>(contents.size())));
+ return file_path;
+ }
+
+ base::FilePath CreateCorruptedTestBlocklistProtoFile() {
+ base::FilePath file_path = GetUniqueTemporaryPath();
+ std::string contents = "1234\n5678\n";
+ CHECK_EQ(static_cast<int>(contents.size()),
+ base::WriteFile(file_path, contents.data(),
+ static_cast<int>(contents.size())));
+ return file_path;
+ }
+
+ MockFlocBlocklistService* service() { return service_.get(); }
+
+ protected:
+ base::test::TaskEnvironment task_environment_;
+ base::ScopedTempDir scoped_temp_dir_;
+ int next_unique_file_suffix_ = 1;
+
+ scoped_refptr<base::TestSimpleTaskRunner> background_task_runner_;
+
+ std::unique_ptr<MockFlocBlocklistService> service_;
+};
+
+TEST_F(FlocBlocklistServiceTest, Startup_NoBlocklistNotNotified) {
+ EXPECT_FALSE(service()->loaded_blocklist().has_value());
+}
+
+TEST_F(FlocBlocklistServiceTest, NewEmptyBlocklist_Loaded) {
+ base::FilePath file_path = CreateTestBlocklistProtoFile({});
+ service()->OnBlocklistFileReady(file_path);
+
+ background_task_runner_->RunPendingTasks();
+ service()->WaitForExpectedLoadResultCount(1u);
+
+ EXPECT_TRUE(service()->loaded_blocklist().has_value());
+ EXPECT_EQ(service()->loaded_blocklist()->size(), 0u);
+}
+
+TEST_F(FlocBlocklistServiceTest, NewNonEmptyBlocklist_Loaded) {
+ base::FilePath file_path = CreateTestBlocklistProtoFile({1, 2, 3, 0});
+ service()->OnBlocklistFileReady(file_path);
+
+ background_task_runner_->RunPendingTasks();
+ service()->WaitForExpectedLoadResultCount(1u);
+
+ EXPECT_TRUE(service()->loaded_blocklist().has_value());
+ EXPECT_EQ(service()->loaded_blocklist()->size(), 4u);
+ EXPECT_TRUE(service()->loaded_blocklist()->count(0));
+ EXPECT_TRUE(service()->loaded_blocklist()->count(1));
+ EXPECT_TRUE(service()->loaded_blocklist()->count(2));
+ EXPECT_TRUE(service()->loaded_blocklist()->count(3));
+}
+
+TEST_F(FlocBlocklistServiceTest, NonExistentBlocklist_NotLoaded) {
+ base::FilePath file_path = GetUniqueTemporaryPath();
+ service()->OnBlocklistFileReady(file_path);
+
+ background_task_runner_->RunPendingTasks();
+ service()->WaitForExpectedLoadResultCount(1u);
+
+ EXPECT_FALSE(service()->loaded_blocklist().has_value());
+}
+
+TEST_F(FlocBlocklistServiceTest, CorruptedBlocklist_NotLoaded) {
+ base::FilePath file_path = CreateCorruptedTestBlocklistProtoFile();
+ service()->OnBlocklistFileReady(file_path);
+
+ background_task_runner_->RunPendingTasks();
+ service()->WaitForExpectedLoadResultCount(1u);
+
+ EXPECT_FALSE(service()->loaded_blocklist().has_value());
+}
+
+TEST_F(FlocBlocklistServiceTest, MultipleUpdate_LatestOneLoaded) {
+ base::FilePath file_path1 = CreateTestBlocklistProtoFile({1, 2, 3, 0});
+ base::FilePath file_path2 = CreateTestBlocklistProtoFile({4});
+ service()->OnBlocklistFileReady(file_path1);
+ service()->OnBlocklistFileReady(file_path2);
+
+ EXPECT_FALSE(service()->loaded_blocklist().has_value());
+
+ background_task_runner_->RunPendingTasks();
+ service()->WaitForExpectedLoadResultCount(2u);
+
+ EXPECT_TRUE(service()->loaded_blocklist().has_value());
+ EXPECT_EQ(service()->loaded_blocklist()->size(), 1u);
+ EXPECT_TRUE(service()->loaded_blocklist()->count(4));
+}
+
+} // namespace federated_learning \ No newline at end of file
diff --git a/chromium/components/federated_learning/floc_constants.cc b/chromium/components/federated_learning/floc_constants.cc
new file mode 100644
index 00000000000..65afb017fc2
--- /dev/null
+++ b/chromium/components/federated_learning/floc_constants.cc
@@ -0,0 +1,19 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/federated_learning/floc_constants.h"
+
+namespace federated_learning {
+
+const char kManifestBlocklistFormatKey[] = "blocklist_format";
+
+const int kCurrentBlocklistFormatVersion = 1;
+
+const base::FilePath::CharType kTopLevelDirectoryName[] =
+ FILE_PATH_LITERAL("Floc");
+
+const base::FilePath::CharType kBlocklistFileName[] =
+ FILE_PATH_LITERAL("Blocklist");
+
+} // namespace federated_learning
diff --git a/chromium/components/federated_learning/floc_constants.h b/chromium/components/federated_learning/floc_constants.h
new file mode 100644
index 00000000000..1e9a129de0f
--- /dev/null
+++ b/chromium/components/federated_learning/floc_constants.h
@@ -0,0 +1,27 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_FEDERATED_LEARNING_FLOC_CONSTANTS_H_
+#define COMPONENTS_FEDERATED_LEARNING_FLOC_CONSTANTS_H_
+
+#include "base/files/file_path.h"
+
+namespace federated_learning {
+
+extern const char kManifestBlocklistFormatKey[];
+
+extern const int kCurrentBlocklistFormatVersion;
+
+// The name of the top-level directory under the user data directory that
+// contains all files and subdirectories related to the floc.
+extern const base::FilePath::CharType kTopLevelDirectoryName[];
+
+// Paths under |kTopLevelDirectoryName|
+
+// The name of the file that stores the blocklist.
+extern const base::FilePath::CharType kBlocklistFileName[];
+
+} // namespace federated_learning
+
+#endif // COMPONENTS_FEDERATED_LEARNING_FLOC_CONSTANTS_H_ \ No newline at end of file
diff --git a/chromium/components/federated_learning/floc_id.cc b/chromium/components/federated_learning/floc_id.cc
new file mode 100644
index 00000000000..38caf081581
--- /dev/null
+++ b/chromium/components/federated_learning/floc_id.cc
@@ -0,0 +1,64 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/federated_learning/floc_id.h"
+
+#include "base/strings/strcat.h"
+#include "base/strings/string_number_conversions.h"
+#include "components/federated_learning/sim_hash.h"
+
+namespace federated_learning {
+
+namespace {
+
+constexpr char kFlocVersion[] = "1.0.0";
+
+// This is only for experimentation and won't be served to websites.
+constexpr size_t kNumberOfBitsInFloc = 50;
+static_assert(kNumberOfBitsInFloc > 0 &&
+ kNumberOfBitsInFloc <= std::numeric_limits<uint64_t>::digits,
+ "Number of bits in the floc id must be greater than 0 and no "
+ "greater than 64.");
+
+} // namespace
+
+// static
+FlocId FlocId::CreateFromHistory(
+ const std::unordered_set<std::string>& domains) {
+ return FlocId(SimHashStrings(domains, kNumberOfBitsInFloc));
+}
+
+FlocId::FlocId() = default;
+
+FlocId::FlocId(uint64_t id) : id_(id) {}
+
+FlocId::FlocId(const FlocId& id) = default;
+
+FlocId::~FlocId() = default;
+
+FlocId& FlocId::operator=(const FlocId& id) = default;
+
+FlocId& FlocId::operator=(FlocId&& id) = default;
+
+bool FlocId::IsValid() const {
+ return id_.has_value();
+}
+
+uint64_t FlocId::ToUint64() const {
+ DCHECK(id_.has_value());
+ return id_.value();
+}
+
+std::string FlocId::ToDebugHeaderValue() const {
+ if (!id_.has_value())
+ return "null";
+ return ToHeaderValue();
+}
+
+std::string FlocId::ToHeaderValue() const {
+ DCHECK(id_.has_value());
+ return base::StrCat({base::NumberToString(id_.value()), ".", kFlocVersion});
+}
+
+} // namespace federated_learning
diff --git a/chromium/components/federated_learning/floc_id.h b/chromium/components/federated_learning/floc_id.h
new file mode 100644
index 00000000000..ad2d9942641
--- /dev/null
+++ b/chromium/components/federated_learning/floc_id.h
@@ -0,0 +1,46 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_FEDERATED_LEARNING_FLOC_ID_H_
+#define COMPONENTS_FEDERATED_LEARNING_FLOC_ID_H_
+
+#include "base/optional.h"
+
+#include <stdint.h>
+
+#include <string>
+#include <unordered_set>
+
+namespace federated_learning {
+
+// ID used to represent a cohort of people with similar browsing habits. For
+// more context, see the explainer at
+// https://github.com/jkarlin/floc/blob/master/README.md
+class FlocId {
+ public:
+ static FlocId CreateFromHistory(
+ const std::unordered_set<std::string>& domains);
+
+ FlocId();
+ explicit FlocId(uint64_t id);
+ FlocId(const FlocId& id);
+
+ ~FlocId();
+ FlocId& operator=(const FlocId& id);
+ FlocId& operator=(FlocId&& id);
+
+ bool IsValid() const;
+ uint64_t ToUint64() const;
+
+ std::string ToDebugHeaderValue() const;
+
+ private:
+ std::string ToHeaderValue() const;
+
+ base::Optional<uint64_t> id_;
+};
+
+} // namespace federated_learning
+
+#endif // COMPONENTS_FEDERATED_LEARNING_FLOC_ID_H_
diff --git a/chromium/components/federated_learning/proto/BUILD.gn b/chromium/components/federated_learning/proto/BUILD.gn
new file mode 100644
index 00000000000..e3154e9be25
--- /dev/null
+++ b/chromium/components/federated_learning/proto/BUILD.gn
@@ -0,0 +1,9 @@
+# Copyright 2020 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import("//third_party/protobuf/proto_library.gni")
+
+proto_library("federated_learning") {
+ sources = [ "blocklist.proto" ]
+}
diff --git a/chromium/components/federated_learning/proto/blocklist.proto b/chromium/components/federated_learning/proto/blocklist.proto
new file mode 100644
index 00000000000..f31f26712c4
--- /dev/null
+++ b/chromium/components/federated_learning/proto/blocklist.proto
@@ -0,0 +1,13 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+
+package federated_learning.proto;
+option java_package = "org.chromium.components.federated_learning.proto";
+
+message Blocklist {
+ repeated int64 entries = 1 [packed = true];
+}
diff --git a/chromium/components/federated_learning/sim_hash.cc b/chromium/components/federated_learning/sim_hash.cc
new file mode 100644
index 00000000000..8012adfbbe2
--- /dev/null
+++ b/chromium/components/federated_learning/sim_hash.cc
@@ -0,0 +1,93 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/federated_learning/sim_hash.h"
+
+#include "base/hash/legacy_hash.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace federated_learning {
+
+namespace {
+
+uint64_t g_seed1 = 1ULL;
+uint64_t g_seed2 = 2ULL;
+constexpr double kTwoPi = 2.0 * 3.141592653589793;
+
+// Hashes i and j to create a uniform RV in [0,1].
+double RandomUniform(uint64_t i, uint64_t j, uint64_t seed) {
+ uint64_t arr[2] = {i, j};
+ uint64_t hashed = base::legacy::CityHash64WithSeed(
+ base::as_bytes(
+ base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))),
+ seed);
+
+ return static_cast<double>(hashed) /
+ static_cast<double>(std::numeric_limits<uint64_t>::max());
+}
+
+// Hashes i and j to create two uniform RVs in [0,1]. Uses the Box-Muller
+// transform to generate a Gaussian.
+double RandomGaussian(uint64_t i, uint64_t j) {
+ double rv1 = RandomUniform(i, j, g_seed1);
+ double rv2 = RandomUniform(j, i, g_seed2);
+
+ // BoxMuller
+ return std::sqrt(-2.0 * std::log(rv1)) * std::cos(kTwoPi * rv2);
+}
+
+} // namespace
+
+LargeBitVector::LargeBitVector() = default;
+LargeBitVector::LargeBitVector(const LargeBitVector&) = default;
+LargeBitVector::~LargeBitVector() = default;
+
+void LargeBitVector::SetBit(uint64_t pos) {
+ positions_of_set_bits_.insert(pos);
+}
+
+const std::set<uint64_t>& LargeBitVector::PositionsOfSetBits() const {
+ return positions_of_set_bits_;
+}
+
+void SetSeedsForTesting(uint64_t seed1, uint64_t seed2) {
+ g_seed1 = seed1;
+ g_seed2 = seed2;
+}
+
+uint64_t SimHashBits(const LargeBitVector& input, size_t output_dimensions) {
+ DCHECK_LT(0u, output_dimensions);
+ DCHECK_LE(output_dimensions, 64u);
+
+ uint64_t result = 0;
+ for (size_t d = 0; d < output_dimensions; ++d) {
+ double acc = 0;
+
+ for (uint64_t pos : input.PositionsOfSetBits())
+ acc += RandomGaussian(d, pos);
+
+ if (acc > 0)
+ result |= (1ULL << d);
+ }
+
+ return result;
+}
+
+uint64_t SimHashStrings(const std::unordered_set<std::string>& input,
+ size_t output_dimensions) {
+ DCHECK_LT(0u, output_dimensions);
+ DCHECK_LE(output_dimensions, 64u);
+
+ LargeBitVector large_bit_vector;
+ for (const std::string& s : input) {
+ large_bit_vector.SetBit(
+ base::legacy::CityHash64(base::as_bytes(base::make_span(s))));
+ }
+
+ return SimHashBits(large_bit_vector, output_dimensions);
+}
+
+} // namespace federated_learning
diff --git a/chromium/components/federated_learning/sim_hash.h b/chromium/components/federated_learning/sim_hash.h
new file mode 100644
index 00000000000..55030d9d2e3
--- /dev/null
+++ b/chromium/components/federated_learning/sim_hash.h
@@ -0,0 +1,44 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_FEDERATED_LEARNING_SIM_HASH_H_
+#define COMPONENTS_FEDERATED_LEARNING_SIM_HASH_H_
+
+#include <set>
+#include <unordered_set>
+
+namespace federated_learning {
+
+// A 2^64 bit vector
+class LargeBitVector {
+ public:
+ LargeBitVector();
+ LargeBitVector(const LargeBitVector&);
+ ~LargeBitVector();
+
+ void SetBit(uint64_t pos);
+ const std::set<uint64_t>& PositionsOfSetBits() const;
+
+ private:
+ // Sparse representation of a 2^64 bit vector. Each number in
+ // |positions_of_set_bits_| represents the position of a bit that is being
+ // set.
+ std::set<uint64_t> positions_of_set_bits_;
+};
+
+// Set the two seeds used for generating the random gaussian.
+void SetSeedsForTesting(uint64_t seed1, uint64_t seed2);
+
+// SimHash a 2^64 bit vector to an |output_dimensions| bit number.
+// |output_dimensions| must be greater than 0 and no greater than 64.
+uint64_t SimHashBits(const LargeBitVector& input, size_t output_dimensions);
+
+// SimHash a set of strings to an |output_dimensions| bit number.
+// |output_dimensions| must be greater than 0 and no greater than 64.
+uint64_t SimHashStrings(const std::unordered_set<std::string>& input,
+ size_t output_dimensions);
+
+} // namespace federated_learning
+
+#endif // COMPONENTS_FEDERATED_LEARNING_SIM_HASH_H_
diff --git a/chromium/components/federated_learning/sim_hash_unittest.cc b/chromium/components/federated_learning/sim_hash_unittest.cc
new file mode 100644
index 00000000000..14bcb92466b
--- /dev/null
+++ b/chromium/components/federated_learning/sim_hash_unittest.cc
@@ -0,0 +1,203 @@
+// Copyright 2020 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/federated_learning/sim_hash.h"
+
+#include "base/check_op.h"
+#include "base/hash/legacy_hash.h"
+#include "base/rand_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace federated_learning {
+
+namespace {
+
+// Roll our own random uint64_t generator as we want deterministic outcome
+// across different test runs.
+uint64_t RandUint64() {
+ static uint64_t seed = 0;
+ ++seed;
+ uint64_t arr[2] = {1234ULL, 4321ULL};
+ return base::legacy::CityHash64WithSeed(
+ base::as_bytes(
+ base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))),
+ seed);
+}
+
+// Inclusive [min, max]
+uint64_t RandUint64InRange(uint64_t min, uint64_t max) {
+ DCHECK_LE(min, max);
+ if (min == 0ULL && max == std::numeric_limits<uint64_t>::max())
+ return RandUint64();
+
+ uint64_t range = max - min + 1;
+ DCHECK_LE(2ULL, range);
+
+ // We must discard random results above this number, as they would
+ // make the random generator non-uniform.
+ uint64_t max_acceptable_value =
+ (std::numeric_limits<uint64_t>::max() / range) * range - 1;
+
+ uint64_t rand_value;
+ do {
+ rand_value = RandUint64();
+ } while (rand_value > max_acceptable_value);
+
+ uint64_t result = (rand_value % range) + min;
+
+ DCHECK_GE(result, min);
+ DCHECK_LE(result, max);
+ return result;
+}
+
+LargeBitVector RandLargeBitVector(
+ size_t number_of_bits_set,
+ uint64_t max_bit_position = std::numeric_limits<uint64_t>::max()) {
+ LargeBitVector result;
+ while (result.PositionsOfSetBits().size() < number_of_bits_set) {
+ result.SetBit(RandUint64InRange(0ULL, max_bit_position));
+ }
+ return result;
+}
+
+float DotProduct(const LargeBitVector& v1, const LargeBitVector& v2) {
+ float result = 0;
+ for (uint64_t pos : v1.PositionsOfSetBits()) {
+ if (v2.PositionsOfSetBits().count(pos)) {
+ result += 1;
+ }
+ }
+ return result;
+}
+
+float Norm(const LargeBitVector& v) {
+ return std::sqrt(DotProduct(v, v));
+}
+
+size_t MultipleSimHashGetNumOutputBitsEqual(size_t repeat_times,
+ size_t dimensions,
+ const LargeBitVector& v1,
+ const LargeBitVector& v2) {
+ uint64_t seed1 = 1;
+ uint64_t seed2 = 100000;
+
+ uint64_t num_output_bits_equal = 0;
+ for (size_t i = 0; i < repeat_times; ++i) {
+ SetSeedsForTesting(seed1++, seed2++);
+
+ uint64_t o1 = SimHashBits(v1, dimensions);
+ uint64_t o2 = SimHashBits(v2, dimensions);
+ for (size_t j = 0; j < dimensions; ++j) {
+ if ((o1 & 1) == (o2 & 1))
+ ++num_output_bits_equal;
+ o1 >>= 1;
+ o2 >>= 1;
+ }
+ }
+ return num_output_bits_equal;
+}
+
+TEST(SimHashTest, HashValue) {
+ LargeBitVector zero;
+ EXPECT_EQ(SimHashBits(zero, 1u), 0ULL);
+ EXPECT_EQ(SimHashBits(zero, 16u), 0ULL);
+
+ LargeBitVector n0;
+ n0.SetBit(0);
+ EXPECT_EQ(SimHashBits(n0, 1u), 0ULL);
+ EXPECT_EQ(SimHashBits(n0, 16u), 8632ULL);
+
+ LargeBitVector n999999;
+ n999999.SetBit(999999);
+ EXPECT_EQ(SimHashBits(n999999, 1u), 1ULL);
+ EXPECT_EQ(SimHashBits(n999999, 16u), 28603ULL);
+
+ LargeBitVector n01;
+ n01.SetBit(0);
+ n01.SetBit(1);
+ EXPECT_EQ(SimHashBits(n01, 1u), 0ULL);
+ EXPECT_EQ(SimHashBits(n01, 16u), 10682ULL);
+}
+
+// Test that given random inputs, the chances of each bit in the SimHash result
+// to be 0 and 1 are equally likely.
+TEST(SimHashTest, ExpectationOnRandomUniformInput) {
+ const size_t dimensions = 16u;
+ const size_t repeat_times = 10000u;
+
+ uint64_t totals[dimensions] = {0.0};
+
+ for (size_t i = 0; i < repeat_times; ++i) {
+ uint64_t hash_result =
+ SimHashBits(RandLargeBitVector(RandUint64InRange(1u, 10u)), dimensions);
+
+ for (size_t j = 0; j < dimensions; ++j) {
+ totals[j] += (hash_result & 1);
+ hash_result >>= 1;
+ }
+ }
+
+ const double expectation = 0.5;
+ const double err_tolerance = 0.03;
+
+ for (size_t j = 0; j < dimensions; ++j) {
+ double avg = 1.0 * totals[j] / repeat_times;
+ EXPECT_LT(avg, expectation + err_tolerance);
+ EXPECT_GT(avg, expectation - err_tolerance);
+ }
+}
+
+// Test that the cosine similarity is preserved.
+TEST(SimHashTest, CosineSimilarity_NonOrthogonalInput) {
+ const float kPi = 3.141592653589793;
+ const size_t dimensions = 50u;
+ const size_t repeat_times = 100u;
+
+ // Generate v1 and v2 that are likely non-orthogonal.
+ LargeBitVector v1 = RandLargeBitVector(4000u, 10000);
+ LargeBitVector v2 = RandLargeBitVector(5000u, 10000);
+
+ size_t num_output_bits_equal =
+ MultipleSimHashGetNumOutputBitsEqual(repeat_times, dimensions, v1, v2);
+ float avg = 1.0 * num_output_bits_equal / dimensions / repeat_times;
+ float expectation =
+ 1.0 - std::acos(DotProduct(v1, v2) / (Norm(v1) * Norm(v2))) / kPi;
+
+ // Verify that the expectation is different from 0.5 and 1 to make sure the
+ // test is non-trivial.
+ EXPECT_LT(expectation, 0.7);
+ EXPECT_GT(expectation, 0.6);
+
+ // Verify that SimHash(v1) and SimHash(v2) have approximately |expectation|
+ // fraction of their bits in common.
+ const double err_tolerance = 0.03;
+ EXPECT_LT(avg, expectation + err_tolerance);
+ EXPECT_GT(avg, expectation - err_tolerance);
+}
+
+// Test that when input v1 and v2 are orthogonal, SimHash(v1) and SimHash(v2)
+// have approximately half their bits in common.
+TEST(SimHashTest, CosineSimilarity_OrthogonalInput) {
+ const size_t dimensions = 50u;
+ const size_t repeat_times = 100u;
+
+ // Generate v1 and v2 that are likely orthogonal
+ LargeBitVector v1 = RandLargeBitVector(1u);
+ LargeBitVector v2 = RandLargeBitVector(1u);
+
+ size_t num_output_bits_equal =
+ MultipleSimHashGetNumOutputBitsEqual(repeat_times, dimensions, v1, v2);
+ float avg = 1.0 * num_output_bits_equal / dimensions / repeat_times;
+ float expectation = 0.5;
+
+ // Verify that SimHash(v1) and SimHash(v2) have approximately half their bits
+ // in common.
+ const double err_tolerance = 0.03;
+ EXPECT_LT(avg, expectation + err_tolerance);
+ EXPECT_GT(avg, expectation - err_tolerance);
+}
+
+} // namespace
+
+} // namespace federated_learning