diff options
Diffstat (limited to 'chromium/components/federated_learning')
16 files changed, 895 insertions, 0 deletions
diff --git a/chromium/components/federated_learning/BUILD.gn b/chromium/components/federated_learning/BUILD.gn new file mode 100644 index 00000000000..406236fdb41 --- /dev/null +++ b/chromium/components/federated_learning/BUILD.gn @@ -0,0 +1,37 @@ +# Copyright 2020 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +static_library("federated_learning") { + sources = [ + "floc_blocklist_service.cc", + "floc_blocklist_service.h", + "floc_constants.cc", + "floc_constants.h", + "floc_id.cc", + "floc_id.h", + "sim_hash.cc", + "sim_hash.h", + ] + + public_deps = [ + "//components/federated_learning/proto:federated_learning", + "//third_party/protobuf:protobuf_lite", + ] + + deps = [ "//base" ] +} + +source_set("unit_tests") { + testonly = true + sources = [ + "floc_blocklist_service_unittest.cc", + "sim_hash_unittest.cc", + ] + deps = [ + ":federated_learning", + "//base", + "//base/test:test_support", + "//testing/gtest", + ] +} diff --git a/chromium/components/federated_learning/DEPS b/chromium/components/federated_learning/DEPS new file mode 100644 index 00000000000..97db67c0df7 --- /dev/null +++ b/chromium/components/federated_learning/DEPS @@ -0,0 +1,3 @@ +include_rules = [ + "+third_party/protobuf", +] diff --git a/chromium/components/federated_learning/OWNERS b/chromium/components/federated_learning/OWNERS new file mode 100644 index 00000000000..073e83c96fa --- /dev/null +++ b/chromium/components/federated_learning/OWNERS @@ -0,0 +1,5 @@ +jkarlin@chromium.org +yaoxia@chromium.org + +# TEAM: privacy-sandbox-dev@chromium.org +# TODO(yaoxia): Add a crbug component for FloC. diff --git a/chromium/components/federated_learning/README.md b/chromium/components/federated_learning/README.md new file mode 100644 index 00000000000..63c2c190e73 --- /dev/null +++ b/chromium/components/federated_learning/README.md @@ -0,0 +1,7 @@ +# Federated Learning Component +The FLoC (Federated Learning of Cohorts) project aims to group users based on +their browsing habits in a privacy preserving way, that would allow +interest-based advertising without exposing a user’s browsing history. + +This component contains data types (e.g. FlocID) that other layers could access +(e.g. chrome/browser, services/network). diff --git a/chromium/components/federated_learning/floc_blocklist_service.cc b/chromium/components/federated_learning/floc_blocklist_service.cc new file mode 100644 index 00000000000..5b0134a242b --- /dev/null +++ b/chromium/components/federated_learning/floc_blocklist_service.cc @@ -0,0 +1,88 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/floc_blocklist_service.h" + +#include <utility> + +#include "base/files/file.h" +#include "base/files/file_util.h" +#include "base/task/post_task.h" +#include "base/task_runner_util.h" +#include "components/federated_learning/floc_constants.h" +#include "components/federated_learning/proto/blocklist.pb.h" +#include "third_party/protobuf/src/google/protobuf/io/zero_copy_stream_impl_lite.h" + +namespace federated_learning { + +namespace { + +class CopyingFileInputStream : public google::protobuf::io::CopyingInputStream { + public: + explicit CopyingFileInputStream(base::File file) : file_(std::move(file)) {} + + CopyingFileInputStream(const CopyingFileInputStream&) = delete; + CopyingFileInputStream& operator=(const CopyingFileInputStream&) = delete; + + ~CopyingFileInputStream() override = default; + + // google::protobuf::io::CopyingInputStream: + int Read(void* buffer, int size) override { + return file_.ReadAtCurrentPosNoBestEffort(static_cast<char*>(buffer), size); + } + + private: + base::File file_; +}; + +FlocBlocklistService::LoadedBlocklist LoadBlockListOnBackgroundThread( + const base::FilePath& file_path) { + base::File blocklist_file(file_path, + base::File::FLAG_OPEN | base::File::FLAG_READ); + if (!blocklist_file.IsValid()) + return FlocBlocklistService::LoadedBlocklist(); + + CopyingFileInputStream copying_stream(std::move(blocklist_file)); + google::protobuf::io::CopyingInputStreamAdaptor zero_copy_stream_adaptor( + ©ing_stream); + + proto::Blocklist blocklist_proto; + if (!blocklist_proto.ParseFromZeroCopyStream(&zero_copy_stream_adaptor)) + return FlocBlocklistService::LoadedBlocklist(); + + std::unordered_set<uint64_t> blocklist; + for (uint64_t entry : blocklist_proto.entries()) + blocklist.insert(entry); + + return blocklist; +} + +} // namespace + +FlocBlocklistService::FlocBlocklistService() + : background_task_runner_(base::ThreadPool::CreateSequencedTaskRunner( + {base::MayBlock(), base::TaskPriority::BEST_EFFORT, + base::TaskShutdownBehavior::SKIP_ON_SHUTDOWN})) {} + +FlocBlocklistService::~FlocBlocklistService() = default; + +void FlocBlocklistService::OnBlocklistFileReady( + const base::FilePath& file_path) { + base::PostTaskAndReplyWithResult( + background_task_runner_.get(), FROM_HERE, + base::BindOnce(&LoadBlockListOnBackgroundThread, file_path), + base::BindOnce(&FlocBlocklistService::OnBlocklistLoadResult, + AsWeakPtr())); +} + +void FlocBlocklistService::SetBackgroundTaskRunnerForTesting( + scoped_refptr<base::SequencedTaskRunner> background_task_runner) { + background_task_runner_ = background_task_runner; +} + +void FlocBlocklistService::OnBlocklistLoadResult(LoadedBlocklist blocklist) { + loaded_blocklist_ = std::move(blocklist); +} + +} // namespace federated_learning diff --git a/chromium/components/federated_learning/floc_blocklist_service.h b/chromium/components/federated_learning/floc_blocklist_service.h new file mode 100644 index 00000000000..c575775a256 --- /dev/null +++ b/chromium/components/federated_learning/floc_blocklist_service.h @@ -0,0 +1,59 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_FEDERATED_LEARNING_FLOC_BLOCKLIST_SERVICE_H_ +#define COMPONENTS_FEDERATED_LEARNING_FLOC_BLOCKLIST_SERVICE_H_ + +#include <stdint.h> + +#include <unordered_set> + +#include "base/memory/scoped_refptr.h" +#include "base/memory/weak_ptr.h" +#include "base/optional.h" + +namespace base { +class FilePath; +class SequencedTaskRunner; +} // namespace base + +namespace federated_learning { + +// Responsible for loading the blocklist of flocs that are downloaded through +// the component updater. +// +// File reading and parsing is posted to |background_task_runner_|. +class FlocBlocklistService + : public base::SupportsWeakPtr<FlocBlocklistService> { + public: + using LoadedBlocklist = base::Optional<std::unordered_set<uint64_t>>; + + FlocBlocklistService(); + virtual ~FlocBlocklistService(); + + FlocBlocklistService(const FlocBlocklistService&) = delete; + FlocBlocklistService& operator=(const FlocBlocklistService&) = delete; + + // Virtual for testing. + virtual void OnBlocklistFileReady(const base::FilePath& file_path); + + void SetBackgroundTaskRunnerForTesting( + scoped_refptr<base::SequencedTaskRunner> background_task_runner); + + protected: + // Virtual for testing. + virtual void OnBlocklistLoadResult(LoadedBlocklist blocklist); + + private: + friend class MockFlocBlocklistService; + + // Runner for tasks that do not influence user experience. + scoped_refptr<base::SequencedTaskRunner> background_task_runner_; + + LoadedBlocklist loaded_blocklist_; +}; + +} // namespace federated_learning + +#endif // COMPONENTS_FEDERATED_LEARNING_FLOC_BLOCKLIST_SERVICE_H_ diff --git a/chromium/components/federated_learning/floc_blocklist_service_unittest.cc b/chromium/components/federated_learning/floc_blocklist_service_unittest.cc new file mode 100644 index 00000000000..77742221c12 --- /dev/null +++ b/chromium/components/federated_learning/floc_blocklist_service_unittest.cc @@ -0,0 +1,178 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/floc_blocklist_service.h" + +#include <string> +#include <vector> + +#include "base/files/file_util.h" +#include "base/files/scoped_temp_dir.h" +#include "base/run_loop.h" +#include "base/strings/string_number_conversions.h" +#include "base/test/task_environment.h" +#include "base/test/test_simple_task_runner.h" +#include "components/federated_learning/proto/blocklist.pb.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace federated_learning { + +// The purpose of this class is to expose the loaded_blocklist_ member and to +// allow monitoring the OnBlocklistLoadResult method calls. +class MockFlocBlocklistService : public FlocBlocklistService { + public: + using FlocBlocklistService::FlocBlocklistService; + + void OnBlocklistLoadResult(LoadedBlocklist blocklist) override { + FlocBlocklistService::OnBlocklistLoadResult(std::move(blocklist)); + + ++load_result_count_; + + if (load_result_count_ == expected_load_result_count_) + run_loop_.Quit(); + } + + const LoadedBlocklist& loaded_blocklist() { return loaded_blocklist_; } + + void WaitForExpectedLoadResultCount(size_t expected_load_result_count) { + DCHECK(!run_loop_.running()); + if (expected_load_result_count_ >= expected_load_result_count) + return; + + expected_load_result_count_ = expected_load_result_count; + run_loop_.Run(); + } + + private: + size_t load_result_count_ = 0; + size_t expected_load_result_count_ = 0; + base::RunLoop run_loop_; +}; + +class FlocBlocklistServiceTest : public ::testing::Test { + public: + FlocBlocklistServiceTest() + : background_task_runner_( + base::MakeRefCounted<base::TestSimpleTaskRunner>()) {} + + FlocBlocklistServiceTest(const FlocBlocklistServiceTest&) = delete; + FlocBlocklistServiceTest& operator=(const FlocBlocklistServiceTest&) = delete; + + protected: + void SetUp() override { + service_ = std::make_unique<MockFlocBlocklistService>(); + service_->SetBackgroundTaskRunnerForTesting(background_task_runner_); + } + + base::FilePath GetUniqueTemporaryPath() { + CHECK(scoped_temp_dir_.IsValid() || scoped_temp_dir_.CreateUniqueTempDir()); + return scoped_temp_dir_.GetPath().AppendASCII( + base::NumberToString(next_unique_file_suffix_++)); + } + + base::FilePath CreateTestBlocklistProtoFile( + const std::vector<uint64_t>& blocklist) { + base::FilePath file_path = GetUniqueTemporaryPath(); + + federated_learning::proto::Blocklist blocklist_proto; + for (uint64_t n : blocklist) + blocklist_proto.add_entries(n); + + std::string contents; + CHECK(blocklist_proto.SerializeToString(&contents)); + + CHECK_EQ(static_cast<int>(contents.size()), + base::WriteFile(file_path, contents.data(), + static_cast<int>(contents.size()))); + return file_path; + } + + base::FilePath CreateCorruptedTestBlocklistProtoFile() { + base::FilePath file_path = GetUniqueTemporaryPath(); + std::string contents = "1234\n5678\n"; + CHECK_EQ(static_cast<int>(contents.size()), + base::WriteFile(file_path, contents.data(), + static_cast<int>(contents.size()))); + return file_path; + } + + MockFlocBlocklistService* service() { return service_.get(); } + + protected: + base::test::TaskEnvironment task_environment_; + base::ScopedTempDir scoped_temp_dir_; + int next_unique_file_suffix_ = 1; + + scoped_refptr<base::TestSimpleTaskRunner> background_task_runner_; + + std::unique_ptr<MockFlocBlocklistService> service_; +}; + +TEST_F(FlocBlocklistServiceTest, Startup_NoBlocklistNotNotified) { + EXPECT_FALSE(service()->loaded_blocklist().has_value()); +} + +TEST_F(FlocBlocklistServiceTest, NewEmptyBlocklist_Loaded) { + base::FilePath file_path = CreateTestBlocklistProtoFile({}); + service()->OnBlocklistFileReady(file_path); + + background_task_runner_->RunPendingTasks(); + service()->WaitForExpectedLoadResultCount(1u); + + EXPECT_TRUE(service()->loaded_blocklist().has_value()); + EXPECT_EQ(service()->loaded_blocklist()->size(), 0u); +} + +TEST_F(FlocBlocklistServiceTest, NewNonEmptyBlocklist_Loaded) { + base::FilePath file_path = CreateTestBlocklistProtoFile({1, 2, 3, 0}); + service()->OnBlocklistFileReady(file_path); + + background_task_runner_->RunPendingTasks(); + service()->WaitForExpectedLoadResultCount(1u); + + EXPECT_TRUE(service()->loaded_blocklist().has_value()); + EXPECT_EQ(service()->loaded_blocklist()->size(), 4u); + EXPECT_TRUE(service()->loaded_blocklist()->count(0)); + EXPECT_TRUE(service()->loaded_blocklist()->count(1)); + EXPECT_TRUE(service()->loaded_blocklist()->count(2)); + EXPECT_TRUE(service()->loaded_blocklist()->count(3)); +} + +TEST_F(FlocBlocklistServiceTest, NonExistentBlocklist_NotLoaded) { + base::FilePath file_path = GetUniqueTemporaryPath(); + service()->OnBlocklistFileReady(file_path); + + background_task_runner_->RunPendingTasks(); + service()->WaitForExpectedLoadResultCount(1u); + + EXPECT_FALSE(service()->loaded_blocklist().has_value()); +} + +TEST_F(FlocBlocklistServiceTest, CorruptedBlocklist_NotLoaded) { + base::FilePath file_path = CreateCorruptedTestBlocklistProtoFile(); + service()->OnBlocklistFileReady(file_path); + + background_task_runner_->RunPendingTasks(); + service()->WaitForExpectedLoadResultCount(1u); + + EXPECT_FALSE(service()->loaded_blocklist().has_value()); +} + +TEST_F(FlocBlocklistServiceTest, MultipleUpdate_LatestOneLoaded) { + base::FilePath file_path1 = CreateTestBlocklistProtoFile({1, 2, 3, 0}); + base::FilePath file_path2 = CreateTestBlocklistProtoFile({4}); + service()->OnBlocklistFileReady(file_path1); + service()->OnBlocklistFileReady(file_path2); + + EXPECT_FALSE(service()->loaded_blocklist().has_value()); + + background_task_runner_->RunPendingTasks(); + service()->WaitForExpectedLoadResultCount(2u); + + EXPECT_TRUE(service()->loaded_blocklist().has_value()); + EXPECT_EQ(service()->loaded_blocklist()->size(), 1u); + EXPECT_TRUE(service()->loaded_blocklist()->count(4)); +} + +} // namespace federated_learning
\ No newline at end of file diff --git a/chromium/components/federated_learning/floc_constants.cc b/chromium/components/federated_learning/floc_constants.cc new file mode 100644 index 00000000000..65afb017fc2 --- /dev/null +++ b/chromium/components/federated_learning/floc_constants.cc @@ -0,0 +1,19 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/floc_constants.h" + +namespace federated_learning { + +const char kManifestBlocklistFormatKey[] = "blocklist_format"; + +const int kCurrentBlocklistFormatVersion = 1; + +const base::FilePath::CharType kTopLevelDirectoryName[] = + FILE_PATH_LITERAL("Floc"); + +const base::FilePath::CharType kBlocklistFileName[] = + FILE_PATH_LITERAL("Blocklist"); + +} // namespace federated_learning diff --git a/chromium/components/federated_learning/floc_constants.h b/chromium/components/federated_learning/floc_constants.h new file mode 100644 index 00000000000..1e9a129de0f --- /dev/null +++ b/chromium/components/federated_learning/floc_constants.h @@ -0,0 +1,27 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_FEDERATED_LEARNING_FLOC_CONSTANTS_H_ +#define COMPONENTS_FEDERATED_LEARNING_FLOC_CONSTANTS_H_ + +#include "base/files/file_path.h" + +namespace federated_learning { + +extern const char kManifestBlocklistFormatKey[]; + +extern const int kCurrentBlocklistFormatVersion; + +// The name of the top-level directory under the user data directory that +// contains all files and subdirectories related to the floc. +extern const base::FilePath::CharType kTopLevelDirectoryName[]; + +// Paths under |kTopLevelDirectoryName| + +// The name of the file that stores the blocklist. +extern const base::FilePath::CharType kBlocklistFileName[]; + +} // namespace federated_learning + +#endif // COMPONENTS_FEDERATED_LEARNING_FLOC_CONSTANTS_H_
\ No newline at end of file diff --git a/chromium/components/federated_learning/floc_id.cc b/chromium/components/federated_learning/floc_id.cc new file mode 100644 index 00000000000..38caf081581 --- /dev/null +++ b/chromium/components/federated_learning/floc_id.cc @@ -0,0 +1,64 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/floc_id.h" + +#include "base/strings/strcat.h" +#include "base/strings/string_number_conversions.h" +#include "components/federated_learning/sim_hash.h" + +namespace federated_learning { + +namespace { + +constexpr char kFlocVersion[] = "1.0.0"; + +// This is only for experimentation and won't be served to websites. +constexpr size_t kNumberOfBitsInFloc = 50; +static_assert(kNumberOfBitsInFloc > 0 && + kNumberOfBitsInFloc <= std::numeric_limits<uint64_t>::digits, + "Number of bits in the floc id must be greater than 0 and no " + "greater than 64."); + +} // namespace + +// static +FlocId FlocId::CreateFromHistory( + const std::unordered_set<std::string>& domains) { + return FlocId(SimHashStrings(domains, kNumberOfBitsInFloc)); +} + +FlocId::FlocId() = default; + +FlocId::FlocId(uint64_t id) : id_(id) {} + +FlocId::FlocId(const FlocId& id) = default; + +FlocId::~FlocId() = default; + +FlocId& FlocId::operator=(const FlocId& id) = default; + +FlocId& FlocId::operator=(FlocId&& id) = default; + +bool FlocId::IsValid() const { + return id_.has_value(); +} + +uint64_t FlocId::ToUint64() const { + DCHECK(id_.has_value()); + return id_.value(); +} + +std::string FlocId::ToDebugHeaderValue() const { + if (!id_.has_value()) + return "null"; + return ToHeaderValue(); +} + +std::string FlocId::ToHeaderValue() const { + DCHECK(id_.has_value()); + return base::StrCat({base::NumberToString(id_.value()), ".", kFlocVersion}); +} + +} // namespace federated_learning diff --git a/chromium/components/federated_learning/floc_id.h b/chromium/components/federated_learning/floc_id.h new file mode 100644 index 00000000000..ad2d9942641 --- /dev/null +++ b/chromium/components/federated_learning/floc_id.h @@ -0,0 +1,46 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_FEDERATED_LEARNING_FLOC_ID_H_ +#define COMPONENTS_FEDERATED_LEARNING_FLOC_ID_H_ + +#include "base/optional.h" + +#include <stdint.h> + +#include <string> +#include <unordered_set> + +namespace federated_learning { + +// ID used to represent a cohort of people with similar browsing habits. For +// more context, see the explainer at +// https://github.com/jkarlin/floc/blob/master/README.md +class FlocId { + public: + static FlocId CreateFromHistory( + const std::unordered_set<std::string>& domains); + + FlocId(); + explicit FlocId(uint64_t id); + FlocId(const FlocId& id); + + ~FlocId(); + FlocId& operator=(const FlocId& id); + FlocId& operator=(FlocId&& id); + + bool IsValid() const; + uint64_t ToUint64() const; + + std::string ToDebugHeaderValue() const; + + private: + std::string ToHeaderValue() const; + + base::Optional<uint64_t> id_; +}; + +} // namespace federated_learning + +#endif // COMPONENTS_FEDERATED_LEARNING_FLOC_ID_H_ diff --git a/chromium/components/federated_learning/proto/BUILD.gn b/chromium/components/federated_learning/proto/BUILD.gn new file mode 100644 index 00000000000..e3154e9be25 --- /dev/null +++ b/chromium/components/federated_learning/proto/BUILD.gn @@ -0,0 +1,9 @@ +# Copyright 2020 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +import("//third_party/protobuf/proto_library.gni") + +proto_library("federated_learning") { + sources = [ "blocklist.proto" ] +} diff --git a/chromium/components/federated_learning/proto/blocklist.proto b/chromium/components/federated_learning/proto/blocklist.proto new file mode 100644 index 00000000000..f31f26712c4 --- /dev/null +++ b/chromium/components/federated_learning/proto/blocklist.proto @@ -0,0 +1,13 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; + +package federated_learning.proto; +option java_package = "org.chromium.components.federated_learning.proto"; + +message Blocklist { + repeated int64 entries = 1 [packed = true]; +} diff --git a/chromium/components/federated_learning/sim_hash.cc b/chromium/components/federated_learning/sim_hash.cc new file mode 100644 index 00000000000..8012adfbbe2 --- /dev/null +++ b/chromium/components/federated_learning/sim_hash.cc @@ -0,0 +1,93 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/sim_hash.h" + +#include "base/hash/legacy_hash.h" + +#include <algorithm> +#include <cmath> + +namespace federated_learning { + +namespace { + +uint64_t g_seed1 = 1ULL; +uint64_t g_seed2 = 2ULL; +constexpr double kTwoPi = 2.0 * 3.141592653589793; + +// Hashes i and j to create a uniform RV in [0,1]. +double RandomUniform(uint64_t i, uint64_t j, uint64_t seed) { + uint64_t arr[2] = {i, j}; + uint64_t hashed = base::legacy::CityHash64WithSeed( + base::as_bytes( + base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))), + seed); + + return static_cast<double>(hashed) / + static_cast<double>(std::numeric_limits<uint64_t>::max()); +} + +// Hashes i and j to create two uniform RVs in [0,1]. Uses the Box-Muller +// transform to generate a Gaussian. +double RandomGaussian(uint64_t i, uint64_t j) { + double rv1 = RandomUniform(i, j, g_seed1); + double rv2 = RandomUniform(j, i, g_seed2); + + // BoxMuller + return std::sqrt(-2.0 * std::log(rv1)) * std::cos(kTwoPi * rv2); +} + +} // namespace + +LargeBitVector::LargeBitVector() = default; +LargeBitVector::LargeBitVector(const LargeBitVector&) = default; +LargeBitVector::~LargeBitVector() = default; + +void LargeBitVector::SetBit(uint64_t pos) { + positions_of_set_bits_.insert(pos); +} + +const std::set<uint64_t>& LargeBitVector::PositionsOfSetBits() const { + return positions_of_set_bits_; +} + +void SetSeedsForTesting(uint64_t seed1, uint64_t seed2) { + g_seed1 = seed1; + g_seed2 = seed2; +} + +uint64_t SimHashBits(const LargeBitVector& input, size_t output_dimensions) { + DCHECK_LT(0u, output_dimensions); + DCHECK_LE(output_dimensions, 64u); + + uint64_t result = 0; + for (size_t d = 0; d < output_dimensions; ++d) { + double acc = 0; + + for (uint64_t pos : input.PositionsOfSetBits()) + acc += RandomGaussian(d, pos); + + if (acc > 0) + result |= (1ULL << d); + } + + return result; +} + +uint64_t SimHashStrings(const std::unordered_set<std::string>& input, + size_t output_dimensions) { + DCHECK_LT(0u, output_dimensions); + DCHECK_LE(output_dimensions, 64u); + + LargeBitVector large_bit_vector; + for (const std::string& s : input) { + large_bit_vector.SetBit( + base::legacy::CityHash64(base::as_bytes(base::make_span(s)))); + } + + return SimHashBits(large_bit_vector, output_dimensions); +} + +} // namespace federated_learning diff --git a/chromium/components/federated_learning/sim_hash.h b/chromium/components/federated_learning/sim_hash.h new file mode 100644 index 00000000000..55030d9d2e3 --- /dev/null +++ b/chromium/components/federated_learning/sim_hash.h @@ -0,0 +1,44 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_FEDERATED_LEARNING_SIM_HASH_H_ +#define COMPONENTS_FEDERATED_LEARNING_SIM_HASH_H_ + +#include <set> +#include <unordered_set> + +namespace federated_learning { + +// A 2^64 bit vector +class LargeBitVector { + public: + LargeBitVector(); + LargeBitVector(const LargeBitVector&); + ~LargeBitVector(); + + void SetBit(uint64_t pos); + const std::set<uint64_t>& PositionsOfSetBits() const; + + private: + // Sparse representation of a 2^64 bit vector. Each number in + // |positions_of_set_bits_| represents the position of a bit that is being + // set. + std::set<uint64_t> positions_of_set_bits_; +}; + +// Set the two seeds used for generating the random gaussian. +void SetSeedsForTesting(uint64_t seed1, uint64_t seed2); + +// SimHash a 2^64 bit vector to an |output_dimensions| bit number. +// |output_dimensions| must be greater than 0 and no greater than 64. +uint64_t SimHashBits(const LargeBitVector& input, size_t output_dimensions); + +// SimHash a set of strings to an |output_dimensions| bit number. +// |output_dimensions| must be greater than 0 and no greater than 64. +uint64_t SimHashStrings(const std::unordered_set<std::string>& input, + size_t output_dimensions); + +} // namespace federated_learning + +#endif // COMPONENTS_FEDERATED_LEARNING_SIM_HASH_H_ diff --git a/chromium/components/federated_learning/sim_hash_unittest.cc b/chromium/components/federated_learning/sim_hash_unittest.cc new file mode 100644 index 00000000000..14bcb92466b --- /dev/null +++ b/chromium/components/federated_learning/sim_hash_unittest.cc @@ -0,0 +1,203 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/sim_hash.h" + +#include "base/check_op.h" +#include "base/hash/legacy_hash.h" +#include "base/rand_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace federated_learning { + +namespace { + +// Roll our own random uint64_t generator as we want deterministic outcome +// across different test runs. +uint64_t RandUint64() { + static uint64_t seed = 0; + ++seed; + uint64_t arr[2] = {1234ULL, 4321ULL}; + return base::legacy::CityHash64WithSeed( + base::as_bytes( + base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))), + seed); +} + +// Inclusive [min, max] +uint64_t RandUint64InRange(uint64_t min, uint64_t max) { + DCHECK_LE(min, max); + if (min == 0ULL && max == std::numeric_limits<uint64_t>::max()) + return RandUint64(); + + uint64_t range = max - min + 1; + DCHECK_LE(2ULL, range); + + // We must discard random results above this number, as they would + // make the random generator non-uniform. + uint64_t max_acceptable_value = + (std::numeric_limits<uint64_t>::max() / range) * range - 1; + + uint64_t rand_value; + do { + rand_value = RandUint64(); + } while (rand_value > max_acceptable_value); + + uint64_t result = (rand_value % range) + min; + + DCHECK_GE(result, min); + DCHECK_LE(result, max); + return result; +} + +LargeBitVector RandLargeBitVector( + size_t number_of_bits_set, + uint64_t max_bit_position = std::numeric_limits<uint64_t>::max()) { + LargeBitVector result; + while (result.PositionsOfSetBits().size() < number_of_bits_set) { + result.SetBit(RandUint64InRange(0ULL, max_bit_position)); + } + return result; +} + +float DotProduct(const LargeBitVector& v1, const LargeBitVector& v2) { + float result = 0; + for (uint64_t pos : v1.PositionsOfSetBits()) { + if (v2.PositionsOfSetBits().count(pos)) { + result += 1; + } + } + return result; +} + +float Norm(const LargeBitVector& v) { + return std::sqrt(DotProduct(v, v)); +} + +size_t MultipleSimHashGetNumOutputBitsEqual(size_t repeat_times, + size_t dimensions, + const LargeBitVector& v1, + const LargeBitVector& v2) { + uint64_t seed1 = 1; + uint64_t seed2 = 100000; + + uint64_t num_output_bits_equal = 0; + for (size_t i = 0; i < repeat_times; ++i) { + SetSeedsForTesting(seed1++, seed2++); + + uint64_t o1 = SimHashBits(v1, dimensions); + uint64_t o2 = SimHashBits(v2, dimensions); + for (size_t j = 0; j < dimensions; ++j) { + if ((o1 & 1) == (o2 & 1)) + ++num_output_bits_equal; + o1 >>= 1; + o2 >>= 1; + } + } + return num_output_bits_equal; +} + +TEST(SimHashTest, HashValue) { + LargeBitVector zero; + EXPECT_EQ(SimHashBits(zero, 1u), 0ULL); + EXPECT_EQ(SimHashBits(zero, 16u), 0ULL); + + LargeBitVector n0; + n0.SetBit(0); + EXPECT_EQ(SimHashBits(n0, 1u), 0ULL); + EXPECT_EQ(SimHashBits(n0, 16u), 8632ULL); + + LargeBitVector n999999; + n999999.SetBit(999999); + EXPECT_EQ(SimHashBits(n999999, 1u), 1ULL); + EXPECT_EQ(SimHashBits(n999999, 16u), 28603ULL); + + LargeBitVector n01; + n01.SetBit(0); + n01.SetBit(1); + EXPECT_EQ(SimHashBits(n01, 1u), 0ULL); + EXPECT_EQ(SimHashBits(n01, 16u), 10682ULL); +} + +// Test that given random inputs, the chances of each bit in the SimHash result +// to be 0 and 1 are equally likely. +TEST(SimHashTest, ExpectationOnRandomUniformInput) { + const size_t dimensions = 16u; + const size_t repeat_times = 10000u; + + uint64_t totals[dimensions] = {0.0}; + + for (size_t i = 0; i < repeat_times; ++i) { + uint64_t hash_result = + SimHashBits(RandLargeBitVector(RandUint64InRange(1u, 10u)), dimensions); + + for (size_t j = 0; j < dimensions; ++j) { + totals[j] += (hash_result & 1); + hash_result >>= 1; + } + } + + const double expectation = 0.5; + const double err_tolerance = 0.03; + + for (size_t j = 0; j < dimensions; ++j) { + double avg = 1.0 * totals[j] / repeat_times; + EXPECT_LT(avg, expectation + err_tolerance); + EXPECT_GT(avg, expectation - err_tolerance); + } +} + +// Test that the cosine similarity is preserved. +TEST(SimHashTest, CosineSimilarity_NonOrthogonalInput) { + const float kPi = 3.141592653589793; + const size_t dimensions = 50u; + const size_t repeat_times = 100u; + + // Generate v1 and v2 that are likely non-orthogonal. + LargeBitVector v1 = RandLargeBitVector(4000u, 10000); + LargeBitVector v2 = RandLargeBitVector(5000u, 10000); + + size_t num_output_bits_equal = + MultipleSimHashGetNumOutputBitsEqual(repeat_times, dimensions, v1, v2); + float avg = 1.0 * num_output_bits_equal / dimensions / repeat_times; + float expectation = + 1.0 - std::acos(DotProduct(v1, v2) / (Norm(v1) * Norm(v2))) / kPi; + + // Verify that the expectation is different from 0.5 and 1 to make sure the + // test is non-trivial. + EXPECT_LT(expectation, 0.7); + EXPECT_GT(expectation, 0.6); + + // Verify that SimHash(v1) and SimHash(v2) have approximately |expectation| + // fraction of their bits in common. + const double err_tolerance = 0.03; + EXPECT_LT(avg, expectation + err_tolerance); + EXPECT_GT(avg, expectation - err_tolerance); +} + +// Test that when input v1 and v2 are orthogonal, SimHash(v1) and SimHash(v2) +// have approximately half their bits in common. +TEST(SimHashTest, CosineSimilarity_OrthogonalInput) { + const size_t dimensions = 50u; + const size_t repeat_times = 100u; + + // Generate v1 and v2 that are likely orthogonal + LargeBitVector v1 = RandLargeBitVector(1u); + LargeBitVector v2 = RandLargeBitVector(1u); + + size_t num_output_bits_equal = + MultipleSimHashGetNumOutputBitsEqual(repeat_times, dimensions, v1, v2); + float avg = 1.0 * num_output_bits_equal / dimensions / repeat_times; + float expectation = 0.5; + + // Verify that SimHash(v1) and SimHash(v2) have approximately half their bits + // in common. + const double err_tolerance = 0.03; + EXPECT_LT(avg, expectation + err_tolerance); + EXPECT_GT(avg, expectation - err_tolerance); +} + +} // namespace + +} // namespace federated_learning |