diff options
Diffstat (limited to 'chromium/components/federated_learning/sim_hash.cc')
-rw-r--r-- | chromium/components/federated_learning/sim_hash.cc | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/chromium/components/federated_learning/sim_hash.cc b/chromium/components/federated_learning/sim_hash.cc new file mode 100644 index 00000000000..8012adfbbe2 --- /dev/null +++ b/chromium/components/federated_learning/sim_hash.cc @@ -0,0 +1,93 @@ +// Copyright 2020 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/federated_learning/sim_hash.h" + +#include "base/hash/legacy_hash.h" + +#include <algorithm> +#include <cmath> + +namespace federated_learning { + +namespace { + +uint64_t g_seed1 = 1ULL; +uint64_t g_seed2 = 2ULL; +constexpr double kTwoPi = 2.0 * 3.141592653589793; + +// Hashes i and j to create a uniform RV in [0,1]. +double RandomUniform(uint64_t i, uint64_t j, uint64_t seed) { + uint64_t arr[2] = {i, j}; + uint64_t hashed = base::legacy::CityHash64WithSeed( + base::as_bytes( + base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))), + seed); + + return static_cast<double>(hashed) / + static_cast<double>(std::numeric_limits<uint64_t>::max()); +} + +// Hashes i and j to create two uniform RVs in [0,1]. Uses the Box-Muller +// transform to generate a Gaussian. +double RandomGaussian(uint64_t i, uint64_t j) { + double rv1 = RandomUniform(i, j, g_seed1); + double rv2 = RandomUniform(j, i, g_seed2); + + // BoxMuller + return std::sqrt(-2.0 * std::log(rv1)) * std::cos(kTwoPi * rv2); +} + +} // namespace + +LargeBitVector::LargeBitVector() = default; +LargeBitVector::LargeBitVector(const LargeBitVector&) = default; +LargeBitVector::~LargeBitVector() = default; + +void LargeBitVector::SetBit(uint64_t pos) { + positions_of_set_bits_.insert(pos); +} + +const std::set<uint64_t>& LargeBitVector::PositionsOfSetBits() const { + return positions_of_set_bits_; +} + +void SetSeedsForTesting(uint64_t seed1, uint64_t seed2) { + g_seed1 = seed1; + g_seed2 = seed2; +} + +uint64_t SimHashBits(const LargeBitVector& input, size_t output_dimensions) { + DCHECK_LT(0u, output_dimensions); + DCHECK_LE(output_dimensions, 64u); + + uint64_t result = 0; + for (size_t d = 0; d < output_dimensions; ++d) { + double acc = 0; + + for (uint64_t pos : input.PositionsOfSetBits()) + acc += RandomGaussian(d, pos); + + if (acc > 0) + result |= (1ULL << d); + } + + return result; +} + +uint64_t SimHashStrings(const std::unordered_set<std::string>& input, + size_t output_dimensions) { + DCHECK_LT(0u, output_dimensions); + DCHECK_LE(output_dimensions, 64u); + + LargeBitVector large_bit_vector; + for (const std::string& s : input) { + large_bit_vector.SetBit( + base::legacy::CityHash64(base::as_bytes(base::make_span(s)))); + } + + return SimHashBits(large_bit_vector, output_dimensions); +} + +} // namespace federated_learning |