summaryrefslogtreecommitdiff
path: root/chromium/components/url_formatter/top_domains/top_domain_state_generator.cc
blob: c4e90d350a22345fe304cc175d214f9804fdb85d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_formatter/top_domains/top_domain_state_generator.h"

#include <cstdint>
#include <memory>
#include <string>

#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "net/tools/huffman_trie/huffman/huffman_builder.h"
#include "net/tools/huffman_trie/trie/trie_bit_buffer.h"
#include "net/tools/huffman_trie/trie/trie_writer.h"

using net::huffman_trie::HuffmanRepresentationTable;
using net::huffman_trie::HuffmanBuilder;
using net::huffman_trie::TrieWriter;

namespace url_formatter {

namespace top_domains {

namespace {

static const char kNewLine[] = "\n";
static const char kIndent[] = "  ";

// Replaces the first occurrence of "[[" + name + "]]" in |*tpl| with
// |value|.
bool ReplaceTag(const std::string& name,
                const std::string& value,
                std::string* tpl) {
  std::string tag = "[[" + name + "]]";

  size_t start_pos = tpl->find(tag);
  if (start_pos == std::string::npos) {
    return false;
  }

  tpl->replace(start_pos, tag.length(), value);
  return true;
}

// Formats the bytes in |bytes| as an C++ array initializer and returns the
// resulting string.
std::string FormatVectorAsArray(const std::vector<uint8_t>& bytes) {
  std::string output = "{";
  output.append(kNewLine);
  output.append(kIndent);
  output.append(kIndent);

  size_t bytes_on_current_line = 0;

  for (size_t i = 0; i < bytes.size(); ++i) {
    base::StringAppendF(&output, "0x%02x,", bytes[i]);

    bytes_on_current_line++;
    if (bytes_on_current_line >= 12 && (i + 1) < bytes.size()) {
      output.append(kNewLine);
      output.append(kIndent);
      output.append(kIndent);

      bytes_on_current_line = 0;
    } else if ((i + 1) < bytes.size()) {
      output.append(" ");
    }
  }

  output.append(kNewLine);
  output.append("}");

  return output;
}

HuffmanRepresentationTable ApproximateHuffman(const TopDomainEntries& entries) {
  HuffmanBuilder huffman_builder;
  for (const auto& entry : entries) {
    for (const auto& c : entry->skeleton) {
      huffman_builder.RecordUsage(c);
    }
    for (const auto& c : entry->top_domain) {
      huffman_builder.RecordUsage(c);
    }
    huffman_builder.RecordUsage(net::huffman_trie::kTerminalValue);
    huffman_builder.RecordUsage(net::huffman_trie::kEndOfTableValue);
  }

  return huffman_builder.ToTable();
}

}  // namespace

TopDomainStateGenerator::TopDomainStateGenerator() = default;

TopDomainStateGenerator::~TopDomainStateGenerator() = default;

std::string TopDomainStateGenerator::Generate(
    const std::string& preload_template,
    const TopDomainEntries& entries) {
  std::string output = preload_template;

  // The trie generation process for the whole data is run twice, the first time
  // using an approximate Huffman table. During this first run, the correct
  // character frequencies are collected which are then used to calculate the
  // most space efficient Huffman table for the given inputs. This table is used
  // for the second run.

  HuffmanRepresentationTable approximate_table = ApproximateHuffman(entries);
  HuffmanBuilder huffman_builder;

  // Create trie entries for the first pass.
  std::vector<std::unique_ptr<TopDomainTrieEntry>> trie_entries;
  std::vector<net::huffman_trie::TrieEntry*> raw_trie_entries;
  for (const auto& entry : entries) {
    auto trie_entry = std::make_unique<TopDomainTrieEntry>(
        approximate_table, &huffman_builder, entry.get());
    raw_trie_entries.push_back(trie_entry.get());
    trie_entries.push_back(std::move(trie_entry));
  }

  TrieWriter writer(approximate_table, &huffman_builder);
  uint32_t root_position;
  if (!writer.WriteEntries(raw_trie_entries, &root_position))
    return std::string();

  HuffmanRepresentationTable optimal_table = huffman_builder.ToTable();
  TrieWriter new_writer(optimal_table, &huffman_builder);

  // Create trie entries using the optimal table for the second pass.
  raw_trie_entries.clear();
  trie_entries.clear();
  for (const auto& entry : entries) {
    auto trie_entry = std::make_unique<TopDomainTrieEntry>(
        optimal_table, &huffman_builder, entry.get());
    raw_trie_entries.push_back(trie_entry.get());
    trie_entries.push_back(std::move(trie_entry));
  }

  if (!new_writer.WriteEntries(raw_trie_entries, &root_position))
    return std::string();

  uint32_t new_length = new_writer.position();
  std::vector<uint8_t> huffman_tree = huffman_builder.ToVector();
  new_writer.Flush();

  ReplaceTag("HUFFMAN_TREE", FormatVectorAsArray(huffman_tree), &output);

  ReplaceTag("TOP_DOMAINS_TRIE", FormatVectorAsArray(new_writer.bytes()),
             &output);

  ReplaceTag("TOP_DOMAINS_TRIE_BITS", base::NumberToString(new_length),
             &output);
  ReplaceTag("TOP_DOMAINS_TRIE_ROOT", base::NumberToString(root_position),
             &output);

  return output;
}

}  // namespace top_domains

}  // namespace url_formatter