summaryrefslogtreecommitdiff
path: root/chromium/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
blob: 62bf64b27222563ffebbf4fae078689da026de9e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"

#include <algorithm>
#include <map>
#include <string>
#include <utility>

#include "base/check.h"
#include "base/debug/alias.h"
#include "base/debug/dump_without_crashing.h"
#include "base/feature_list.h"
#include "base/i18n/case_conversion.h"
#include "base/i18n/char_iterator.h"
#include "base/strings/strcat.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/core/browser/data_model/autofill_profile_comparator.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"
#include "components/autofill/core/browser/data_model/borrowed_transliterator.h"
#include "components/autofill/core/common/autofill_features.h"

namespace autofill {
namespace structured_address {

// Stores |status| and takes ownership of |additional_tokens|.
SortedTokenComparisonResult::SortedTokenComparisonResult(
    SortedTokenComparisonStatus status,
    std::vector<AddressToken> additional_tokens)
    // |additional_tokens| is a by-value sink parameter, so it is moved into
    // the member rather than copied a second time.
    : status(status), additional_tokens(std::move(additional_tokens)) {}

// No custom teardown is needed; the compiler-generated destructor is used.
SortedTokenComparisonResult::~SortedTokenComparisonResult() = default;

// Copy construction uses the compiler-generated member-wise copy of |other|.
SortedTokenComparisonResult::SortedTokenComparisonResult(
    const SortedTokenComparisonResult& other) = default;

// Returns true if the status is SUBSET and exactly one additional token was
// recorded.
bool SortedTokenComparisonResult::IsSingleTokenSubset() const {
  if (status != SUBSET)
    return false;
  return additional_tokens.size() == 1;
}

// Returns true if the status is SUPERSET and exactly one additional token was
// recorded.
bool SortedTokenComparisonResult::IsSingleTokenSuperset() const {
  if (status != SUPERSET)
    return false;
  return additional_tokens.size() == 1;
}

// Returns true if the status is either SUBSET or SUPERSET.
bool SortedTokenComparisonResult::OneIsSubset() const {
  if (status == SUBSET)
    return true;
  return status == SUPERSET;
}

// Returns true for every status except DISTINCT.
bool SortedTokenComparisonResult::ContainEachOther() const {
  const bool is_distinct = (status == DISTINCT);
  return !is_distinct;
}

// Reports whether the feature
// |kAutofillEnableSupportForMoreStructureInNames| is currently enabled.
bool StructuredNamesEnabled() {
  const auto& names_feature =
      features::kAutofillEnableSupportForMoreStructureInNames;
  return base::FeatureList::IsEnabled(names_feature);
}

// Reports whether the feature
// |kAutofillEnableSupportForMoreStructureInAddresses| is currently enabled.
bool StructuredAddressesEnabled() {
  const auto& addresses_feature =
      features::kAutofillEnableSupportForMoreStructureInAddresses;
  return base::FeatureList::IsEnabled(addresses_feature);
}

// The cache needs no custom setup; the compiler-generated default constructor
// is used.
Re2RegExCache::Re2RegExCache() = default;

// static
// Returns the single shared cache object. It lives in a function-local static
// wrapped in base::NoDestructor.
Re2RegExCache* Re2RegExCache::Instance() {
  static base::NoDestructor<Re2RegExCache> instance;
  return instance.get();
}

// Returns the compiled expression for |pattern|. The expression is compiled on
// the first request and served from |regex_map_| afterwards. The returned
// pointer refers to an object owned by the cache.
const RE2* Re2RegExCache::GetRegEx(const std::string& pattern) {
  // Every access to |regex_map_| happens while |lock_| is held.
  base::AutoLock lock(lock_);

  auto cached = regex_map_.find(pattern);
  if (cached == regex_map_.end()) {
    // Cache miss: compile the pattern and remember the outcome. The insertion
    // is expected to succeed because the lookup above just failed.
    auto inserted =
        regex_map_.emplace(pattern, BuildRegExFromPattern(pattern));
    DCHECK(inserted.second);
    cached = inserted.first;
  }

  return cached->second.get();
}

// The cache needs no custom setup; the compiler-generated default constructor
// is used.
RewriterCache::RewriterCache() = default;

// static
// Returns the single shared cache object. It lives in a function-local static
// wrapped in base::NoDestructor.
RewriterCache* RewriterCache::GetInstance() {
  static base::NoDestructor<RewriterCache> instance;
  return instance.get();
}

// static
// Normalizes |text| with NormalizeValue() and applies the cached rewriter for
// |country_code| to the normalized text.
base::string16 RewriterCache::Rewrite(const base::string16& country_code,
                                      const base::string16& text) {
  const AddressRewriter& rewriter = GetInstance()->GetRewriter(country_code);
  return rewriter.Rewrite(NormalizeValue(text));
}

// Returns the rewriter for |country_code|. The rewriter is created on the
// first request and served from |rewriter_map_| afterwards. The returned
// reference points at the entry stored in the map.
const AddressRewriter& RewriterCache::GetRewriter(
    const base::string16& country_code) {
  // Every access to |rewriter_map_| happens while |lock_| is held.
  base::AutoLock lock(lock_);

  auto cached = rewriter_map_.find(country_code);
  if (cached == rewriter_map_.end()) {
    // Cache miss: create the rewriter for this country and store it. The
    // insertion is expected to succeed because the lookup above just failed.
    auto inserted = rewriter_map_.emplace(
        country_code, AddressRewriter::ForCountryCode(country_code));
    DCHECK(inserted.second);
    cached = inserted.first;
  }

  return cached->second;
}

std::unique_ptr<const RE2> BuildRegExFromPattern(const std::string& pattern) {
  RE2::Options opt;
  // By default, patters are case sensitive.
  // Note that, the named-capture-group patterns build with
  // |CaptureTypeWithPattern()| apply a flag to make the matching case
  // insensitive.
  opt.set_case_sensitive(true);

  auto regex = std::make_unique<const RE2>(pattern, opt);

  if (!regex->ok()) {
    DEBUG_ALIAS_FOR_CSTR(pattern_copy, pattern.c_str(), 128);
    base::debug::DumpWithoutCrashing();
  }

  return regex;
}

// Returns true if |name| partially matches the expression identified by
// RegEx::kMatchCjkNameCharacteristics.
bool HasCjkNameCharacteristics(const std::string& name) {
  const bool matches_cjk =
      IsPartialMatch(name, RegEx::kMatchCjkNameCharacteristics);
  return matches_cjk;
}

// Returns true if |middle_name| partially matches the expression identified
// by RegEx::kMatchMiddleNameInitialsCharacteristics.
bool HasMiddleNameInitialsCharacteristics(const std::string& middle_name) {
  const bool matches_initials = IsPartialMatch(
      middle_name, RegEx::kMatchMiddleNameInitialsCharacteristics);
  return matches_initials;
}

// Returns true if |name| shows one of two indicators for a Hispanic/Latinx
// name: it contains one of the most common Hispanic/Latinx last names, or it
// contains a last name conjunction. Without either indicator there is not
// sufficient reason to assume such a name, and false is returned.
bool HasHispanicLatinxNameCharaceristics(const std::string& name) {
  // The conjunction check only runs when the common-name check did not match.
  return IsPartialMatch(name,
                        RegEx::kMatchHispanicCommonNameCharacteristics) ||
         IsPartialMatch(
             name, RegEx::kMatchHispanicLastNameConjuctionCharacteristics);
}

bool ParseValueByRegularExpression(
    const std::string& value,
    const std::string& pattern,
    std::map<std::string, std::string>* result_map) {
  DCHECK(result_map);

  const RE2* regex = Re2RegExCache::Instance()->GetRegEx(pattern);

  return ParseValueByRegularExpression(value, regex, result_map);
}

// Fully matches |value| against |regex| and, on success, writes the content of
// every named capturing group into |result_map|, keyed by the group name.
// Returns false if |regex| is null, is not a valid expression, or does not
// match the whole of |value|; |result_map| is left untouched in those cases.
bool ParseValueByRegularExpression(
    const std::string& value,
    const RE2* regex,
    std::map<std::string, std::string>* result_map) {
  if (!regex || !regex->ok())
    return false;

  // Number of capturing groups in the expression. The implicit group for the
  // full match is not part of this count and is not requested from RE2.
  const int group_count = regex->NumberOfCapturingGroups();
  const size_t slot_count = static_cast<size_t>(group_count);

  // RE2 writes the capture of group i (1-based) into the i-th argument, which
  // is stored at index i - 1 of the vectors below.
  std::vector<std::string> results(slot_count);
  std::vector<RE2::Arg> match_results(slot_count);
  std::vector<RE2::Arg*> match_results_ptr(slot_count);
  for (size_t i = 0; i < slot_count; ++i) {
    match_results[i] = &results[i];
    match_results_ptr[i] = &match_results[i];
  }

  if (!RE2::FullMatchN(value, *regex, match_results_ptr.data(), group_count))
    return false;

  // Named groups are reported with their 1-based group index, hence the - 1
  // when looking up the captured string. The map entries are taken by
  // reference to avoid copying each name/index pair.
  for (const auto& named_group : regex->NamedCapturingGroups()) {
    (*result_map)[named_group.first] =
        std::move(results.at(named_group.second - 1));
  }

  return true;
}

bool IsPartialMatch(const std::string& value, RegEx regex) {
  return IsPartialMatch(
      value, StructuredAddressesRegExProvider::Instance()->GetRegEx(regex));
}

bool IsPartialMatch(const std::string& value, const std::string& pattern) {
  return IsPartialMatch(value, Re2RegExCache::Instance()->GetRegEx(pattern));
}

bool IsPartialMatch(const std::string& value, const RE2* expression) {
  return RE2::PartialMatch(value, *expression);
}

// Scans |value| from left to right for non-overlapping matches of |pattern|
// and returns, for each match, the string captured by the single argument
// passed to RE2 (the first capturing group of |pattern|).
// Returns an empty vector if the pattern cannot be compiled.
std::vector<std::string> GetAllPartialMatches(const std::string& value,
                                              const std::string& pattern) {
  const RE2* regex = Re2RegExCache::Instance()->GetRegEx(pattern);
  if (!regex || !regex->ok())
    return {};

  re2::StringPiece input(value);
  std::string match;
  std::vector<std::string> matches;
  size_t remaining = input.size();
  while (re2::RE2::FindAndConsume(&input, *regex, &match)) {
    // A match that consumes nothing (possible when |pattern| can match the
    // empty string) would repeat forever on the same position. Stop scanning
    // in that case to guarantee termination.
    if (input.size() == remaining)
      break;
    remaining = input.size();
    matches.emplace_back(match);
  }
  return matches;
}

// Returns the names of all placeholders of the form ${NAME} found in |value|,
// where NAME consists of one or more word characters. Only the name inside the
// braces is returned for each placeholder.
std::vector<std::string> ExtractAllPlaceholders(const std::string& value) {
  static const char kPlaceholderPattern[] = "\\${([\\w]+)}";
  return GetAllPartialMatches(value, kPlaceholderPattern);
}

// Wraps |value| into the placeholder syntax, i.e. returns "${" + value + "}".
std::string GetPlaceholderToken(const std::string& value) {
  std::string token = base::StrCat({"${", value, "}"});
  return token;
}

std::string CaptureTypeWithPattern(
    const ServerFieldType& type,
    std::initializer_list<base::StringPiece> pattern_span_initializer_list) {
  return CaptureTypeWithPattern(type, pattern_span_initializer_list,
                                CaptureOptions());
}

// Concatenates the pieces in |pattern_span_initializer_list| into a single
// pattern string and forwards it, together with |type| and |options|, to the
// overload that takes the pattern as one string.
std::string CaptureTypeWithPattern(
    const ServerFieldType& type,
    std::initializer_list<base::StringPiece> pattern_span_initializer_list,
    const CaptureOptions& options) {
  const std::string joined_pattern =
      base::StrCat(base::make_span(pattern_span_initializer_list));
  return CaptureTypeWithPattern(type, joined_pattern, options);
}

// Builds a regular expression of the shape
//   (?i:PREFIX(?P<TYPE>PATTERN)SUFFIX(?:SEPARATOR)+)QUANTIFIER
// where TYPE is the string form of |type|, SEPARATOR is |options.separator|
// and QUANTIFIER is derived from |options.quantifier|.
std::string CaptureTypeWithAffixedPattern(const ServerFieldType& type,
                                          const std::string& prefix,
                                          const std::string& pattern,
                                          const std::string& suffix,
                                          const CaptureOptions& options) {
  // Defaults to no quantifier, which leaves the match required.
  const char* quantifier = "";
  switch (options.quantifier) {
    case MATCH_OPTIONAL:
      // The whole group may be absent.
      quantifier = "?";
      break;
    case MATCH_LAZY_OPTIONAL:
      // Lazy variant: the group is skipped whenever that is possible.
      quantifier = "??";
      break;
    case MATCH_REQUIRED:
      break;
  }

  // The named group that captures the value for |type|.
  const std::string type_name = AutofillType(type).ToString();
  const std::string named_group =
      base::StrCat({"(?P<", type_name, ">", pattern, ")"});

  // The leading "i" flag makes the capturing case insensitive. The separator
  // may repeat so that inputs such as ", " are accepted.
  return base::StrCat({"(?i:", prefix, named_group, suffix,
                       "(?:", options.separator, ")+)", quantifier});
}

// Builds the capture expression for |type| with |suffix_pattern| placed after
// the captured |pattern| and with an empty prefix.
std::string CaptureTypeWithSuffixedPattern(const ServerFieldType& type,
                                           const std::string& pattern,
                                           const std::string& suffix_pattern,
                                           const CaptureOptions& options) {
  const std::string no_prefix;
  return CaptureTypeWithAffixedPattern(type, no_prefix, pattern,
                                       suffix_pattern, options);
}

// Builds the capture expression for |type| with |prefix_pattern| placed before
// the captured |pattern| and with an empty suffix.
std::string CaptureTypeWithPrefixedPattern(const ServerFieldType& type,
                                           const std::string& prefix_pattern,
                                           const std::string& pattern,
                                           const CaptureOptions& options) {
  const std::string no_suffix;
  return CaptureTypeWithAffixedPattern(type, prefix_pattern, pattern,
                                       no_suffix, options);
}

// Builds the capture expression for |type| around |pattern| with neither a
// prefix nor a suffix.
std::string CaptureTypeWithPattern(const ServerFieldType& type,
                                   const std::string& pattern,
                                   CaptureOptions options) {
  const std::string no_prefix;
  const std::string no_suffix;
  return CaptureTypeWithAffixedPattern(type, no_prefix, pattern, no_suffix,
                                       options);
}

// Normalizes |value| via AutofillProfileComparator::NormalizeForComparison().
// |keep_white_space| selects between the RETAIN_WHITESPACE and
// DISCARD_WHITESPACE modes of that function.
base::string16 NormalizeValue(base::StringPiece16 value,
                              bool keep_white_space) {
  if (keep_white_space) {
    return AutofillProfileComparator::NormalizeForComparison(
        value, AutofillProfileComparator::RETAIN_WHITESPACE);
  }
  return AutofillProfileComparator::NormalizeForComparison(
      value, AutofillProfileComparator::DISCARD_WHITESPACE);
}

// Returns true if |one| and |other| yield equal sorted token lists after
// being tokenized with TokenizeValue().
bool AreStringTokenEquivalent(const base::string16& one,
                              const base::string16& other) {
  const std::vector<AddressToken> one_tokens = TokenizeValue(one);
  const std::vector<AddressToken> other_tokens = TokenizeValue(other);
  return AreSortedTokensEqual(one_tokens, other_tokens);
}

// Compares two token lists that are sorted by their normalized values.
// The result status is
//   MATCH    if both lists contain the same normalized tokens,
//   SUPERSET if |first| contains all tokens of |second| plus further ones,
//   SUBSET   if |second| contains all tokens of |first| plus further ones,
//   DISTINCT otherwise.
// For SUPERSET and SUBSET the result additionally carries the tokens that are
// present in only one of the two lists.
SortedTokenComparisonResult CompareSortedTokens(
    const std::vector<AddressToken>& first,
    const std::vector<AddressToken>& second) {
  // Orders two AddressTokens by their normalized values.
  auto cmp_normalized = [](const auto& a, const auto& b) {
    return a.normalized_value < b.normalized_value;
  };

  // The set algorithms below require both inputs to be sorted.
  DCHECK(std::is_sorted(first.begin(), first.end(), cmp_normalized) &&
         std::is_sorted(second.begin(), second.end(), cmp_normalized));

  const bool is_superset = std::includes(
      first.begin(), first.end(), second.begin(), second.end(), cmp_normalized);
  const bool is_subset = std::includes(
      second.begin(), second.end(), first.begin(), first.end(), cmp_normalized);

  // If |first| is both a superset and a subset, the two lists are the same.
  if (is_superset && is_subset)
    return SortedTokenComparisonResult(MATCH);

  // If it is neither, the two lists are distinct.
  if (!is_superset && !is_subset)
    return SortedTokenComparisonResult(DISTINCT);

  // Exactly one list contains the other here, so the symmetric difference
  // consists solely of the extra tokens of the larger list.
  std::vector<AddressToken> additional_tokens;
  std::set_symmetric_difference(
      first.begin(), first.end(), second.begin(), second.end(),
      std::back_inserter(additional_tokens), cmp_normalized);

  // The local vector is not needed afterwards, so it is moved into the result
  // instead of being copied.
  return SortedTokenComparisonResult(is_superset ? SUPERSET : SUBSET,
                                     std::move(additional_tokens));
}

// Tokenizes |first| and |second| with TokenizeValue() and compares the
// resulting sorted token lists.
SortedTokenComparisonResult CompareSortedTokens(const base::string16& first,
                                                const base::string16& second) {
  const std::vector<AddressToken> first_tokens = TokenizeValue(first);
  const std::vector<AddressToken> second_tokens = TokenizeValue(second);
  return CompareSortedTokens(first_tokens, second_tokens);
}

bool AreSortedTokensEqual(const std::vector<AddressToken>& first,
                          const std::vector<AddressToken>& second) {
  return CompareSortedTokens(first, second).status == MATCH;
}

// Splits |value| into tokens and returns them sorted lexicographically by
// their normalized value. Each token records its original text, its
// normalized text and its position in the order of appearance.
//
// Values with CJK name characteristics are tokenized character by character,
// skipping separator characters; the characters are used as-is for the
// normalized value. All other values are split at commas, spaces and newlines
// into non-empty, whitespace-trimmed tokens that are normalized with
// NormalizeValue().
std::vector<AddressToken> TokenizeValue(const base::string16 value) {
  std::vector<AddressToken> tokens;
  int index = 0;

  // CJK names are a special case and are tokenized by character without the
  // separators.
  if (HasCjkNameCharacteristics(base::UTF16ToUTF8(value))) {
    // The separator set is loop-invariant, so it is converted only once.
    const base::string16 cjk_separators = base::UTF8ToUTF16("・·  ");
    tokens.reserve(value.size());
    // NOTE(review): this walks UTF-16 code units, so a character outside the
    // BMP would be split into two tokens - confirm whether that can occur for
    // the names handled here.
    for (size_t i = 0; i < value.size(); i++) {
      const base::string16 character = value.substr(i, 1);
      if (cjk_separators.find(character) != base::string16::npos)
        continue;
      tokens.emplace_back(AddressToken{.value = character,
                                       .normalized_value = character,
                                       .position = index++});
    }
  } else {
    // Split it by white spaces and commas into non-empty values.
    for (const auto& token :
         base::SplitString(value, base::ASCIIToUTF16(", \n"),
                           base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY)) {
      tokens.emplace_back(
          AddressToken{.value = token,
                       .normalized_value = NormalizeValue(token),
                       .position = index++});
    }
  }
  // Sort the tokens lexicographically by their normalized value.
  std::sort(tokens.begin(), tokens.end(), [](const auto& a, const auto& b) {
    return a.normalized_value < b.normalized_value;
  });

  return tokens;
}

}  // namespace structured_address
}  // namespace autofill