// Copyright 2020 The Chromium Authors. All rights reserved.
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.

#ifndef THIRD_PARTY_LIBURLPATTERN_TOKENIZE_H_
#define THIRD_PARTY_LIBURLPATTERN_TOKENIZE_H_

#include <stddef.h>

#include <iosfwd>
#include <vector>
#include "base/component_export.h"
#include "third_party/abseil-cpp/absl/status/statusor.h"
#include "third_party/abseil-cpp/absl/strings/string_view.h"

namespace liburlpattern {

enum class TokenType {
  // Open a scope with a '{'.
  kOpen,

  // Close a scope with a '}'.
  kClose,

  // A regular expression group like '(...)'.
  kRegex,

  // A named group like ':foo'.
  kName,

  // A single character.
  kChar,

  // A character escaped with a preceding '\'.
  kEscapedChar,

  // A '+' or '?' modifier.
  kOtherModifier,

  // A '*' character which can be a wildcard or modifier.
  kAsterisk,

  // The end of the token stream.
  kEnd,

  // A character that is not valid in a properly formed pattern; e.g. the colon
  // in `https://`.  This is only generated when TokenizePolicy::kLenient is
  // used.
  kInvalidChar,
};
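
// Illustrative sketch (not part of the API): a pattern like "/:id(\\d+)?"
// would produce a token stream along the lines of
//
//   kChar '/', kName ':id', kRegex '(\d+)', kOtherModifier '?', kEnd
//
// The exact token values (e.g. whether the ':' or the parentheses are
// included in |value|) are determined by the tokenizer implementation.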

const char* TokenTypeToString(TokenType type);

// Simple structure representing a single lexical token.
struct COMPONENT_EXPORT(LIBURLPATTERN) Token {
  // Indicate the token type.
  TokenType type = TokenType::kEnd;

  // Index of the start of this token in the original pattern string.
  size_t index = 0;

  // The value of the token.  May be one or many characters depending on type.
  // May be empty (zero characters) for the kEnd type.
  absl::string_view value;

  Token(TokenType t, size_t i, absl::string_view v)
      : type(t), index(i), value(v) {}
  Token() = default;
};
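
// Illustrative sketch: building a Token by hand, e.g. for a test.  This
// assumes the pattern string outlives the token, since |value| is a
// string_view into it.
//
//   absl::string_view pattern = "/foo";
//   Token slash(TokenType::kChar, /*i=*/0, pattern.substr(0, 1));
//   Token end(TokenType::kEnd, /*i=*/4, absl::string_view());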

enum class TokenizePolicy {
  // The strict policy causes any problems found during tokenization to be
  // thrown as errors.
  kStrict,

  // The lenient policy converts problems detected during tokenization into
  // kInvalidChar tokens in the returned token list.  For a dangling `\` at
  // the end of the string, the `\` character itself is returned.  For
  // validation errors that render a group invalid, the first character of
  // the group is returned instead.  For example, `https://` returns the `:`
  // as a kInvalidChar.  For `(foo(bar))`, where the nested capture group is
  // illegal, the first `(` is returned as a kInvalidChar.  Tokenization then
  // continues with the character following the kInvalidChar.
  kLenient,
};
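
// A sketch of the behavioral difference, per the comments above:
//
//   // Strict: the stray ':' makes the whole pattern an error.
//   auto strict = Tokenize("https://", TokenizePolicy::kStrict);
//   // strict.ok() is false.
//
//   // Lenient: the ':' is surfaced as a kInvalidChar token instead.
//   auto lenient = Tokenize("https://", TokenizePolicy::kLenient);
//   // lenient.ok() is true.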

COMPONENT_EXPORT(LIBURLPATTERN)
inline bool operator==(const Token& lh, const Token& rh) {
  return lh.type == rh.type && lh.index == rh.index && lh.value == rh.value;
}

inline bool operator!=(const Token& lh, const Token& rh) {
  return !(lh == rh);
}

COMPONENT_EXPORT(LIBURLPATTERN)
std::ostream& operator<<(std::ostream& o, Token token);

// Split the given input pattern string into a list of lexical tokens.  Note
// that the generated Token objects simply reference positions within the
// input |pattern|; the |pattern| string must be kept alive for as long as
// the Token objects are in use.
COMPONENT_EXPORT(LIBURLPATTERN)
absl::StatusOr<std::vector<Token>> Tokenize(
    absl::string_view pattern,
    TokenizePolicy policy = TokenizePolicy::kStrict);
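
// A minimal usage sketch (hypothetical caller; note the lifetime requirement
// above, since each Token's |value| views into |pattern|):
//
//   std::string pattern = "/books/:id";
//   absl::StatusOr<std::vector<Token>> result = Tokenize(pattern);
//   if (!result.ok()) {
//     // Inspect result.status() for the tokenization error.
//   } else {
//     for (const Token& token : *result) {
//       // token.value remains valid only while |pattern| is alive.
//     }
//   }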

}  // namespace liburlpattern

#endif  // THIRD_PARTY_LIBURLPATTERN_TOKENIZE_H_