summaryrefslogtreecommitdiff
path: root/chromium/third_party/blink/renderer/platform/text/text_break_iterator.h
blob: d5bff1697492934539af7328fda964e71a038a6b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
/*
 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_TEXT_BREAK_ITERATOR_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_TEXT_BREAK_ITERATOR_H_

#include "base/macros.h"
#include "third_party/blink/renderer/platform/platform_export.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"

#include <unicode/brkiter.h>

namespace blink {

typedef icu::BreakIterator TextBreakIterator;

// Note: The returned iterator is good only until you get another iterator, with
// the exception of acquireLineBreakIterator.

// This is similar to character break iterator in most cases, but is subject to
// platform UI conventions. One notable example where this can be different
// from character break iterator is Thai prepend characters, see bug 24342.
// Use this for insertion point and selection manipulations.
PLATFORM_EXPORT TextBreakIterator* CursorMovementIterator(const UChar*,
                                                          int length);

PLATFORM_EXPORT TextBreakIterator* WordBreakIterator(const String&,
                                                     int start,
                                                     int length);
PLATFORM_EXPORT TextBreakIterator* WordBreakIterator(const UChar*, int length);
PLATFORM_EXPORT TextBreakIterator* AcquireLineBreakIterator(
    const LChar*,
    int length,
    const AtomicString& locale,
    const UChar* prior_context,
    unsigned prior_context_length);
PLATFORM_EXPORT TextBreakIterator* AcquireLineBreakIterator(
    const UChar*,
    int length,
    const AtomicString& locale,
    const UChar* prior_context,
    unsigned prior_context_length);
PLATFORM_EXPORT void ReleaseLineBreakIterator(TextBreakIterator*);
PLATFORM_EXPORT TextBreakIterator* SentenceBreakIterator(const UChar*,
                                                         int length);

// Before calling this, check if the iterator is not at the end. Otherwise,
// it may not work as expected.
// See https://ssl.icu-project.org/trac/ticket/13447 .
PLATFORM_EXPORT bool IsWordTextBreak(TextBreakIterator*);

const int kTextBreakDone = -1;

enum class LineBreakType {
  kNormal,

  // word-break:break-all allows breaks between letters/numbers, but prohibits
  // break before/after certain punctuation.
  kBreakAll,

  // Allows breaks at every grapheme cluster boundary.
  // Terminal style line breaks described in UAX#14: Examples of Customization
  // http://unicode.org/reports/tr14/#Examples
  // CSS is discussing to add this feature crbug.com/720205
  // Used internally for word-break:break-word.
  kBreakCharacter,

  // word-break:keep-all doesn't allow breaks between all kind of
  // letters/numbers except some south east asians'.
  kKeepAll,
};

// Determines break opportunities around collapsible space characters (space,
// newline, and tabulation characters.)
enum class BreakSpaceType {
  // Break before every collapsible space character.
  // This is a specialized optimization for CSS, where leading/trailing spaces
  // in each line are removed, and thus breaking before spaces can save
  // computing hanging spaces.
  // Callers are expected to handle spaces by themselves. Because a run of
  // spaces can include different types of spaces, break opportunity is given
  // for every space character.
  // Pre-LayoutNG line breaker uses this type.
  kBeforeEverySpace,

  // Break before a run of white space characters.
  // This is for CSS line breaking as in |kBeforeEverySpace|, but when
  // whitespace collapsing is already applied to the target string. In this
  // case, a run of white spaces are preserved spaces. There should not be break
  // opportunities between white spaces.
  // LayoutNG line breaker uses this type.
  kBeforeSpaceRun,
};

PLATFORM_EXPORT std::ostream& operator<<(std::ostream&, LineBreakType);
PLATFORM_EXPORT std::ostream& operator<<(std::ostream&, BreakSpaceType);

class PLATFORM_EXPORT LazyLineBreakIterator final {
  STACK_ALLOCATED();

 public:
  LazyLineBreakIterator()
      : iterator_(nullptr),
        cached_prior_context_(nullptr),
        cached_prior_context_length_(0),
        break_type_(LineBreakType::kNormal) {
    ResetPriorContext();
  }

  LazyLineBreakIterator(String string,
                        const AtomicString& locale = AtomicString(),
                        LineBreakType break_type = LineBreakType::kNormal)
      : string_(string),
        locale_(locale),
        iterator_(nullptr),
        cached_prior_context_(nullptr),
        cached_prior_context_length_(0),
        break_type_(break_type) {
    ResetPriorContext();
  }

  ~LazyLineBreakIterator() {
    if (iterator_)
      ReleaseLineBreakIterator(iterator_);
  }

  const String& GetString() const { return string_; }

  UChar LastCharacter() const {
    static_assert(WTF_ARRAY_LENGTH(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return prior_context_[1];
  }

  UChar SecondToLastCharacter() const {
    static_assert(WTF_ARRAY_LENGTH(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    return prior_context_[0];
  }

  void SetPriorContext(UChar last, UChar second_to_last) {
    static_assert(WTF_ARRAY_LENGTH(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = second_to_last;
    prior_context_[1] = last;
  }

  void UpdatePriorContext(UChar last) {
    static_assert(WTF_ARRAY_LENGTH(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = prior_context_[1];
    prior_context_[1] = last;
  }

  void ResetPriorContext() {
    static_assert(WTF_ARRAY_LENGTH(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    prior_context_[0] = 0;
    prior_context_[1] = 0;
  }

  unsigned PriorContextLength() const {
    unsigned prior_context_length = 0;
    static_assert(WTF_ARRAY_LENGTH(prior_context_) == 2,
                  "TextBreakIterator has unexpected prior context length");
    if (prior_context_[1]) {
      ++prior_context_length;
      if (prior_context_[0])
        ++prior_context_length;
    }
    return prior_context_length;
  }

  // Obtain text break iterator, possibly previously cached, where this iterator
  // is (or has been) initialized to use the previously stored string as the
  // primary breaking context and using previously stored prior context if
  // non-empty.
  TextBreakIterator* Get(unsigned prior_context_length) const {
    DCHECK(prior_context_length <= kPriorContextCapacity);
    const UChar* prior_context =
        prior_context_length
            ? &prior_context_[kPriorContextCapacity - prior_context_length]
            : nullptr;
    if (!iterator_) {
      if (string_.Is8Bit())
        iterator_ = AcquireLineBreakIterator(
            string_.Characters8(), string_.length(), locale_, prior_context,
            prior_context_length);
      else
        iterator_ = AcquireLineBreakIterator(
            string_.Characters16(), string_.length(), locale_, prior_context,
            prior_context_length);
      cached_prior_context_ = prior_context;
      cached_prior_context_length_ = prior_context_length;
    } else if (prior_context != cached_prior_context_ ||
               prior_context_length != cached_prior_context_length_) {
      ReleaseIterator();
      return Get(prior_context_length);
    }
    return iterator_;
  }

  void ResetStringAndReleaseIterator(String string,
                                     const AtomicString& locale) {
    string_ = string;
    locale_ = locale;

    ReleaseIterator();
  }

  void SetLocale(const AtomicString& locale) {
    if (locale == locale_)
      return;
    locale_ = locale;
    ReleaseIterator();
  }

  LineBreakType BreakType() const { return break_type_; }
  void SetBreakType(LineBreakType break_type) { break_type_ = break_type; }
  BreakSpaceType BreakSpace() const { return break_space_; }
  void SetBreakSpace(BreakSpaceType break_space) { break_space_ = break_space; }

  inline bool IsBreakable(int pos,
                          int& next_breakable,
                          LineBreakType line_break_type) const {
    if (pos > next_breakable) {
      next_breakable = NextBreakablePosition(pos, line_break_type);
    }
    return pos == next_breakable;
  }

  inline bool IsBreakable(int pos, int& next_breakable) const {
    return IsBreakable(pos, next_breakable, break_type_);
  }

  inline bool IsBreakable(int pos) const {
    int next_breakable = -1;
    return IsBreakable(pos, next_breakable, break_type_);
  }

  // Returns the break opportunity at or after |offset|.
  unsigned NextBreakOpportunity(unsigned offset) const;

  // Returns the break opportunity at or before |offset|.
  unsigned PreviousBreakOpportunity(unsigned offset, unsigned min = 0) const;

  static bool IsBreakableSpace(UChar ch) {
    return ch == kSpaceCharacter || ch == kTabulationCharacter ||
           ch == kNewlineCharacter;
  }

 private:
  void ReleaseIterator() const {
    if (iterator_)
      ReleaseLineBreakIterator(iterator_);
    iterator_ = nullptr;
    cached_prior_context_ = nullptr;
    cached_prior_context_length_ = 0;
  }

  template <typename CharacterType, LineBreakType, BreakSpaceType>
  int NextBreakablePosition(int pos, const CharacterType* str) const;
  template <typename CharacterType, LineBreakType>
  int NextBreakablePosition(int pos, const CharacterType* str) const;
  template <LineBreakType>
  int NextBreakablePosition(int pos) const;
  int NextBreakablePositionBreakCharacter(int pos) const;
  int NextBreakablePosition(int pos, LineBreakType) const;

  static const unsigned kPriorContextCapacity = 2;
  String string_;
  AtomicString locale_;
  mutable TextBreakIterator* iterator_;
  UChar prior_context_[kPriorContextCapacity];
  mutable const UChar* cached_prior_context_;
  mutable unsigned cached_prior_context_length_;
  LineBreakType break_type_;
  BreakSpaceType break_space_ = BreakSpaceType::kBeforeEverySpace;
};

// Iterates over "extended grapheme clusters", as defined in UAX #29.
// Note that platform implementations may be less sophisticated - e.g. ICU prior
// to version 4.0 only supports "legacy grapheme clusters".  Use this for
// general text processing, e.g. string truncation.

class PLATFORM_EXPORT NonSharedCharacterBreakIterator final {
  STACK_ALLOCATED();

 public:
  explicit NonSharedCharacterBreakIterator(const String&);
  NonSharedCharacterBreakIterator(const UChar*, unsigned length);
  ~NonSharedCharacterBreakIterator();

  int Next();
  int Current();

  bool IsBreak(int offset) const;
  int Preceding(int offset) const;
  int Following(int offset) const;

  bool operator!() const { return !is8_bit_ && !iterator_; }

 private:
  void CreateIteratorForBuffer(const UChar*, unsigned length);

  unsigned ClusterLengthStartingAt(unsigned offset) const {
    DCHECK(is8_bit_);
    // The only Latin-1 Extended Grapheme Cluster is CR LF
    return IsCRBeforeLF(offset) ? 2 : 1;
  }

  bool IsCRBeforeLF(unsigned offset) const {
    DCHECK(is8_bit_);
    return charaters8_[offset] == '\r' && offset + 1 < length_ &&
           charaters8_[offset + 1] == '\n';
  }

  bool IsLFAfterCR(unsigned offset) const {
    DCHECK(is8_bit_);
    return charaters8_[offset] == '\n' && offset >= 1 &&
           charaters8_[offset - 1] == '\r';
  }

  bool is8_bit_;

  // For 8 bit strings, we implement the iterator ourselves.
  const LChar* charaters8_;
  unsigned offset_;
  unsigned length_;

  // For 16 bit strings, we use a TextBreakIterator.
  TextBreakIterator* iterator_;

  DISALLOW_COPY_AND_ASSIGN(NonSharedCharacterBreakIterator);
};

// Counts the number of grapheme clusters. A surrogate pair or a sequence
// of a non-combining character and following combining characters is
// counted as 1 grapheme cluster.
PLATFORM_EXPORT unsigned NumGraphemeClusters(const String&);

// Returns the number of code units that the next grapheme cluster is made of.
PLATFORM_EXPORT unsigned LengthOfGraphemeCluster(const String&, unsigned = 0);

}  // namespace blink

#endif