summaryrefslogtreecommitdiff
path: root/chromium/third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.cc
blob: a482b0a65f70f8762b2ed37ae5a1bcd14c63b060 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/blink/renderer/platform/fonts/utf16_ragel_iterator.h"

#include "third_party/blink/renderer/platform/text/character.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"

namespace blink {

namespace {

// Maps |codepoint| to the character category fed to the Ragel emoji
// segmentation grammar. Code points that match none of the categories tested
// below are reported as kMaxEmojiScannerCategory.
char EmojiSegmentationCategory(UChar32 codepoint) {
  using Scanner = UTF16RagelIterator;

  // In the ASCII range only keycap bases receive a category of their own.
  if (codepoint <= 0x7F) {
    return Character::IsEmojiKeycapBase(codepoint)
               ? Scanner::KEYCAP_BASE
               : Scanner::kMaxEmojiScannerCategory;
  }

  // The grammar relies on the most specific character classes being matched
  // first and the generic ones last. Start with the code points that are
  // identified by exact value.
  switch (codepoint) {
    case kCombiningEnclosingKeycapCharacter:
      return Scanner::COMBINING_ENCLOSING_KEYCAP;
    case kCombiningEnclosingCircleBackslashCharacter:
      return Scanner::COMBINING_ENCLOSING_CIRCLE_BACKSLASH;
    case kZeroWidthJoinerCharacter:
      return Scanner::ZWJ;
    case kVariationSelector15Character:
      return Scanner::VS15;
    case kVariationSelector16Character:
      return Scanner::VS16;
    case 0x1F3F4:
      // The sole code point treated as a tag base.
      return Scanner::TAG_BASE;
    default:
      break;
  }

  // The tag sequence test is deliberately evaluated before the CANCEL TAG
  // comparison.
  if (Character::IsEmojiTagSequence(codepoint)) {
    return Scanner::TAG_SEQUENCE;
  }
  // http://www.unicode.org/reports/tr51/#def_emoji_tag_sequence
  // defines a TAG_TERM grammar rule for U+E007F CANCEL TAG.
  if (codepoint == kCancelTag) {
    return Scanner::TAG_TERM;
  }

  // Small, specific ranges.
  if (Character::IsEmojiModifierBase(codepoint)) {
    return Scanner::EMOJI_MODIFIER_BASE;
  }
  if (Character::IsModifier(codepoint)) {
    return Scanner::EMOJI_MODIFIER;
  }
  if (Character::IsRegionalIndicator(codepoint)) {
    return Scanner::REGIONAL_INDICATOR;
  }

  // Generic emoji classes come last, presentation defaults before plain EMOJI.
  if (Character::IsEmojiEmojiDefault(codepoint)) {
    return Scanner::EMOJI_EMOJI_PRESENTATION;
  }
  if (Character::IsEmojiTextDefault(codepoint)) {
    return Scanner::EMOJI_TEXT_PRESENTATION;
  }
  if (Character::IsEmoji(codepoint)) {
    return Scanner::EMOJI;
  }

  // Ragel state machine will interpret unknown category as "any".
  return Scanner::kMaxEmojiScannerCategory;
}

}  // namespace

// Moves the iterator to |new_cursor| and recomputes the cached category for
// that position. |new_cursor| must be strictly less than buffer_size_;
// otherwise the CHECK below fails. Returns a reference to this iterator.
UTF16RagelIterator& UTF16RagelIterator::SetCursor(unsigned new_cursor) {
  // No lower-bound CHECK is needed: |new_cursor| is unsigned, so a comparison
  // against 0u could never fail.
  CHECK_LT(new_cursor, buffer_size_);
  cursor_ = new_cursor;
  UpdateCachedCategory();
  return *this;
}

// Refreshes cached_category_ with the segmentation category of Codepoint().
// When cursor_ is at or beyond buffer_size_ the previously cached value is
// left as it is and Codepoint() is not called.
void UTF16RagelIterator::UpdateCachedCategory() {
  const bool cursor_in_range = cursor_ < buffer_size_;
  if (cursor_in_range) {
    cached_category_ = EmojiSegmentationCategory(Codepoint());
  }
}

}  // namespace blink