summaryrefslogtreecommitdiff
path: root/chromium/third_party/blink/renderer/core/editing/state_machines/state_machine_util.cc
blob: 13d35c58583290aae920da8f33955703a47837f3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/blink/renderer/core/editing/state_machines/state_machine_util.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <iterator>

#include "base/stl_util.h"
#include "third_party/blink/renderer/platform/text/character.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"

namespace blink {

namespace {

// The list of code points which have the Indic_Syllabic_Category=Virama
// property. Lookups use binary search, so the entries must stay in strictly
// ascending order; the static_assert below enforces this at compile time.
// See http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory-9.0.0d2.txt
constexpr uint32_t kIndicSyllabicCategoryViramaList[] = {
    // Do not include U+0BCD TAMIL SIGN VIRAMA as Tamil works differently from
    // other Indic languages. See crbug.com/693687.
    0x094D,  0x09CD,  0x0A4D,  0x0ACD,  0x0B4D,  0x0C4D,  0x0CCD,  0x0D4D,
    0x0DCA,  0x1B44,  0xA8C4,  0xA9C0,  0x11046, 0x110B9, 0x111C0, 0x11235,
    0x1134D, 0x11442, 0x114C2, 0x115BF, 0x1163F, 0x116B6, 0x11C3F,
};

// Returns true if every element of |values| is strictly greater than the one
// before it. Usable in constant expressions so it can back a static_assert.
template <size_t N>
constexpr bool IsStrictlyAscending(const uint32_t (&values)[N]) {
  for (size_t i = 1; i < N; ++i) {
    if (values[i - 1] >= values[i])
      return false;
  }
  return true;
}

static_assert(IsStrictlyAscending(kIndicSyllabicCategoryViramaList),
              "kIndicSyllabicCategoryViramaList must be sorted because "
              "IsIndicSyllabicCategoryVirama() uses binary search.");

// Returns true if the code point has Indic_Syllabic_Category=Virama property,
// i.e. if |code_point| is listed in kIndicSyllabicCategoryViramaList.
// See http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory-9.0.0d2.txt
bool IsIndicSyllabicCategoryVirama(uint32_t code_point) {
  // std::begin/std::end derive the bounds from the array type itself, so no
  // separately computed (and possibly narrowed) length is needed.
  return std::binary_search(std::begin(kIndicSyllabicCategoryViramaList),
                            std::end(kIndicSyllabicCategoryViramaList),
                            code_point);
}

}  // namespace

// Returns true if a grapheme cluster boundary lies between |prev_code_point|
// and |next_code_point|, and false if the two must stay in one cluster.
//
// The rules are those of Unicode Standard Annex #29 on Unicode Text
// Segmentation (http://www.unicode.org/reports/tr29/), evaluated in order,
// plus the deliberate exceptions called out inline. Start/end of text
// (GB1/GB2) and regional indicator pairs (GB8a/GB12) are the caller's job.
bool IsGraphemeBreak(UChar32 prev_code_point, UChar32 next_code_point) {
  const int before =
      u_getIntPropertyValue(prev_code_point, UCHAR_GRAPHEME_CLUSTER_BREAK);
  const int after =
      u_getIntPropertyValue(next_code_point, UCHAR_GRAPHEME_CLUSTER_BREAK);

  // Matches the (Control | CR | LF) class shared by GB4 and GB5.
  const auto is_control_or_newline = [](int gcb) {
    return gcb == U_GCB_CONTROL || gcb == U_GCB_CR || gcb == U_GCB_LF;
  };

  // GB1 (sot ÷) and GB2 (÷ eot) are not evaluated here; the caller handles
  // them.

  // GB3: CR x LF. Must run before GB4/GB5, which would otherwise split CRLF.
  if (before == U_GCB_CR && after == U_GCB_LF)
    return false;

  // GB4: (Control | CR | LF) ÷
  // GB5: ÷ (Control | CR | LF)
  if (is_control_or_newline(before) || is_control_or_newline(after))
    return true;

  // Hangul syllable sequences, keyed on the preceding code point's class.
  switch (before) {
    case U_GCB_L:
      // GB6: L x (L | V | LV | LVT)
      if (after == U_GCB_L || after == U_GCB_V || after == U_GCB_LV ||
          after == U_GCB_LVT) {
        return false;
      }
      break;
    case U_GCB_LV:
    case U_GCB_V:
      // GB7: (LV | V) x (V | T)
      if (after == U_GCB_V || after == U_GCB_T)
        return false;
      break;
    case U_GCB_LVT:
    case U_GCB_T:
      // GB8: (LVT | T) x T
      if (after == U_GCB_T)
        return false;
      break;
    default:
      break;
  }

  // GB8a:
  //
  // sot   (RI RI)* RI x RI
  // [^RI] (RI RI)* RI x RI
  //                RI ÷ RI
  //
  // Two adjacent code points are not enough to decide this rule, so reaching
  // here with a pair of regional indicators is a caller error.
  if (Character::IsRegionalIndicator(prev_code_point) &&
      Character::IsRegionalIndicator(next_code_point)) {
    NOTREACHED() << "Do not use this function for regional indicators.";
  }

  // Intentional divergence from UAX#29: Myanmar IMEs use the zwnj character as
  // the base character during a composition so that the actively composed
  // text is not merged into the previous character. This has to be tested
  // ahead of GB9. See crbug.com/1027695 for more details.
  if (next_code_point == kZeroWidthNonJoinerCharacter)
    return true;

  // GB9:  x (Extend | ZWJ)
  // GB9a: x SpacingMark
  const bool next_attaches_to_prev =
      after == U_GCB_EXTEND || next_code_point == kZeroWidthJoinerCharacter ||
      after == U_GCB_SPACING_MARK;
  if (next_attaches_to_prev)
    return false;

  // GB9b: Prepend x
  if (before == U_GCB_PREPEND)
    return false;

  // Cluster Indic syllables together: a virama followed by a code point whose
  // general category is Other Letter stays in the same cluster.
  if (IsIndicSyllabicCategoryVirama(prev_code_point)) {
    const int next_category =
        u_getIntPropertyValue(next_code_point, UCHAR_GENERAL_CATEGORY);
    if (next_category == U_OTHER_LETTER)
      return false;
  }

  // GB12 for RI (Regional Indicator) is handled elsewhere because it requires
  // counting the number of consecutive RIs.

  // GB11: ZWJ x Emoji. Anything else falls through to GB999 (any ÷ any).
  return !(prev_code_point == kZeroWidthJoinerCharacter &&
           Character::IsEmoji(next_code_point));
}

}  // namespace blink