summaryrefslogtreecommitdiff
path: root/chromium/third_party/blink/renderer/platform/text/text_break_iterator.cc
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/blink/renderer/platform/text/text_break_iterator.cc')
-rw-r--r--chromium/third_party/blink/renderer/platform/text/text_break_iterator.cc452
1 files changed, 452 insertions, 0 deletions
diff --git a/chromium/third_party/blink/renderer/platform/text/text_break_iterator.cc b/chromium/third_party/blink/renderer/platform/text/text_break_iterator.cc
new file mode 100644
index 00000000000..8b3625cfeec
--- /dev/null
+++ b/chromium/third_party/blink/renderer/platform/text/text_break_iterator.cc
@@ -0,0 +1,452 @@
+/*
+ * (C) 1999 Lars Knoll (knoll@kde.org)
+ * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010 Apple Inc. All rights
+ * reserved.
+ * Copyright (C) 2007-2009 Torch Mobile, Inc.
+ * Copyright (C) 2011 Google Inc. All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "third_party/blink/renderer/platform/text/text_break_iterator.h"
+
+#include "third_party/blink/renderer/platform/text/character.h"
+#include "third_party/blink/renderer/platform/wtf/ascii_ctype.h"
+#include "third_party/blink/renderer/platform/wtf/std_lib_extras.h"
+#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
+
+#include <unicode/uchar.h>
+#include <unicode/uvernum.h>
+
+namespace blink {
+
+// Counts the grapheme clusters in |string|.
+unsigned NumGraphemeClusters(const String& string) {
+  const unsigned length = string.length();
+  if (length == 0)
+    return 0;
+  // In Latin-1 text only CR LF forms a multi-character cluster, so without a
+  // '\r' every character is a cluster of its own.
+  if (string.Is8Bit() && !string.Contains('\r'))
+    return length;
+
+  NonSharedCharacterBreakIterator boundaries(string);
+  if (!boundaries)
+    return length;  // No usable iterator: fall back to one per character.
+
+  unsigned clusters = 0;
+  for (; boundaries.Next() != kTextBreakDone; ++clusters) {
+  }
+  return clusters;
+}
+
+// Returns the length of the grapheme cluster at |offset| (0 at/after the end).
+unsigned LengthOfGraphemeCluster(const String& string, unsigned offset) {
+  const unsigned length = string.length();
+  // Clamp first: the unsigned |length - offset| would wrap around if |offset|
+  // were past the end, and the code below would then read out of bounds.
+  const unsigned remaining = offset < length ? length - offset : 0;
+  if (remaining <= 1)
+    return remaining;
+
+  // The only Latin-1 Extended Grapheme Cluster is CRLF.
+  if (string.Is8Bit()) {
+    auto* characters = string.Characters8();
+    return 1 + (characters[offset] == '\r' && characters[offset + 1] == '\n');
+  }
+
+  NonSharedCharacterBreakIterator it(string);
+  if (!it || it.Following(offset) == kTextBreakDone)
+    return remaining;  // No iterator or no later boundary: take the rest.
+  return it.Current() - offset;
+}
+
+static const UChar kAsciiLineBreakTableFirstChar = '!';  // First character covered by kAsciiLineBreakTable.
+static const UChar kAsciiLineBreakTableLastChar = 127;  // Last character covered by kAsciiLineBreakTable (DEL).
+
+// Pack 8 flags into one byte; the first argument lands in the lowest bit.
+#define B(a, b, c, d, e, f, g, h) \
+ ((a) | ((b) << 1) | ((c) << 2) | ((d) << 3) | ((e) << 4) | ((f) << 5) | \
+ ((g) << 6) | ((h) << 7))
+
+// Line breaking table row for each digit (0-9): all zero, no table breaks.
+#define DI \
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+
+// Line breaking table row for ASCII letters (a-z A-Z): all zero as well.
+#define AL \
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+
+#define F 0xFF  // A table byte with all 8 flags set.
+
+// Line breaking table for printable ASCII characters: row = character before
+// the break, bit column = character after it. Break opportunities are:
+// - before opening punctuation such as '(', '<', '[', '{' after certain
+// characters (compatible with Firefox 3.6);
+// - after '-' and '?' (backward-compatible, and compatible with Internet
+// Explorer).
+// Please refer to <https://bugs.webkit.org/show_bug.cgi?id=37698> for line
+// breaking matrices of different browsers and the ICU standard.
+// clang-format off
+static const unsigned char kAsciiLineBreakTable[][(kAsciiLineBreakTableLastChar - kAsciiLineBreakTableFirstChar) / 8 + 1] = {
+ // ! " # $ % & ' ( ) * + , - . / 0 1-8 9 : ; < = > ? @ A-X Y Z [ \ ] ^ _ ` a-x y z { | } ~ DEL
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // !
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // "
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // #
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // $
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // %
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // &
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // '
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // (
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // )
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // *
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // +
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // ,
+ { B(0, 1, 1, 0, 1, 1, 1, 1), B(0, 1, 1, 0, 1, 0, 0, 0), 0, B(0, 0, 0, 1, 1, 1, 0, 1), F, F, F, B(1, 1, 1, 1, 0, 1, 1, 1), F, F, F, B(1, 1, 1, 1, 0, 1, 1, 1) }, // - Note: breaking before '0'-'9' is handled hard-coded in ShouldBreakAfter().
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // .
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // /
+ DI, DI, DI, DI, DI, DI, DI, DI, DI, DI, // 0-9
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // :
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // ;
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // <
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // =
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // >
+ { B(0, 0, 1, 1, 1, 1, 0, 1), B(0, 1, 1, 0, 1, 0, 0, 1), F, B(1, 0, 0, 1, 1, 1, 0, 1), F, F, F, B(1, 1, 1, 1, 0, 1, 1, 1), F, F, F, B(1, 1, 1, 1, 0, 1, 1, 0) }, // ?
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // @
+ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // A-Z
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // [
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // '\'
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // ]
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // ^
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // _
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // `
+ AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // a-z
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // {
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // |
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // }
+ { B(0, 0, 0, 0, 0, 0, 0, 1), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 1, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 1, 0, 0, 0, 0, 0) }, // ~
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0), 0, 0, 0, B(0, 0, 0, 0, 0, 0, 0, 0) }, // DEL
+};
+// clang-format on
+
+#if U_ICU_VERSION_MAJOR_NUM >= 58
+#define BA_LB_COUNT (U_LB_COUNT - 3)  // The table leaves out the last 3 line break classes.
+#else
+#define BA_LB_COUNT U_LB_COUNT
+#endif
+// Line breaking table for CSS word-break: break-all. This table differs from
+// kAsciiLineBreakTable in:
+// - Indices are Line Breaking Classes defined in UAX#14 Unicode Line Breaking
+// Algorithm: http://unicode.org/reports/tr14/#DescriptionOfProperties
+// - 1 indicates additional break opportunities. 0 indicates to fall back to
+// normal line break, not "prohibit break."
+// clang-format off
+static const unsigned char kBreakAllLineBreakClassTable[][BA_LB_COUNT / 8 + 1] = {
+ // XX AI AL B2 BA BB BK CB CL CM CR EX GL HY ID IN IS LF NS NU OP PO PR QU SA SG SP SY ZW NL WJ H2 H3 JL JT JV CP CJ HL RI
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // XX
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // AI
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // AL
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // B2
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // BA
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // BB
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // BK
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // CB
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 0, 0, 1, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // CL
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // CM
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // CR
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 0, 1, 1, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // EX
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // GL
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 1, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // HY
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // ID
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // IN
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // IS
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // LF
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // NS
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // NU
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // OP
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 0, 1, 1, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // PO
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // PR
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // QU
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // SA
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // SG
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // SP
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // SY
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // ZW
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // NL
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // WJ
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // H2
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // H3
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // JL
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // JT
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // JV
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 0, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // CP
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // CJ
+ { B(0, 1, 1, 0, 1, 0, 0, 0), B(0, 0, 0, 0, 0, 1, 0, 0), B(0, 0, 0, 1, 1, 0, 1, 0), B(1, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 1, 0) }, // HL
+ { B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0), B(0, 0, 0, 0, 0, 0, 0, 0) }, // RI
+};
+// clang-format on
+
+#undef B  // The table-building helper macros are not needed past this point.
+#undef F
+#undef DI
+#undef AL
+
+static_assert(WTF_ARRAY_LENGTH(kAsciiLineBreakTable) ==
+ kAsciiLineBreakTableLastChar - kAsciiLineBreakTableFirstChar +
+ 1,  // One row per character in the covered range, inclusive.
+ "asciiLineBreakTable should be consistent");
+static_assert(WTF_ARRAY_LENGTH(kBreakAllLineBreakClassTable) == BA_LB_COUNT,  // One row per covered line break class.
+ "breakAllLineBreakClassTable should be consistent");
+
+// Tells whether the ASCII rules allow a line break between |ch| and |next_ch|;
+// |last_ch| is the character preceding |ch|.
+static inline bool ShouldBreakAfter(UChar last_ch, UChar ch, UChar next_ch) {
+  // A '-' followed by a digit may be a minus sign, so only break there when
+  // the '-' follows an ASCII letter or digit, as in 'ABCD-1234' or '1234-5678'
+  // (which may be parts of long URLs).
+  if (ch == '-' && IsASCIIDigit(next_ch))
+    return IsASCIIAlphanumeric(last_ch);
+
+  // Anything outside the table's ASCII range is left to the Unicode algorithm,
+  // which is signalled by returning false.
+  if (ch < kAsciiLineBreakTableFirstChar || ch > kAsciiLineBreakTableLastChar ||
+      next_ch < kAsciiLineBreakTableFirstChar ||
+      next_ch > kAsciiLineBreakTableLastChar)
+    return false;
+
+  // Both are ASCII: use the lookup table, for speed and for compatibility with
+  // other browsers (see the comments on kAsciiLineBreakTable).
+  const int row = ch - kAsciiLineBreakTableFirstChar;
+  const int column = next_ch - kAsciiLineBreakTableFirstChar;
+  return kAsciiLineBreakTable[row][column / 8] & (1 << (column % 8));
+}
+
+static inline ULineBreak LineBreakPropertyValue(UChar last_ch, UChar ch) {
+  if (ch == '+')  // IE tailors '+' to AL-like class when break-all is enabled.
+    return U_LB_ALPHABETIC;
+  // Recombine a surrogate pair so the class is looked up per code point.
+  const bool is_pair = U16_IS_LEAD(last_ch) && U16_IS_TRAIL(ch);
+  const UChar32 c32 = is_pair ? U16_GET_SUPPLEMENTARY(last_ch, ch) : ch;
+  return static_cast<ULineBreak>(u_getIntPropertyValue(c32, UCHAR_LINE_BREAK));
+}
+
+static inline bool ShouldBreakAfterBreakAll(ULineBreak last_line_break,
+                                            ULineBreak line_break) {
+  const bool in_table = last_line_break >= 0 && last_line_break < BA_LB_COUNT &&
+                        line_break >= 0 && line_break < BA_LB_COUNT;
+  if (!in_table)
+    return false;  // Class not covered: no additional break-all opportunity.
+  // Row = class before the boundary; bit = class after it.
+  const unsigned char* row = kBreakAllLineBreakClassTable[last_line_break];
+  return (row[line_break / 8] >> (line_break % 8)) & 1;
+}
+
+// Computes if 'word-break:keep-all' should prevent a break before |next_ch|.
+// https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all
+// https://github.com/w3c/csswg-drafts/issues/1619
+// The spec is terse, so this keeps letters/numbers (L/N) together, skipping a
+// mark (M) in |ch|; complex-context (e.g. Southeast Asian) text is exempt.
+static inline bool ShouldKeepAfterKeepAll(UChar last_ch,
+                                          UChar ch,
+                                          UChar next_ch) {
+  const auto kept = [](UChar c) {
+    return (U_MASK(u_charType(c)) & (U_GC_L_MASK | U_GC_N_MASK)) &&
+           !WTF::Unicode::HasLineBreakingPropertyComplexContext(c);
+  };
+  const bool ch_is_mark = U_MASK(u_charType(ch)) & U_GC_M_MASK;
+  return kept(ch_is_mark ? last_ch : ch) && kept(next_ch);
+}
+
+inline bool NeedsLineBreakIterator(UChar ch) {  // Beyond the ASCII table, except kNoBreakSpaceCharacter.
+  return !(ch <= kAsciiLineBreakTableLastChar || ch == kNoBreakSpaceCharacter);
+}
+
+template <typename CharacterType,
+ LineBreakType lineBreakType,
+ BreakSpaceType break_space>
+inline int LazyLineBreakIterator::NextBreakablePosition(
+ int pos,
+ const CharacterType* str) const {  // Scans |str| forward from |pos|; returns the first break offset found, else the string length.
+ int len = static_cast<int>(string_.length());
+ int next_break = -1;  // Last offset obtained from the break iterator; reused while it is still >= i.
+
+ UChar last_last_ch = pos > 1 ? str[pos - 2] : SecondToLastCharacter();
+ UChar last_ch = pos > 0 ? str[pos - 1] : LastCharacter();
+ bool is_last_space = IsBreakableSpace(last_ch);
+ ULineBreak last_line_break;  // Only assigned and read when lineBreakType is kBreakAll.
+ if (lineBreakType == LineBreakType::kBreakAll)
+ last_line_break = LineBreakPropertyValue(last_last_ch, last_ch);
+ unsigned prior_context_length = PriorContextLength();
+ CharacterType ch;
+ bool is_space;
+ for (int i = pos; i < len;
+ i++, last_last_ch = last_ch, last_ch = ch, is_last_space = is_space) {
+ ch = str[i];
+
+ is_space = IsBreakableSpace(ch);
+ switch (break_space) {
+ case BreakSpaceType::kBeforeEverySpace:
+ if (is_space)
+ return i;
+ break;
+ case BreakSpaceType::kBeforeSpaceRun:
+ // Theoretically, preserved newline characters are different from space
+ // and tab characters. The difference is not implemented because the
+ // LayoutNG line breaker handles preserved newline characters by itself.
+ if (is_space) {
+ if (!is_last_space)
+ return i;
+ continue;  // Inside a run of spaces: no break between them.
+ }
+ break;
+ }
+
+ if (ShouldBreakAfter(last_last_ch, last_ch, ch))
+ return i;
+
+ if (lineBreakType == LineBreakType::kBreakAll && !U16_IS_LEAD(ch)) {  // A lead surrogate is classified with its trail on the next iteration.
+ ULineBreak line_break = LineBreakPropertyValue(last_ch, ch);
+ if (ShouldBreakAfterBreakAll(last_line_break, line_break))
+ return i > pos && U16_IS_TRAIL(ch) ? i - 1 : i;  // At a trail surrogate, report the offset before it (presumably its lead).
+ if (line_break != U_LB_COMBINING_MARK)  // Combining marks leave the tracked class unchanged.
+ last_line_break = line_break;
+ }
+
+ if (lineBreakType == LineBreakType::kKeepAll &&
+ ShouldKeepAfterKeepAll(last_last_ch, last_ch, ch)) {
+ // word-break:keep-all prevents breaks between East Asian ideographic.
+ continue;
+ }
+
+ if (NeedsLineBreakIterator(ch) || NeedsLineBreakIterator(last_ch)) {
+ if (next_break < i) {
+ // Don't break if positioned at start of primary context and there is no
+ // prior context.
+ if (i || prior_context_length) {
+ TextBreakIterator* break_iterator = Get(prior_context_length);
+ if (break_iterator) {
+ next_break =
+ break_iterator->following(i - 1 + prior_context_length);  // Iterator offsets are shifted by the prior context length.
+ if (next_break >= 0) {
+ next_break -= prior_context_length;  // Convert back to an offset into |str|.
+ }
+ }
+ }
+ }
+ if (i == next_break && !is_last_space)
+ return i;
+ }
+ }
+
+ return len;
+}
+
+template <typename CharacterType, LineBreakType lineBreakType>
+inline int LazyLineBreakIterator::NextBreakablePosition(
+    int pos,
+    const CharacterType* str) const {
+  // Lift the runtime |break_space_| setting into a template argument so the
+  // scanning loop is instantiated separately for each space-handling mode.
+  if (break_space_ == BreakSpaceType::kBeforeSpaceRun) {
+    return NextBreakablePosition<CharacterType, lineBreakType,
+                                 BreakSpaceType::kBeforeSpaceRun>(pos, str);
+  }
+  // Any value other than the two known ones is a bug; flag it and fall back.
+  if (break_space_ != BreakSpaceType::kBeforeEverySpace)
+    NOTREACHED();
+  return NextBreakablePosition<CharacterType, lineBreakType,
+                               BreakSpaceType::kBeforeEverySpace>(pos, str);
+}
+
+template <LineBreakType lineBreakType>
+inline int LazyLineBreakIterator::NextBreakablePosition(int pos) const {
+  if (UNLIKELY(string_.IsNull()))
+    return 0;  // A null string has nothing to scan.
+  // Pick the scanning loop that matches the string's 8-bit or 16-bit storage.
+  return string_.Is8Bit()
+             ? NextBreakablePosition<LChar, lineBreakType>(
+                   pos, string_.Characters8())
+             : NextBreakablePosition<UChar, lineBreakType>(
+                   pos, string_.Characters16());
+}
+
+int LazyLineBreakIterator::NextBreakablePositionBreakCharacter(int pos) const {
+  NonSharedCharacterBreakIterator clusters(string_);
+  const int boundary = clusters.Following(std::max(pos - 1, 0));
+  return boundary == kTextBreakDone ? string_.length() : boundary;  // No boundary left: the string end.
+}
+
+int LazyLineBreakIterator::NextBreakablePosition(
+    int pos,
+    LineBreakType line_break_type) const {
+  // Turn the runtime break type into a template argument so that each mode
+  // runs its own specialized scanning loop.
+  if (line_break_type == LineBreakType::kBreakCharacter)
+    return NextBreakablePositionBreakCharacter(pos);
+  if (line_break_type == LineBreakType::kBreakAll)
+    return NextBreakablePosition<LineBreakType::kBreakAll>(pos);
+  if (line_break_type == LineBreakType::kKeepAll)
+    return NextBreakablePosition<LineBreakType::kKeepAll>(pos);
+  // kNormal; any other value is a bug and is treated as kNormal after flagging.
+  if (line_break_type != LineBreakType::kNormal)
+    NOTREACHED();
+  return NextBreakablePosition<LineBreakType::kNormal>(pos);
+}
+
+unsigned LazyLineBreakIterator::NextBreakOpportunity(unsigned offset) const {
+  int break_offset = -1;  // Sentinel; expected to be overwritten by IsBreakable().
+  IsBreakable(offset, break_offset);
+  DCHECK_GE(break_offset, 0);  // The sentinel must not survive the call.
+  return static_cast<unsigned>(break_offset);
+}
+
+unsigned LazyLineBreakIterator::PreviousBreakOpportunity(unsigned offset,
+                                                         unsigned min) const {
+  // Walk backwards from |offset| (clamped to the string length) towards |min|.
+  unsigned candidate = std::min(offset, string_.length());
+  while (candidate > min && !IsBreakable(candidate))
+    --candidate;
+  // Without a breakable offset above |min|, |min| itself is the answer.
+  return candidate > min ? candidate : min;
+}
+
+std::ostream& operator<<(std::ostream& ostream, LineBreakType line_break_type) {
+  // Known enumerators print as their name without the 'k' prefix.
+  if (line_break_type == LineBreakType::kNormal)
+    return ostream << "Normal";
+  if (line_break_type == LineBreakType::kBreakAll)
+    return ostream << "BreakAll";
+  if (line_break_type == LineBreakType::kBreakCharacter)
+    return ostream << "BreakCharacter";
+  if (line_break_type == LineBreakType::kKeepAll)
+    return ostream << "KeepAll";
+  // Not a known enumerator: flag the bug and print the raw value.
+  NOTREACHED();
+  return ostream << "LineBreakType::" << static_cast<int>(line_break_type);
+}
+
+std::ostream& operator<<(std::ostream& ostream, BreakSpaceType break_space) {
+  // Known enumerators print as their identifier, 'k' prefix included.
+  if (break_space == BreakSpaceType::kBeforeEverySpace)
+    return ostream << "kBeforeEverySpace";
+  if (break_space == BreakSpaceType::kBeforeSpaceRun)
+    return ostream << "kBeforeSpaceRun";
+  // Not a known enumerator: flag the bug and print the raw value.
+  NOTREACHED();
+  return ostream << "BreakSpaceType::" << static_cast<int>(break_space);
+}
+
+} // namespace blink