summary refs log tree commit diff
diff options
context:
space:
mode:
author Christian Persch <chpe@src.gnome.org> 2022-03-01 21:52:22 +0100
committer Christian Persch <chpe@src.gnome.org> 2022-03-01 21:52:22 +0100
commit 8fe5e1541c79cf12bc9009bc6f61a722cd773795 (patch)
tree 26d59e6994f414dee12bbf0dc28aca88b844c756
parent 7a0a7f3c5dd1b9c798bfa0f1f0a13661167f54a3 (diff)
download vte-wip/regex-builtins.tar.gz
lib: Add builtin regexes [wip/regex-builtins]
Add builtin regexes to recognise URLs, copied from gnome-terminal. https://gitlab.gnome.org/GNOME/vte/issues/114
-rw-r--r-- doc/reference/vte-sections.txt.in 5
-rw-r--r-- meson.build 4
-rw-r--r-- src/app/app.cc 41
-rw-r--r-- src/fwd.hh 1
-rw-r--r-- src/meson.build 29
-rw-r--r-- src/regex-builtins-patterns.hh 156
-rw-r--r-- src/regex-builtins.cc 103
-rw-r--r-- src/regex-builtins.hh 80
-rw-r--r-- src/regex-test.cc 668
-rw-r--r-- src/regex.cc 11
-rw-r--r-- src/regex.hh 1
-rw-r--r-- src/vte.cc 39
-rw-r--r-- src/vte/vteenums.h 14
-rw-r--r-- src/vte/vtemacros.h 6
-rw-r--r-- src/vte/vteterminal.h 5
-rw-r--r-- src/vtedefines.hh 3
-rw-r--r-- src/vtegtk.cc 74
-rw-r--r-- src/vteinternal.hh 6
-rw-r--r-- src/vteregex.cc 6
19 files changed, 1229 insertions, 23 deletions
diff --git a/doc/reference/vte-sections.txt.in b/doc/reference/vte-sections.txt.in
index 4c9680f0..2bc53bf5 100644
--- a/doc/reference/vte-sections.txt.in
+++ b/doc/reference/vte-sections.txt.in
@@ -7,6 +7,7 @@ VteCursorBlinkMode
VteCursorShape
VteEraseBinding
VteTextBlinkMode
+VteBuiltinMatchTag
VteFormat
VteWriteFlags
VteSelectionFunc
@@ -79,8 +80,10 @@ vte_terminal_get_cursor_position
vte_terminal_hyperlink_check_event
#endif
vte_terminal_match_add_regex
+vte_terminal_match_add_builtins
vte_terminal_match_remove
vte_terminal_match_remove_all
+vte_terminal_match_remove_builtins
vte_terminal_match_check
#if VTE_GTK == 3
vte_terminal_match_check_event
@@ -148,6 +151,8 @@ VTE_TYPE_ERASE_BINDING
vte_erase_binding_get_type
VTE_TYPE_TEXT_BLINK_MODE
vte_text_blink_mode_get_type
+VTE_TYPE_BUILTIN_MATCH_TAGS
+vte_builtin_match_tags_get_type
VTE_TYPE_FORMAT
vte_format_get_type
VTE_TYPE_WRITE_FLAGS
diff --git a/meson.build b/meson.build
index abe7e8fd..da4d7ab0 100644
--- a/meson.build
+++ b/meson.build
@@ -38,9 +38,9 @@ clangxx_req_version = '11.0'
gtk3_req_version = '3.20.0'
gtk3_min_req_version = '3.18'
-gtk3_max_allowed_version = '3.20'
+gtk3_max_allowed_version = '3.24'
-gtk4_req_version = '4.0.1'
+gtk4_req_version = '4.0.0'
gtk4_min_req_version = '4.0'
gtk4_max_allowed_version = '4.0'
diff --git a/src/app/app.cc b/src/app/app.cc
index 8b1d47ea..e9cd488a 100644
--- a/src/app/app.cc
+++ b/src/app/app.cc
@@ -1452,8 +1452,7 @@ struct _VteappWindowClass {
static GType vteapp_window_get_type(void);
static char const* const builtin_dingus[] = {
- "(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?",
- "(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\) ,\\\"]",
+ "(foo|bar|baz)+",
nullptr,
};
@@ -1941,6 +1940,29 @@ window_action_copy_match_cb(GSimpleAction* action,
}
static void
+window_action_open_uri_cb(GSimpleAction* action,
+ GVariant* parameter,
+ void* data)
+{
+ VteappWindow* window = VTEAPP_WINDOW(data);
+ auto len = size_t{};
+ auto str = g_variant_get_string(parameter, &len);
+ GError* err{nullptr};
+
+#if GTK_CHECK_VERSION(3, 22, 0)
+ if (!gtk_show_uri_on_window(GTK_WINDOW(window),
+#else
+ if (!gtk_show_uri(gtk_widget_get_screen(GTK_WIDGET(window)),
+#endif
+ str,
+ gtk_get_current_event_time(),
+ &err)) {
+ verbose_printerr("Opening URI \"%s\" failed: %s\n", str, err->message);
+ g_error_free(err);
+ }
+}
+
+static void
window_action_paste_cb(GSimpleAction* action,
GVariant* parameter,
void* data)
@@ -2027,13 +2049,21 @@ vteapp_window_show_context_menu(VteappWindow* window,
g_menu_append_item(menu, item.get());
}
- auto match = vte::glib::take_string(vte_terminal_match_check_event(window->terminal, event, nullptr));
+ auto tag = -1;
+ auto match = vte::glib::take_string(vte_terminal_match_check_event(window->terminal, event, &tag));
if (match) {
verbose_print("Match: %s\n", match.get());
auto target = g_variant_new_string(match.get()); /* floating */
auto item = vte::glib::take_ref(g_menu_item_new("Copy _Match", nullptr));
g_menu_item_set_action_and_target_value(item.get(), "win.copy-match", target);
g_menu_append_item(menu, item.get());
+
+ }
+ if (match && tag == VTE_BUILTIN_MATCH_TAG_URI) {
+ auto target = g_variant_new_string(match.get()); /* floating */
+ auto item = vte::glib::take_ref(g_menu_item_new("_Open URI", nullptr));
+ g_menu_item_set_action_and_target_value(item.get(), "win.open-uri", target);
+ g_menu_append_item(menu, item.get());
}
/* Test extra match API */
@@ -2464,6 +2494,7 @@ vteapp_window_constructed(GObject *object)
GActionEntry const entries[] = {
{ "copy", window_action_copy_cb, "s", nullptr, nullptr },
{ "copy-match", window_action_copy_match_cb, "s", nullptr, nullptr },
+ { "open-uri", window_action_open_uri_cb, "s", nullptr, nullptr },
{ "paste", window_action_paste_cb, nullptr, nullptr, nullptr },
{ "reset", window_action_reset_cb, "b", nullptr, nullptr },
{ "find", window_action_find_cb, nullptr, nullptr, nullptr },
@@ -2614,8 +2645,10 @@ vteapp_window_constructed(GObject *object)
gtk_widget_set_opacity (GTK_WIDGET (window), options.get_alpha());
/* Dingus */
- if (!options.no_builtin_dingus)
+ if (!options.no_builtin_dingus) {
vteapp_window_add_dingus(window, builtin_dingus);
+ vte_terminal_match_add_builtins(window->terminal);
+ }
if (options.dingus != nullptr)
vteapp_window_add_dingus(window, options.dingus);
diff --git a/src/fwd.hh b/src/fwd.hh
index 58e2208b..1367623c 100644
--- a/src/fwd.hh
+++ b/src/fwd.hh
@@ -22,6 +22,7 @@ namespace vte {
namespace base {
class Pty;
+class RegexBuiltins;
} // namespace base
diff --git a/src/meson.build b/src/meson.build
index 89f52985..cbef4dc9 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -130,7 +130,12 @@ refptr_sources = files(
regex_sources = files(
'regex.cc',
- 'regex.hh'
+ 'regex.hh',
+ 'regex-builtins.cc',
+ 'regex-builtins.hh',
+ 'regex-builtins-patterns.hh',
+ 'vteregex.cc',
+ 'vteregexinternal.hh',
)
sixel_parser_sources = files(
@@ -183,6 +188,7 @@ libvte_common_sources = config_sources + debug_sources + glib_glue_sources + gtk
'drawing-cairo.hh',
'fonts-pangocairo.cc',
'fonts-pangocairo.hh',
+ 'fwd.hh',
'gobject-glue.hh',
'keymap.cc',
'keymap.h',
@@ -205,8 +211,6 @@ libvte_common_sources = config_sources + debug_sources + glib_glue_sources + gtk
'vtegtk.cc',
'vtegtk.hh',
'vteinternal.hh',
- 'vteregex.cc',
- 'vteregexinternal.hh',
'vterowdata.cc',
'vterowdata.hh',
'vteseq.cc',
@@ -587,6 +591,24 @@ test_refptr = executable(
install: false,
)
+test_regex_sources = regex_sources + glib_glue_sources + files(
+ 'regex-test.cc',
+)
+
+test_regex = executable(
+ 'test-regex',
+ sources: test_regex_sources,
+ dependencies: [glib_dep, gobject_dep, pcre2_dep,],
+ cpp_args: ['-DVTE_COMPILATION',],
+ include_directories: top_inc,
+ install: false,
+)
+
+test_tabstops_sources = files(
+ 'tabstops-test.cc',
+ 'tabstops.hh'
+)
+
if get_option('sixel')
fuzz_sixel_sources = config_sources + files(
'sixel-fuzzer.cc',
@@ -685,6 +707,7 @@ test_units = [
['pastify', test_pastify],
['reaper', test_reaper],
['refptr', test_refptr],
+ ['regex', test_regex],
['stream', test_stream],
['tabstops', test_tabstops],
['utf8', test_utf8],
diff --git a/src/regex-builtins-patterns.hh b/src/regex-builtins-patterns.hh
new file mode 100644
index 00000000..3df945b8
--- /dev/null
+++ b/src/regex-builtins-patterns.hh
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ *
+ * This library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Mini style-guide:
+ *
+ * #define'd fragments should preferably have an outermost group, for the
+ * exact same reason as why usually in C/C++ #define's the values are enclosed
+ * in parentheses: that is, so that you don't get surprised when you use the
+ * macro and append a quantifier.
+ *
+ * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use
+ * as (?&NAME), so that the regex string and the compiled regex object are
+ * smaller.
+ *
+ * Build small blocks, comment and unittest them heavily.
+ *
+ * Use free-spacing mode for improved readability. The hardest to read is
+ * which additional characters belong to a "(?" prefix. To improve
+ * readability, place a space after this, and for symmetry, before the closing
+ * parenthesis. Also place a space around "|" characters. No space before
+ * quantifiers. Try to be consistent with the existing style (yes I know the
+ * existing style is not consistent either, but please do your best).
+ *
+ * See http://www.rexegg.com/regex-disambiguation.html for all the "(?"
+ * syntaxes.
+ */
+
+#pragma once
+
+/* Lookbehind to see if there's a preceding apostrophe.
+ * Unlike the other *_DEF macros which define regex subroutines,
+ * this one is a named capture that defines APOS_START to either
+ * an apostrophe or the empty string, depending on the character
+ * preceding this APOS_START_DEF construct.
+ */
+#define APOS_START_DEF "(?<APOS_START>(?<='))?"
+
+#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )"
+
+#define USERCHARS "-+.[:alnum:]"
+/* Nonempty username, e.g. "john.smith" */
+#define USER "[" USERCHARS "]+"
+
+#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]"
+/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", "" */
+#define PASS "(?x: :" PASSCHARS_CLASS "* )?"
+
+/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */
+#define USERPASS "(?:" USER PASS "@)?"
+
+/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256".
+ The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */
+#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] ) )))"
+
+/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */
+#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))"
+
+/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4".
+ * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4".
+ * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting.
+ * TODO: more strict check (right number of colons, etc.)
+ * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */
+
+/* S6: IPv6 segment, S6C: IPv6 segment followed by a colon, CS6: colon followed by an IPv6 segment */
+#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))"
+
+/* No :: shorthand */
+#define IPV6_FULL "(?x: (?&S6C){7} (?&S6) )"
+/* Begins with :: */
+#define IPV6_LEFT "(?x: : (?&CS6){1,7} )"
+/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */
+#define IPV6_MID "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )"
+/* Ends with :: */
+#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )"
+/* Is "::" and nothing more */
+#define IPV6_NULL "(?x: :: )"
+
+/* The same ones for IPv4-embedded notation, without the actual IPv4 part */
+#define IPV6V4_FULL "(?x: (?&S6C){6} )"
+#define IPV6V4_LEFT "(?x: :: (?&S6C){0,5} )" /* includes "::<ipv4>" */
+#define IPV6V4_MID "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :"
+#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )"
+
+/* IPV6: An IPv6 address (possibly with an embedded IPv4).
+ * This macro defines both IPV4 and IPV6, since the latter one requires the former. */
+#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))"
+
+/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII
+ * then any graphical Unicode character.
+ * A segment can consist entirely of numbers.
+ * (Note: PCRE doesn't support character class subtraction/intersection.) */
+#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )"
+
+/* A hostname of at least 1 component. The last component cannot be entirely numbers.
+ * E.g. "foo", "example.com", "1234.com", but not "foo.123" */
+#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] ) " HOSTNAMESEGMENTCHARS_CLASS "+ )"
+
+/* A hostname of at least 2 components. The last component cannot be entirely numbers.
+ * E.g. "example.com", "1234.com", but not "1234.56" */
+#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )"
+
+/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */
+#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )"
+
+/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]".
+ * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */
+#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )"
+
+/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890",
+ and in turn we don't eventually match "http://host:6789" in "http://host:67890". */
+#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )"
+
+/* Optional colon-prefixed port, e.g. ":1080", "" */
+#define PORT "(?x: \\:" N_1_65535 " )?"
+
+/* Omit the parentheses, see below */
+#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]"
+/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */
+#define PATHTERM_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]"
+#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]"
+
+/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 763980. */
+#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))"
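+/* Same as above, but the last character (if it exists and is not a closing parenthesis or bracket) must be from PATHTERM_CLASS, or from PATHTERM_NOAPOS_CLASS when APOS_START matched (i.e. an apostrophe precedes the URL). */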
+#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))"
+
+#define URLPATH "(?x: [/?#](?&PATH) )?"
+#define VOIP_PATH "(?x: [;?](?&PATH) )?"
+
+/* Now let's put these fragments together */
+
+#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF
+
+#define REGEX_URL_AS_IS DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH
+/* TODO: also support file:/etc/passwd */
+#define REGEX_URL_FILE DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)"
+/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */
+#define REGEX_URL_HTTP DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH
+#define REGEX_URL_VOIP DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH
+#define REGEX_EMAIL DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST
+#define REGEX_NEWS_MAN "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+"
diff --git a/src/regex-builtins.cc b/src/regex-builtins.cc
new file mode 100644
index 00000000..f748ca1a
--- /dev/null
+++ b/src/regex-builtins.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+
+#include "regex.hh"
+#include "regex-builtins.hh"
+#include "regex-builtins-patterns.hh"
+
+#include "pcre2-glue.hh"
+
+namespace vte::base {
+
+RegexBuiltins::RegexBuiltins()
+{
+ m_builtins.reserve(8);
+
+ compile_builtin(REGEX_URL_AS_IS, InternalBuiltinsTag::eURL);
+ compile_builtin(REGEX_URL_HTTP, InternalBuiltinsTag::eHTTP);
+ compile_builtin(REGEX_URL_FILE, InternalBuiltinsTag::eFILE);
+ compile_builtin(REGEX_URL_VOIP, InternalBuiltinsTag::eVOIP);
+ compile_builtin(REGEX_EMAIL, InternalBuiltinsTag::eEMAIL);
+ compile_builtin(REGEX_NEWS_MAN, InternalBuiltinsTag::eNEWS_MAN);
+}
+
+void
+RegexBuiltins::compile_builtin(std::string_view const& pattern,
+ InternalBuiltinsTag tag) noexcept
+{
+ GError* error{nullptr};
+ auto regex = Regex::compile(Regex::Purpose::eMatch,
+ pattern,
+ PCRE2_UTF | PCRE2_UCP | PCRE2_NO_UTF_CHECK | PCRE2_MULTILINE,
+ &error);
+ if (error) {
+ g_printerr("Failed to compile builtin regex %d: %s\n", int(tag), error->message);
+ g_error_free(error);
+ return;
+ }
+
+ regex->jit(PCRE2_JIT_COMPLETE, &error);
+ if (error) {
+ g_printerr("Failed to complete JIT compile builtin regex %d: %s\n", int(tag), error->message);
+ g_clear_error(&error);
+ }
+
+ regex->jit(PCRE2_JIT_PARTIAL_SOFT, &error);
+ if (error) {
+ g_printerr("Failed to partial-soft JIT compile builtin regex %d: %s\n", int(tag), error->message);
+ g_clear_error(&error);
+ }
+
+ m_builtins.emplace_back(take_ref(regex), int(tag));
+}
+
+int
+RegexBuiltins::transform_match(char*& match,
+ int tag) const noexcept
+{
+ switch (InternalBuiltinsTag(tag)) {
+ case InternalBuiltinsTag::eURL:
+ case InternalBuiltinsTag::eFILE:
+ case InternalBuiltinsTag::eNEWS_MAN:
+ case InternalBuiltinsTag::eVOIP:
+ /* No transformation */
+ return int(BuiltinsTag::eURI);
+
+ case InternalBuiltinsTag::eHTTP: {
+ auto v = match;
+ match = g_strdup_printf("http://%s", match);
+ g_free(v);
+ return int(BuiltinsTag::eURI);
+ }
+
+ case InternalBuiltinsTag::eEMAIL:
+ if (g_ascii_strncasecmp ("mailto:", match, 7) != 0) {
+ auto v = match;
+ match = g_strdup_printf ("mailto:%s", match);
+ g_free(v);
+ }
+ return int(BuiltinsTag::eURI);
+ }
+
+ return -1;
+}
+
+} // namespace vte::base
diff --git a/src/regex-builtins.hh b/src/regex-builtins.hh
new file mode 100644
index 00000000..9d201949
--- /dev/null
+++ b/src/regex-builtins.hh
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2019 Christian Persch
+ *
+ * This library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "regex.hh"
+#include "refptr.hh"
+
+namespace vte {
+
+namespace base {
+
+class RegexBuiltins {
+private:
+ static inline std::weak_ptr<RegexBuiltins> s_weak_ptr{};
+
+ std::vector<std::pair<RefPtr<Regex>, int>> m_builtins{};
+
+ enum class InternalBuiltinsTag : int {
+ eURL = -2,
+ eHTTP = -3,
+ eFILE = -4,
+ eVOIP = -5,
+ eEMAIL = -6,
+ eNEWS_MAN = -7
+ };
+
+ void compile_builtin(std::string_view const& pattern,
+ InternalBuiltinsTag tag) noexcept;
+
+public:
+ // these must have the same values as the public VteBuiltinMatchTag
+ enum class BuiltinsTag : int {
+ eURI = -2
+ };
+
+ RegexBuiltins();
+ ~RegexBuiltins() { }
+ RegexBuiltins(RegexBuiltins const&) = delete;
+ RegexBuiltins(RegexBuiltins&&) = delete;
+
+ RegexBuiltins& operator= (RegexBuiltins const&) = delete;
+ RegexBuiltins& operator= (RegexBuiltins&&) = delete;
+
+ inline constexpr auto const& builtins() const noexcept { return m_builtins; }
+
+ int transform_match(char*& match,
+ int tag) const noexcept;
+
+ static std::shared_ptr<RegexBuiltins> get()
+ {
+ auto inst = s_weak_ptr.lock();
+ if (!inst)
+ s_weak_ptr = inst = std::make_shared<RegexBuiltins>();
+ return inst;
+ }
+};
+
+} // namespace base
+
+} // namespace vte
diff --git a/src/regex-test.cc b/src/regex-test.cc
new file mode 100644
index 00000000..25195072
--- /dev/null
+++ b/src/regex-test.cc
@@ -0,0 +1,668 @@
+/*
+ * Copyright © 2015 Egmont Koblinger
+ * Copyright © 2019, 2020 Christian Persch
+ *
+ * This library is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <glib.h>
+#include <locale.h>
+
+#include <cstdint>
+#include <cstdlib>
+
+#include <string>
+
+#include "glib-glue.hh"
+#include "regex.hh"
+#include "regex-builtins-patterns.hh"
+
+using namespace std::literals;
+
+auto pcre2test = bool{false};
+auto pcre2_atleast_10_35 = bool{false};
+FILE* pcre2test_in{nullptr};
+FILE* pcre2test_out{nullptr};
+
+/* Shorthand for expecting the pattern to match the entire input string */
+#define ENTIRE ((char *) 1)
+
+static pcre2_match_context_8*
+create_match_context()
+{
+ pcre2_match_context_8 *match_context;
+
+ match_context = pcre2_match_context_create_8(nullptr /* general context */);
+ pcre2_set_match_limit_8(match_context, 65536); /* should be plenty */
+ pcre2_set_recursion_limit_8(match_context, 64); /* should be plenty */
+
+ return match_context;
+}
+
+static char*
+get_match(decltype(&pcre2_match_8) match_fn,
+ vte::base::Regex const* regex,
+ uint32_t match_flags,
+ char const* subject)
+{
+ auto match_context = create_match_context();
+ auto match_data = pcre2_match_data_create_8(256 /* should be plenty */,
+ nullptr /* general context */);
+
+ auto r = match_fn(regex->code(),
+ (PCRE2_SPTR8)subject,
+ strlen(subject),
+ 0, /* start offset */
+ match_flags |
+ PCRE2_NO_UTF_CHECK,
+ match_data,
+ match_context);
+
+ char* match;
+ if (r == PCRE2_ERROR_NOMATCH) {
+ match = nullptr;
+ } else if (r < 0) {
+ /* Error */
+ PCRE2_UCHAR8 buf[256];
+ auto n = pcre2_get_error_message_8(r, buf, sizeof(buf));
+ g_assert_true(n >= 0);
+ g_printerr("PCRE2 error %d: %s\n", r, buf);
+
+ match = nullptr;
+ } else {
+ /* has match */
+ auto const* ovector = pcre2_get_ovector_pointer_8(match_data);
+ auto const so = ovector[0];
+ auto const eo = ovector[1];
+ if (so == PCRE2_UNSET || eo == PCRE2_UNSET)
+ match = nullptr;
+ else
+ match = g_strndup(subject + so, eo - so);
+ }
+
+ pcre2_match_data_free_8(match_data);
+ pcre2_match_context_free_8(match_context);
+
+ return match;
+}
+
+struct TestData {
+ char const* pattern;
+ char const* string;
+ char const* expected;
+ uint32_t match_flags;
+};
+
+static std::string
+escape_slash(std::string str)
+{
+ auto escaped = std::string{};
+ for (auto const c : str) {
+ if (c == '/')
+ escaped.append("\\/");
+ else if (c == '\\')
+ escaped.append("\\\\");
+ else
+ escaped.push_back(c);
+ }
+
+ return escaped;
+}
+
+static std::string
+flags_to_string(uint32_t flags)
+{
+ auto str = std::string{};
+
+ if (flags & PCRE2_ANCHORED)
+ str.append("anchored,");
+
+ return str;
+}
+
+static void
+print_testdata(TestData* data,
+ int line)
+{
+ auto patstr = escape_slash(data->pattern);
+ auto flagstr = flags_to_string(data->match_flags);
+
+ fprintf(pcre2test_in,
+ "# Line: %d\n"
+ "/%s/%s\n"
+ " %s\\=\n"
+ "\n",
+ line,
+ patstr.c_str(), flagstr.c_str(),
+ data->string);
+ fprintf(pcre2test_out,
+ "# Line: %d\n"
+ "/%s/%s\n"
+ " %s\\=\n"
+ "%s%s\n"
+ "\n",
+ line,
+ patstr.c_str(), flagstr.c_str(),
+ data->string,
+ data->expected ? " 0: " : "No match",
+ data->expected ? data->expected : "");
+}
+
+static void
+assert_match_test(void const* ptr)
+{
+ auto data = reinterpret_cast<TestData const*>(ptr);
+
+ auto error = vte::glib::Error{};
+ auto regex = vte::base::Regex::compile(vte::base::Regex::Purpose::eMatch,
+ data->pattern,
+ PCRE2_UTF | PCRE2_NO_UTF_CHECK |
+ PCRE2_UCP |
+ PCRE2_MULTILINE |
+ /* Pass match_flags here as compile flags, since
+ * otherwise some JITed regex tests fail because
+ * ANCHORED is ignored when passed to
+ * pcre2_jit_match_8.
+ */
+ data->match_flags,
+ error);
+ error.assert_no_error();
+ g_assert_nonnull(regex);
+
+ auto match = get_match(&pcre2_match_8, regex, data->match_flags, data->string);
+
+ g_assert_cmpstr(match, ==, data->expected);
+ g_free(match);
+
+ if (vte::base::Regex::check_pcre_config_jit()) {
+ regex->jit(PCRE2_JIT_COMPLETE, error);
+ error.assert_no_error();
+ regex->jit(PCRE2_JIT_PARTIAL_SOFT, error);
+ error.assert_no_error();
+ regex->jit(PCRE2_JIT_PARTIAL_HARD, error);
+ error.assert_no_error();
+
+ match = get_match(&pcre2_jit_match_8, regex, data->match_flags, data->string);
+ g_assert_cmpstr(match, ==, data->expected);
+ g_free(match);
+ }
+
+ regex->unref();
+}
+
+static void
+assert_match(char const* pattern,
+ char const* string,
+ char const* expected,
+ uint32_t match_flags = 0u,
+ int line = __builtin_LINE())
+{
+ auto data = g_new(TestData, 1);
+ data->pattern = pattern;
+ data->string = string;
+ data->expected = expected == ENTIRE ? string : expected;
+ data->match_flags = match_flags;
+
+ auto path = g_strdup_printf("/vte/regex/builtins/%d", line);
+ g_test_add_data_func_full(path, data, assert_match_test, (GDestroyNotify)g_free);
+ g_free(path);
+
+ if (pcre2test)
+ print_testdata(data, line);
+}
+
+static void
+assert_match_anchored(char const* pattern,
+ char const* string,
+ char const* expected,
+ int line = __builtin_LINE())
+{
+ assert_match(pattern, string, expected, PCRE2_ANCHORED, line);
+}
+
+static void
+setup_regex_builtins_tests(void)
+{
+ /* SCHEME is case insensitive */
+ assert_match_anchored (SCHEME, "http", ENTIRE);
+ assert_match_anchored (SCHEME, "HTTPS", ENTIRE);
+
+ /* USER is nonempty, alphanumeric, dot, plus and dash */
+ assert_match_anchored (USER, "", nullptr);
+ assert_match_anchored (USER, "dr.john-smith", ENTIRE);
+ assert_match_anchored (USER, "abc+def@ghi", "abc+def");
+
+ /* PASS is optional colon-prefixed value, allowing quite some characters, but definitely not @ */
+ assert_match_anchored (PASS, "", ENTIRE);
+ assert_match_anchored (PASS, "nocolon", "");
+ assert_match_anchored (PASS, ":s3cr3T", ENTIRE);
+ assert_match_anchored (PASS, ":$?#@host", ":$?#");
+
+ /* Hostname of at least 1 component, containing at least one non-digit in at least one of the segments */
+ assert_match_anchored (HOSTNAME1, "example.com", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "a-b.c-d", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "a_b", "a"); /* TODO: can/should we totally abort here? */
+ assert_match_anchored (HOSTNAME1, "déjà-vu.com", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "➡.ws", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "cömbining-áccents", ENTIRE);
+ assert_match_anchored (HOSTNAME1, "12", nullptr);
+ assert_match_anchored (HOSTNAME1, "12.34", nullptr);
+ assert_match_anchored (HOSTNAME1, "12.ab", ENTIRE);
+ if (pcre2test) // unexplained failure
+ assert_match_anchored (HOSTNAME1, "ab.12", nullptr); /* errr... could we fail here?? */
+
+ /* Hostname of at least 2 components, containing at least one non-digit in at least one of the segments */
+ assert_match_anchored (HOSTNAME2, "example.com", ENTIRE);
+ assert_match_anchored (HOSTNAME2, "example", nullptr);
+ assert_match_anchored (HOSTNAME2, "12", nullptr);
+ assert_match_anchored (HOSTNAME2, "12.34", nullptr);
+ assert_match_anchored (HOSTNAME2, "12.ab", ENTIRE);
+ assert_match_anchored (HOSTNAME2, "ab.12", nullptr);
+ if (pcre2test) // unexplained failure
+ assert_match_anchored (HOSTNAME2, "ab.cd.12", nullptr); /* errr... could we fail here?? */
+
+ /* IPv4 segment (number between 0 and 255) */
+ assert_match_anchored (DEFS "(?&S4)", "0", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "1", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "9", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "10", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "99", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "100", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "200", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "250", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "255", ENTIRE);
+ assert_match_anchored (DEFS "(?&S4)", "256", nullptr);
+ assert_match_anchored (DEFS "(?&S4)", "260", nullptr);
+ assert_match_anchored (DEFS "(?&S4)", "300", nullptr);
+ assert_match_anchored (DEFS "(?&S4)", "1000", nullptr);
+ assert_match_anchored (DEFS "(?&S4)", "", nullptr);
+ assert_match_anchored (DEFS "(?&S4)", "a1b", nullptr);
+
+ /* IPv4 addresses */
+ assert_match_anchored (DEFS "(?&IPV4)", "11.22.33.44", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV4)", "0.1.254.255", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV4)", "75.150.225.300", nullptr);
+ assert_match_anchored (DEFS "(?&IPV4)", "1.2.3.4.5", "1.2.3.4"); /* we could also bail out and not match at all */
+
+ /* IPv6 addresses */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:::22", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:44::55:66", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "dead::beef", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "faded::bee", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "live::pork", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "::1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11::22:33::44", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:::33", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "dead:beef::192.168.1.1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "192.168.1.1", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:87654", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:45678", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.12345", nullptr);
+
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77", nullptr); /* no :: */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88:99", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77", ENTIRE); /* :: at the start */
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77:88", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77", ENTIRE); /* :: in the middle */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77:88", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77::", ENTIRE); /* :: at the end */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88::", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "::", ENTIRE); /* :: only */
+
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:192.168.1.1", nullptr); /* no :: */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.1", ENTIRE);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:192.168.1.1", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:192.168.1.1", ENTIRE); /* :: at the start */
+ assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:192.168.1.1", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:192.168.1.1", ENTIRE); /* :: in the middle */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:192.168.1.1", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55::192.168.1.1", ENTIRE); /* :: at the end(ish) */
+ assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66::192.168.1.1", nullptr);
+ assert_match_anchored (DEFS "(?&IPV6)", "::192.168.1.1", ENTIRE); /* :: only(ish) */
+
+ /* URL_HOST is either a hostname, or an IPv4 address, or a bracket-enclosed IPv6 address */
+ assert_match_anchored (DEFS URL_HOST, "example", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "example.com", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "11.22.33.44", ENTIRE);
+ assert_match_anchored (DEFS URL_HOST, "[11.22.33.44]", nullptr);
+ assert_match_anchored (DEFS URL_HOST, "dead::be:ef", "dead"); /* TODO: can/should we totally abort here? */
+ assert_match_anchored (DEFS URL_HOST, "[dead::be:ef]", ENTIRE);
+
+ /* EMAIL_HOST is either an at least two-component hostname, or a bracket-enclosed IPv[46] address */
+ assert_match_anchored (DEFS EMAIL_HOST, "example", nullptr);
+ assert_match_anchored (DEFS EMAIL_HOST, "example.com", ENTIRE);
+ assert_match_anchored (DEFS EMAIL_HOST, "11.22.33.44", nullptr);
+ assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.44]", ENTIRE);
+ assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.456]", nullptr);
+ assert_match_anchored (DEFS EMAIL_HOST, "dead::be:ef", nullptr);
+ assert_match_anchored (DEFS EMAIL_HOST, "[dead::be:ef]", ENTIRE);
+
+ /* Number between 1 and 65535 (helper for port) */
+ assert_match_anchored (N_1_65535, "0", nullptr);
+ assert_match_anchored (N_1_65535, "1", ENTIRE);
+ assert_match_anchored (N_1_65535, "10", ENTIRE);
+ assert_match_anchored (N_1_65535, "100", ENTIRE);
+ assert_match_anchored (N_1_65535, "1000", ENTIRE);
+ assert_match_anchored (N_1_65535, "10000", ENTIRE);
+ assert_match_anchored (N_1_65535, "60000", ENTIRE);
+ assert_match_anchored (N_1_65535, "65000", ENTIRE);
+ assert_match_anchored (N_1_65535, "65500", ENTIRE);
+ assert_match_anchored (N_1_65535, "65530", ENTIRE);
+ assert_match_anchored (N_1_65535, "65535", ENTIRE);
+ assert_match_anchored (N_1_65535, "65536", nullptr);
+ assert_match_anchored (N_1_65535, "65540", nullptr);
+ assert_match_anchored (N_1_65535, "65600", nullptr);
+ assert_match_anchored (N_1_65535, "66000", nullptr);
+ assert_match_anchored (N_1_65535, "70000", nullptr);
+ assert_match_anchored (N_1_65535, "100000", nullptr);
+ assert_match_anchored (N_1_65535, "", nullptr);
+ assert_match_anchored (N_1_65535, "a1b", nullptr);
+
+ /* PORT is an optional colon-prefixed value */
+ assert_match_anchored (PORT, "", ENTIRE);
+ assert_match_anchored (PORT, ":1", ENTIRE);
+ assert_match_anchored (PORT, ":65535", ENTIRE);
+ assert_match_anchored (PORT, ":65536", ""); /* TODO: can/should we totally abort here? */
+
+ /* Parentheses are only allowed in matching pairs, see bug 763980. */
+ /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */
+ assert_match_anchored (DEFS URLPATH, "/ab/cd", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/ab/cd.html.", "/ab/cd.html");
+ assert_match_anchored (DEFS URLPATH, "/The_Offspring_(album)", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/The_Offspring)", "/The_Offspring");
+ assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f))", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f)))", "/a((b(c)d)e(f))");
+ assert_match_anchored (DEFS URLPATH, "/a(b).(c).", "/a(b).(c)");
+ assert_match_anchored (DEFS URLPATH, "/a.(b.(c.).).(d.(e.).).)", "/a.(b.(c.).).(d.(e.).)");
+ assert_match_anchored (DEFS URLPATH, "/a)b(c", "/a");
+ assert_match_anchored (DEFS URLPATH, "/.", "/");
+ assert_match_anchored (DEFS URLPATH, "/(.", "/");
+ assert_match_anchored (DEFS URLPATH, "/).", "/");
+ assert_match_anchored (DEFS URLPATH, "/().", "/()");
+ assert_match_anchored (DEFS URLPATH, "/", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "?", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "?param=value", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "#", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "#anchor", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/php?param[]=value1&param[]=value2", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/foo?param1[index1]=value1&param2[index2]=value2", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/[[[]][]]", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/[([])]([()])", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/([()])[([])]", ENTIRE);
+ assert_match_anchored (DEFS URLPATH, "/[(])", "/");
+ assert_match_anchored (DEFS URLPATH, "/([)]", "/");
+
+
+ /* Put the components together and test the big picture */
+
+ assert_match (REGEX_URL_AS_IS, "There's no URL here http:/foo", nullptr);
+ assert_match (REGEX_URL_AS_IS, "Visit http://example.com for details", "http://example.com");
+ assert_match (REGEX_URL_AS_IS, "Trailing dot http://foo/bar.html.", "http://foo/bar.html");
+ assert_match (REGEX_URL_AS_IS, "Trailing ellipsis http://foo/bar.html...", "http://foo/bar.html");
+ assert_match (REGEX_URL_AS_IS, "Trailing comma http://foo/bar,baz,", "http://foo/bar,baz");
+ assert_match (REGEX_URL_AS_IS, "Trailing semicolon http://foo/bar;baz;", "http://foo/bar;baz");
+ assert_match (REGEX_URL_AS_IS, "See <http://foo/bar>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<http://foo.bar/asdf.qwer.html>", "http://foo.bar/asdf.qwer.html");
+ assert_match (REGEX_URL_AS_IS, "Go to http://192.168.1.1.", "http://192.168.1.1");
+ assert_match (REGEX_URL_AS_IS, "If not, see <http://www.gnu.org/licenses/>.", "http://www.gnu.org/licenses/");
+ assert_match (REGEX_URL_AS_IS, "<a href=\"http://foo/bar\">foo</a>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<a href='http://foo/bar'>foo</a>", "http://foo/bar");
+ assert_match (REGEX_URL_AS_IS, "<url>http://foo/bar</url>", "http://foo/bar");
+
+ assert_match (REGEX_URL_AS_IS, "http://", nullptr);
+ assert_match (REGEX_URL_AS_IS, "http://a", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.", "http://aa");
+ assert_match (REGEX_URL_AS_IS, "http://aa.b", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/c", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc/", ENTIRE);
+
+ assert_match (REGEX_URL_AS_IS, "HtTp://déjà-vu.com:10000/déjà/vu", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "HTTP://joe:sEcReT@➡.ws:1080", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://cömbining-áccents", ENTIRE);
+
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/foo", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:5555/xyz", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://[dead::beef]:12345/ipv6", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "https://[dead::beef:11.22.33.44]", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:", "http://1.2.3.4"); /* TODO: can/should we totally abort here? */
+ assert_match (REGEX_URL_AS_IS, "https://dead::beef/no-brackets-ipv6", "https://dead"); /* ditto */
+ assert_match (REGEX_URL_AS_IS, "http://111.222.333.444/", nullptr);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000", "http://1.2.3.4"); /* TODO: can/should we totally abort here? */
+ assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]", nullptr);
+
+ /* '?' or '#' without '/', GNOME/gnome-terminal#7888 */
+ assert_match (REGEX_URL_AS_IS, "http://foo.bar?", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://foo.bar?param=value", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://foo.bar:12345?param=value", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4?param=value", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://[dead::beef]?param=value", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://foo.bar#", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://foo.bar#anchor", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://foo.bar:12345#anchor", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://1.2.3.4#anchor", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://[dead::beef]#anchor", ENTIRE);
+
+ /* Username, password */
+ assert_match (REGEX_URL_AS_IS, "http://joe@example.com", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://user.name:sec.ret@host.name", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://joe:secret@[::1]", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://dudewithnopassword:@example.com", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://safeguy:!#$%^&*@host", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http://invalidusername!@host", "http://invalidusername");
+
+ assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "http:///foo", nullptr);
+
+ /* Parentheses are only allowed in matching pairs, see bug 763980. */
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/The_Offspring_(album)", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring)", "https://en.wikipedia.org/wiki/The_Offspring");
+ assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring_(album))", "https://en.wikipedia.org/wiki/The_Offspring_(album)");
+ assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d");
+ assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a");
+
+ /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see bug 448044. */
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">", "https://en.wikipedia.org/wiki/Moore's_law");
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">", "https://en.wikipedia.org/wiki/Cryin'");
+ assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>", "https://en.wikipedia.org/wiki/Aerosmith");
+
+ /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see issue GNOME/gnome-terminal#5921 */
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">", "https://en.wikipedia.org/wiki/Moore's_law");
+ assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE);
+ assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">", "https://en.wikipedia.org/wiki/Cryin'");
+ assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>", "https://en.wikipedia.org/wiki/Aerosmith");
+
+ /* No scheme */
+ /* These need PCRE2 10.35 to succeed; see issue GNOME/gnome-terminal#221 */
+ if (pcre2_atleast_10_35 || pcre2test) {
+ assert_match (REGEX_URL_HTTP, "www.foo.bar/baz", ENTIRE);
+ assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz", ENTIRE);
+ assert_match (REGEX_URL_HTTP, "FTP.FOO.BAR/BAZ", ENTIRE); /* FIXME if no scheme is given and url starts with ftp, can we make the protocol ftp instead of http? */
+ assert_match (REGEX_URL_HTTP, "ftpxy.foo.bar/baz", ENTIRE);
+ if (pcre2test) // unexplained failure
+ assert_match (REGEX_URL_HTTP, "ftp.123/baz", nullptr); /* errr... could we fail here?? */
+ }
+ assert_match (REGEX_URL_HTTP, "foo.bar/baz", nullptr);
+ assert_match (REGEX_URL_HTTP, "abc.www.foo.bar/baz", nullptr);
+ assert_match (REGEX_URL_HTTP, "uvwww.foo.bar/baz", nullptr);
+ assert_match (REGEX_URL_HTTP, "xftp.foo.bar/baz", nullptr);
+
+ /* file:/ or file://(hostname)?/ */
+ assert_match (REGEX_URL_FILE, "file:", nullptr);
+ assert_match (REGEX_URL_FILE, "file:/", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file://", nullptr);
+ assert_match (REGEX_URL_FILE, "file:///", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file:////", nullptr);
+ assert_match (REGEX_URL_FILE, "file:etc/passwd", nullptr);
+ assert_match (REGEX_URL_FILE, "File:/etc/passwd", ENTIRE);
+ assert_match (REGEX_URL_FILE, "FILE:///etc/passwd", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file:////etc/passwd", nullptr);
+ assert_match (REGEX_URL_FILE, "file://host.name", nullptr);
+ assert_match (REGEX_URL_FILE, "file://host.name/", ENTIRE);
+ assert_match (REGEX_URL_FILE, "file://host.name/etc", ENTIRE);
+
+ assert_match (REGEX_URL_FILE, "See file:/.", "file:/");
+ assert_match (REGEX_URL_FILE, "See file:///.", "file:///");
+ assert_match (REGEX_URL_FILE, "See file:/lost+found.", "file:/lost+found");
+ assert_match (REGEX_URL_FILE, "See file:///lost+found.", "file:///lost+found");
+
+ /* Email */
+ assert_match (REGEX_EMAIL, "Write to foo@bar.com.", "foo@bar.com");
+ assert_match (REGEX_EMAIL, "Write to <foo@bar.com>", "foo@bar.com");
+ assert_match (REGEX_EMAIL, "Write to mailto:foo@bar.com.", "mailto:foo@bar.com");
+ assert_match (REGEX_EMAIL, "Write to MAILTO:FOO@BAR.COM.", "MAILTO:FOO@BAR.COM");
+ assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.4]", "foo@[1.2.3.4]");
+ assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.456]", nullptr);
+ assert_match (REGEX_EMAIL, "Write to foo@[1::2345]", "foo@[1::2345]");
+ assert_match (REGEX_EMAIL, "Write to foo@[dead::beef]", "foo@[dead::beef]");
+ assert_match (REGEX_EMAIL, "Write to foo@1.2.3.4", nullptr);
+ assert_match (REGEX_EMAIL, "Write to foo@1.2.3.456", nullptr);
+ assert_match (REGEX_EMAIL, "Write to foo@1::2345", nullptr);
+ assert_match (REGEX_EMAIL, "Write to foo@dead::beef", nullptr);
+ assert_match (REGEX_EMAIL, "<baz email=\"foo@bar.com\"/>", "foo@bar.com");
+ assert_match (REGEX_EMAIL, "<baz email='foo@bar.com'/>", "foo@bar.com");
+ assert_match (REGEX_EMAIL, "<email>foo@bar.com</email>", "foo@bar.com");
+
+ /* Sip, examples from rfc 3261 */
+ assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice:secretword@atlanta.com;transport=tcp", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sips:alice@atlanta.com?subject=project%20x&priority=urgent", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:+1-212-555-1212:1234@gateway.com;user=phone", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sips:1212@gateway.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:alice@192.0.2.4", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "sip:atlanta.com;method=REGISTER?to=alice%40atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "SIP:alice;day=tuesday@atlanta.com", ENTIRE);
+ assert_match (REGEX_URL_VOIP, "Dial sip:alice@192.0.2.4.", "sip:alice@192.0.2.4");
+
+ /* Extremely long match, bug 770147 */
+ assert_match (REGEX_URL_AS_IS, "http://www.example.com/ThisPathConsistsOfMoreThan1024Characters"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890", ENTIRE);
+}
+
+/* Test case: asserts that vte::base::Regex::check_pcre_config_unicode()
+ * returns true, then calls assert_no_error() on the error object that was
+ * passed to it. */
+static void
+test_regex_unicode(void)
+{
+ auto error = vte::glib::Error{};
+ g_assert_true(vte::base::Regex::check_pcre_config_unicode(error));
+ error.assert_no_error();
+}
+
+/*
+ * parse_args:
+ * @argv: pointer to the argument vector, handed to g_option_context_parse()
+ * @argc: pointer to the argument count, handed to g_option_context_parse()
+ * @error: return location for a #GError
+ *
+ * Handles the "--pcre2test FILENAME" option; unknown options are ignored
+ * and the automatic --help handling is disabled. If the option was given
+ * and parsing succeeded, the `pcre2test` flag is set and FILENAME.in and
+ * FILENAME.out are opened for writing as `pcre2test_in` and `pcre2test_out`.
+ *
+ * Returns: the result of option parsing if no file needs to be opened;
+ *   otherwise true if both files were opened, or false (with @error set via
+ *   g_set_error()) if either could not be opened
+ */
+static bool
+parse_args(char*** argv,
+           int* argc,
+           GError** error)
+{
+        char* filename_arg = nullptr;
+        GOptionEntry const option_entries[] = {
+                { "pcre2test", 0, 0, G_OPTION_ARG_FILENAME, &filename_arg,
+                  "Print input and output of tests in pcre2test format to file", "FILENAME" },
+                { nullptr }
+        };
+
+        auto const option_context = g_option_context_new(nullptr);
+        g_option_context_set_help_enabled(option_context, false);
+        g_option_context_set_ignore_unknown_options(option_context, true);
+        g_option_context_add_main_entries(option_context, option_entries, nullptr);
+        bool const parsed = g_option_context_parse(option_context, argc, argv, error);
+        g_option_context_free(option_context);
+
+        pcre2test = (filename_arg != nullptr);
+        if (!parsed || !pcre2test)
+                return parsed;
+
+        /* Derive both file names first; after that the option string itself
+         * is no longer needed and can be released. */
+        auto const in_path = std::string{filename_arg} + ".in"s;
+        auto const out_path = std::string{filename_arg} + ".out"s;
+        g_free(filename_arg);
+        filename_arg = nullptr;
+
+        pcre2test_in = fopen(in_path.c_str(), "wbe");
+        if (pcre2test_in == nullptr) {
+                /* Save errno before any further call can overwrite it. */
+                int const saved_errno = errno;
+                g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_FAILED,
+                            "Failed to open pcre2test input file: %s",
+                            g_strerror(saved_errno));
+                return false;
+        }
+
+        pcre2test_out = fopen(out_path.c_str(), "wbe");
+        if (pcre2test_out == nullptr) {
+                int const saved_errno = errno;
+                g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_FAILED,
+                            "Failed to open pcre2test output file: %s",
+                            g_strerror(saved_errno));
+                /* Do not leave the already opened input file behind. */
+                fclose(pcre2test_in);
+                pcre2test_in = nullptr;
+                return false;
+        }
+
+        return true;
+}
+
+/*
+ * Test driver.
+ *
+ * Initialises the GLib test framework, parses the program's own options
+ * with parse_args(), records whether the PCRE2 library in use is at least
+ * version 10.35, and registers the test cases.
+ *
+ * In pcre2test mode (--pcre2test FILENAME) a "#pattern" header line is
+ * written to both output files, the files are closed after the setup step
+ * and the program exits successfully without calling g_test_run().
+ * Otherwise the result of g_test_run() is returned.
+ */
+int
+main(int argc,
+     char* argv[])
+{
+        setlocale(LC_ALL, "");
+
+        g_test_init(&argc, &argv, nullptr);
+
+        auto err = vte::glib::Error{};
+        if (!parse_args(&argv, &argc, err)) {
+                g_printerr("Failed to parse arguments: %s\n", err.message());
+                return EXIT_FAILURE;
+        }
+
+        /* "At least 10.35" has to include 10.35 itself, hence >= rather
+         * than >. NOTE(review): PCRE2 usually appends a release date to its
+         * version string, which made the exclusive comparison succeed for
+         * 10.35 anyway; the inclusive form does not depend on that suffix. */
+        auto version = vte::base::Regex::get_pcre_version();
+        pcre2_atleast_10_35 = strverscmp(version.c_str(), "10.35") >= 0;
+
+        if (pcre2test) {
+                fprintf(pcre2test_in, "#pattern multiline,ucp,utf,no_utf_check\n\n");
+                fprintf(pcre2test_out, "#pattern multiline,ucp,utf,no_utf_check\n\n");
+        }
+
+        /* Build test suites */
+
+        g_test_add_func("/vte/regex/unicode", test_regex_unicode);
+
+        setup_regex_builtins_tests();
+
+        /* Run tests */
+
+        if (pcre2test) {
+                /* Only the pcre2test files were wanted; nothing is run. */
+                fclose(pcre2test_in);
+                fclose(pcre2test_out);
+                return EXIT_SUCCESS;
+        }
+
+        return g_test_run();
+}
diff --git a/src/regex.cc b/src/regex.cc
index 20684f1c..914fa6a6 100644
--- a/src/regex.cc
+++ b/src/regex.cc
@@ -52,6 +52,17 @@ Regex::unref() noexcept
delete this;
}
+/*
+ * Regex::get_pcre_version:
+ *
+ * Queries the PCRE2 library for its version string using
+ * pcre2_config() with PCRE2_CONFIG_VERSION.
+ *
+ * Returns: the version string without a terminating NUL, or an empty
+ *   string if pcre2_config() reported an error
+ */
+std::string
+Regex::get_pcre_version()
+{
+        auto v = std::string{};
+
+        /* With a null buffer, pcre2_config() returns the required length in
+         * code units, including the terminating NUL, or a negative error
+         * code. A negative value must not reach resize(), where it would be
+         * converted to a huge unsigned size. */
+        auto const r = pcre2_config_8(PCRE2_CONFIG_VERSION, nullptr);
+        if (r <= 0)
+                return v;
+
+        v.resize(static_cast<std::string::size_type>(r));
+        if (pcre2_config_8(PCRE2_CONFIG_VERSION, v.data()) < 0) {
+                v.clear();
+                return v;
+        }
+
+        /* The reported length counts the terminating NUL; drop it so that
+         * size() and comparisons see only the version text. */
+        while (!v.empty() && v.back() == '\0')
+                v.pop_back();
+
+        return v;
+}
+
bool
Regex::check_pcre_config_unicode(GError** error)
{
diff --git a/src/regex.hh b/src/regex.hh
index db5b70ad..b98be2c0 100644
--- a/src/regex.hh
+++ b/src/regex.hh
@@ -38,6 +38,7 @@ public:
eSearch,
};
+ /* Returns the version string that pcre2_config() reports for PCRE2_CONFIG_VERSION. */
+ static std::string get_pcre_version();
static bool check_pcre_config_unicode(GError** error);
static bool check_pcre_config_jit(void);
static Regex* compile(Purpose purpose,
diff --git a/src/vte.cc b/src/vte.cc
index c0053900..76b6c68b 100644
--- a/src/vte.cc
+++ b/src/vte.cc
@@ -73,6 +73,8 @@
#include "cxx-utils.hh"
#include "gobject-glue.hh"
+#include "regex-builtins.hh"
+
#ifdef WITH_A11Y
#if VTE_GTK == 3
#include "vteaccess.h"
@@ -81,6 +83,7 @@
#endif /* VTE_GTK == 3 */
#endif /* WITH_A11Y */
+#include <algorithm>
#include <new> /* placement new */
using namespace std::literals;
@@ -1107,6 +1110,28 @@ Terminal::regex_match_remove(int tag) noexcept
match_regexes_writable().erase(i);
}
+/*
+ * Terminal::regex_match_add_builtins:
+ *
+ * Appends one entry per builtin regex to this terminal's list of match
+ * regexes. Each entry uses match flags 0, the VTE_MATCH_BUILTINS_CURSOR
+ * cursor and the tag that the builtin set supplies for that regex. The
+ * builtin set is obtained from vte::base::RegexBuiltins::get() the first
+ * time it is needed and kept in m_match_regex_builtins afterwards.
+ */
+void
+Terminal::regex_match_add_builtins() noexcept
+{
+        auto& regexes = match_regexes_writable();
+
+        /* Fetch the builtin set only once per terminal. */
+        if (m_match_regex_builtins == nullptr) {
+                m_match_regex_builtins = vte::base::RegexBuiltins::get();
+        }
+
+        auto&& builtins = m_match_regex_builtins->builtins();
+        for (auto const& [builtin_regex, builtin_tag] : builtins)
+                regexes.emplace_back(make_ref(builtin_regex.get()),
+                                     0 /* match flags */,
+                                     VTE_MATCH_BUILTINS_CURSOR,
+                                     builtin_tag);
+}
+
+/*
+ * Terminal::regex_match_remove_builtins:
+ *
+ * Removes every entry with a negative tag from this terminal's list of
+ * match regexes; entries with a nonnegative tag are kept, in their
+ * original order.
+ */
+void
+Terminal::regex_match_remove_builtins() noexcept
+{
+        auto& match_regexes = match_regexes_writable();
+
+        /* std::remove_if() only shifts the kept elements to the front and
+         * returns the new logical end; the leftover tail has to be erased
+         * explicitly, or the container keeps its size and ends in
+         * moved-from entries. */
+        match_regexes.erase(std::remove_if(std::begin(match_regexes), std::end(match_regexes),
+                                           [](MatchRegex const& rem) { return rem.tag() < 0; }),
+                            std::end(match_regexes));
+}
+
/*
* match_rowcol_to_offset:
* @terminal:
@@ -1480,7 +1505,7 @@ Terminal::match_check_internal(vte::grid::column_t column,
char*
Terminal::regex_match_check(vte::grid::column_t column,
vte::grid::row_t row,
- int* tag)
+ int* tag_ptr)
{
/* Need to ensure the ringview is updated. */
ringview_update();
@@ -1506,8 +1531,16 @@ Terminal::regex_match_check(vte::grid::column_t column,
_VTE_DEBUG_IF(VTE_DEBUG_EVENTS | VTE_DEBUG_REGEX) {
if (ret != NULL) g_printerr("Matched `%s'.\n", ret);
}
- if (tag != nullptr)
- *tag = (match != nullptr) ? match->tag() : -1;
+
+ int tag = -1;
+ if (match != nullptr) {
+ tag = match->tag();
+ if (tag < -1 && m_match_regex_builtins)
+ tag = m_match_regex_builtins->transform_match(ret, tag);
+ }
+
+ if (tag_ptr != nullptr)
+ *tag_ptr = tag;
return ret;
}
diff --git a/src/vte/vteenums.h b/src/vte/vteenums.h
index 4082fe88..0eb77673 100644
--- a/src/vte/vteenums.h
+++ b/src/vte/vteenums.h
@@ -215,4 +215,18 @@ typedef enum {
VTE_ALIGN_END = 3U,
} VteAlign;
+/**
+ * VteBuiltinMatchTag:
+ * @VTE_BUILTIN_MATCH_TAG_URI: the match is a URI as recognised by
+ * the expressions added with vte_terminal_match_add_builtins()
+ *
+ * An enumeration that will be returned from vte_terminal_match_check_event()
+ * if a builtin expression matched.
+ *
+ * Since: 0.70
+ */
+typedef enum {
+ VTE_BUILTIN_MATCH_TAG_URI = -2
+} VteBuiltinMatchTag;
+
G_END_DECLS
diff --git a/src/vte/vtemacros.h b/src/vte/vtemacros.h
index c1300998..873f12ad 100644
--- a/src/vte/vtemacros.h
+++ b/src/vte/vtemacros.h
@@ -21,6 +21,10 @@
#error "Only <vte/vte.h> can be included directly."
#endif
+#ifdef VTE_COMPILATION
+#define _VTE_GTK VTE_GTK
+#else
+
#include <gtk/gtk.h>
#if GTK_CHECK_VERSION(4,0,0)
@@ -33,6 +37,8 @@
#error gtk+ version unknown
#endif
+#endif /* VTE_COMPILATION */
+
#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 6)
#define _VTE_GNUC_PACKED __attribute__((__packed__))
#else
diff --git a/src/vte/vteterminal.h b/src/vte/vteterminal.h
index 0d3cf51c..1e287d0c 100644
--- a/src/vte/vteterminal.h
+++ b/src/vte/vteterminal.h
@@ -429,6 +429,9 @@ _VTE_PUBLIC
int vte_terminal_match_add_regex(VteTerminal *terminal,
VteRegex *regex,
guint32 flags) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1, 2);
+_VTE_PUBLIC
+void vte_terminal_match_add_builtins(VteTerminal *terminal) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1);
+
/* Set the cursor to be used when the pointer is over a given match. */
_VTE_PUBLIC
void vte_terminal_match_set_cursor_name(VteTerminal *terminal,
@@ -438,6 +441,8 @@ _VTE_PUBLIC
void vte_terminal_match_remove(VteTerminal *terminal,
int tag) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1);
_VTE_PUBLIC
+void vte_terminal_match_remove_builtins(VteTerminal *terminal) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1);
+_VTE_PUBLIC
void vte_terminal_match_remove_all(VteTerminal *terminal) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1);
/* Check if a given cell on the screen contains part of a matched string. If
diff --git a/src/vtedefines.hh b/src/vtedefines.hh
index 7f2e478a..a452ad8e 100644
--- a/src/vtedefines.hh
+++ b/src/vtedefines.hh
@@ -74,10 +74,13 @@
#define VTE_PALETTE_SIZE 263
#define VTE_SCROLLBACK_INIT 512
+
#define VTE_DEFAULT_CURSOR std::string{"text"}
#define VTE_MOUSING_CURSOR std::string{"default"}
#define VTE_HYPERLINK_CURSOR std::string{"pointer"}
#define VTE_HYPERLINK_CURSOR_DEBUG std::string{"crosshair"}
+#define VTE_MATCH_BUILTINS_CURSOR std::string{"pointer"}
+
#define VTE_CHILD_INPUT_PRIORITY G_PRIORITY_DEFAULT_IDLE
#define VTE_CHILD_OUTPUT_PRIORITY G_PRIORITY_HIGH
#define VTE_MAX_INPUT_READ 0x1000
diff --git a/src/vtegtk.cc b/src/vtegtk.cc
index f94b8121..33223a5e 100644
--- a/src/vtegtk.cc
+++ b/src/vtegtk.cc
@@ -2895,7 +2895,7 @@ vte_terminal_match_add_gregex(VteTerminal *terminal,
* vte_terminal_match_add_regex:
* @terminal: a #VteTerminal
* @regex: (transfer none): a #VteRegex
- * @flags: PCRE2 match flags, or 0
+ * @flags: PCRE2 match flags, or 0 to use the default flags
*
* Adds the regular expression @regex to the list of matching expressions. When the
* user moves the mouse cursor over a section of displayed text which matches
@@ -2904,7 +2904,12 @@ vte_terminal_match_add_gregex(VteTerminal *terminal,
* Note that @regex should have been created using the <literal>PCRE2_MULTILINE</literal>
* flag.
*
- * Returns: an integer associated with this expression
+ * Note that the default flags only contain PCRE2_UTF (and some flags for internal use);
+ * if you want to match unicode properties, you need to pass PCRE2_UCP in @flags; and you
+ * must always use the %PCRE2_MULTILINE flag.
+ * See man:pcre2_compile(3) for more information on available flags.
+ *
+ * Returns: a nonnegative integer associated with this expression
*
* Since: 0.46
*/
@@ -2932,6 +2937,35 @@ catch (...)
}
/**
+ * vte_terminal_match_add_builtins:
+ * @terminal: a #VteTerminal
+ *
+ * Adds regular expressions to recognise URIs to the list of matching expressions.
+ * When the user moves the mouse cursor over a section of displayed text which matches
+ * one of these expressions, the text will be highlighted.
+ *
+ * When vte_terminal_match_check_event() returns a match for one of these
+ * expressions, the returned tag will be a value from #VteBuiltinMatchTag.
+ *
+ * Use vte_terminal_match_remove_builtins() or vte_terminal_match_remove_all() to remove
+ * the matching expressions added by this function.
+ *
+ * Since: 0.70
+ */
+void
+vte_terminal_match_add_builtins(VteTerminal *terminal) noexcept
+try
+{
+ g_return_if_fail(VTE_IS_TERMINAL(terminal));
+
+ IMPL(terminal)->regex_match_add_builtins();
+}
+catch (...)
+{
+ /* This function is noexcept: exceptions are logged here instead of propagating. */
+ vte::log_exception();
+}
+
+/**
* vte_terminal_match_check:
* @terminal: a #VteTerminal
* @column: the text column
@@ -2978,12 +3012,16 @@ catch (...)
*
* Checks if the text in and around the position of the event matches any of the
* regular expressions previously set using vte_terminal_match_add(). If a
- * match exists, the text string is returned and if @tag is not %NULL, the number
- * associated with the matched regular expression will be stored in @tag.
+ * match exists, the text string is returned.
*
- * If more than one regular expression has been set with
- * vte_terminal_match_add(), then expressions are checked in the order in
- * which they were added.
+ * If @tag is not %NULL, it will store the nonnegative integer associated with the
+ * matched regular expression if that expression was added with
+ * vte_terminal_match_add_regex(), or a negative number from #VteBuiltinMatchTag
+ * if it is one of those added with vte_terminal_match_add_builtins(), or -1 if
+ * there is no match.
+ *
+ * Expressions are checked in the order in which they were added, returning the
+ * first match.
*
* Returns: (transfer full) (nullable): a newly allocated string which matches one of the previously
* set regular expressions, or %NULL if there is no match
@@ -3254,7 +3292,7 @@ catch (...)
/**
* vte_terminal_match_remove:
* @terminal: a #VteTerminal
- * @tag: the tag of the regex to remove
+ * @tag: the nonnegative tag of the regex to remove
*
* Removes the regular expression which is associated with the given @tag from
* the list of expressions which the terminal will highlight when the user
@@ -3274,6 +3312,26 @@ catch (...)
}
/**
+ * vte_terminal_match_remove_builtins:
+ * @terminal: a #VteTerminal
+ *
+ * Removes the regular expressions added with vte_terminal_match_add_builtins().
+ *
+ * Since: 0.70
+ */
+void
+vte_terminal_match_remove_builtins(VteTerminal *terminal) noexcept
+try
+{
+ g_return_if_fail(VTE_IS_TERMINAL(terminal));
+ IMPL(terminal)->regex_match_remove_builtins();
+}
+catch (...)
+{
+ /* This function is noexcept: exceptions are logged here instead of propagating. */
+ vte::log_exception();
+}
+
+/**
* vte_terminal_match_remove_all:
* @terminal: a #VteTerminal
*
diff --git a/src/vteinternal.hh b/src/vteinternal.hh
index 54409499..19a1539a 100644
--- a/src/vteinternal.hh
+++ b/src/vteinternal.hh
@@ -55,8 +55,10 @@
#include "chunk.hh"
#include "pty.hh"
#include "utf8.hh"
+#include "fwd.hh"
#include <list>
+#include <memory>
#include <queue>
#include <optional>
#include <string>
@@ -597,6 +599,10 @@ public:
return match_regexes_writable().emplace_back(std::forward<Args>(args)...);
}
+ /* Shared set of builtin regexes. Stays unset until regex_match_add_builtins()
+  * is first called; regex_match_check() consults it to transform the tag of
+  * a match whose tag is below -1. */
+ std::shared_ptr<vte::base::RegexBuiltins> m_match_regex_builtins{};
+ /* Appends the builtin regexes to the list of match regexes. */
+ void regex_match_add_builtins() noexcept;
+ /* Meant to drop the entries with a negative tag (the builtins) from the list of match regexes. */
+ void regex_match_remove_builtins() noexcept;
+
char* m_match_contents;
GArray* m_match_attributes;
char* m_match;
diff --git a/src/vteregex.cc b/src/vteregex.cc
index e8deabf8..1dcf79c4 100644
--- a/src/vteregex.cc
+++ b/src/vteregex.cc
@@ -26,9 +26,9 @@
#include <exception>
-#include "vtemacros.h"
-#include "vteenums.h"
-#include "vteregex.h"
+#include "vte/vtemacros.h"
+#include "vte/vteenums.h"
+#include "vte/vteregex.h"
#include "glib-glue.hh"
#include "pcre2-glue.hh"