diff options
author | Christian Persch <chpe@src.gnome.org> | 2022-03-01 21:52:22 +0100 |
---|---|---|
committer | Christian Persch <chpe@src.gnome.org> | 2022-03-01 21:52:22 +0100 |
commit | 8fe5e1541c79cf12bc9009bc6f61a722cd773795 (patch) | |
tree | 26d59e6994f414dee12bbf0dc28aca88b844c756 | |
parent | 7a0a7f3c5dd1b9c798bfa0f1f0a13661167f54a3 (diff) | |
download | vte-wip/regex-builtins.tar.gz |
lib: Add builtin regexeswip/regex-builtins
Add builin regexes to recognise URLs, copied from gnome-terminal.
https://gitlab.gnome.org/GNOME/vte/issues/114
-rw-r--r-- | doc/reference/vte-sections.txt.in | 5 | ||||
-rw-r--r-- | meson.build | 4 | ||||
-rw-r--r-- | src/app/app.cc | 41 | ||||
-rw-r--r-- | src/fwd.hh | 1 | ||||
-rw-r--r-- | src/meson.build | 29 | ||||
-rw-r--r-- | src/regex-builtins-patterns.hh | 156 | ||||
-rw-r--r-- | src/regex-builtins.cc | 103 | ||||
-rw-r--r-- | src/regex-builtins.hh | 80 | ||||
-rw-r--r-- | src/regex-test.cc | 668 | ||||
-rw-r--r-- | src/regex.cc | 11 | ||||
-rw-r--r-- | src/regex.hh | 1 | ||||
-rw-r--r-- | src/vte.cc | 39 | ||||
-rw-r--r-- | src/vte/vteenums.h | 14 | ||||
-rw-r--r-- | src/vte/vtemacros.h | 6 | ||||
-rw-r--r-- | src/vte/vteterminal.h | 5 | ||||
-rw-r--r-- | src/vtedefines.hh | 3 | ||||
-rw-r--r-- | src/vtegtk.cc | 74 | ||||
-rw-r--r-- | src/vteinternal.hh | 6 | ||||
-rw-r--r-- | src/vteregex.cc | 6 |
19 files changed, 1229 insertions, 23 deletions
diff --git a/doc/reference/vte-sections.txt.in b/doc/reference/vte-sections.txt.in index 4c9680f0..2bc53bf5 100644 --- a/doc/reference/vte-sections.txt.in +++ b/doc/reference/vte-sections.txt.in @@ -7,6 +7,7 @@ VteCursorBlinkMode VteCursorShape VteEraseBinding VteTextBlinkMode +VteBuiltinMatchTag VteFormat VteWriteFlags VteSelectionFunc @@ -79,8 +80,10 @@ vte_terminal_get_cursor_position vte_terminal_hyperlink_check_event #endif vte_terminal_match_add_regex +vte_terminal_match_add_builtins vte_terminal_match_remove vte_terminal_match_remove_all +vte_terminal_match_remove_builtins vte_terminal_match_check #if VTE_GTK == 3 vte_terminal_match_check_event @@ -148,6 +151,8 @@ VTE_TYPE_ERASE_BINDING vte_erase_binding_get_type VTE_TYPE_TEXT_BLINK_MODE vte_text_blink_mode_get_type +VTE_TYPE_BUILTIN_MATCH_TAGS +vte_builtin_match_tags_get_type VTE_TYPE_FORMAT vte_format_get_type VTE_TYPE_WRITE_FLAGS diff --git a/meson.build b/meson.build index abe7e8fd..da4d7ab0 100644 --- a/meson.build +++ b/meson.build @@ -38,9 +38,9 @@ clangxx_req_version = '11.0' gtk3_req_version = '3.20.0' gtk3_min_req_version = '3.18' -gtk3_max_allowed_version = '3.20' +gtk3_max_allowed_version = '3.24' -gtk4_req_version = '4.0.1' +gtk4_req_version = '4.0.0' gtk4_min_req_version = '4.0' gtk4_max_allowed_version = '4.0' diff --git a/src/app/app.cc b/src/app/app.cc index 8b1d47ea..e9cd488a 100644 --- a/src/app/app.cc +++ b/src/app/app.cc @@ -1452,8 +1452,7 @@ struct _VteappWindowClass { static GType vteapp_window_get_type(void); static char const* const builtin_dingus[] = { - "(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?", - "(((gopher|news|telnet|nntp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\) ,\\\"]", + "(foo|bar|baz)+", nullptr, }; @@ -1941,6 +1940,29 @@ window_action_copy_match_cb(GSimpleAction* action, } static void +window_action_open_uri_cb(GSimpleAction* action, + GVariant* parameter, + void* data) +{ + VteappWindow* window = VTEAPP_WINDOW(data); + auto len = size_t{}; + auto str = g_variant_get_string(parameter, &len); + GError* err{nullptr}; + +#if GTK_CHECK_VERSION(3, 22, 0) + if (!gtk_show_uri_on_window(GTK_WINDOW(window), +#else + if (!gtk_show_uri(gtk_widget_get_screen(GTK_WIDGET(window)), +#endif + str, + gtk_get_current_event_time(), + &err)) { + verbose_printerr("Opening URI \"%s\" failed: %s\n", str, err->message); + g_error_free(err); + } +} + +static void window_action_paste_cb(GSimpleAction* action, GVariant* parameter, void* data) @@ -2027,13 +2049,21 @@ vteapp_window_show_context_menu(VteappWindow* window, g_menu_append_item(menu, item.get()); } - auto match = vte::glib::take_string(vte_terminal_match_check_event(window->terminal, event, nullptr)); + auto tag = -1; + auto match = vte::glib::take_string(vte_terminal_match_check_event(window->terminal, event, &tag)); if (match) { verbose_print("Match: %s\n", match.get()); auto target = g_variant_new_string(match.get()); /* floating */ auto item = vte::glib::take_ref(g_menu_item_new("Copy _Match", nullptr)); g_menu_item_set_action_and_target_value(item.get(), "win.copy-match", target); g_menu_append_item(menu, item.get()); + + } + if (match && tag == VTE_BUILTIN_MATCH_TAG_URI) { + auto target = g_variant_new_string(match.get()); /* floating */ + auto item = vte::glib::take_ref(g_menu_item_new("_Open URI", nullptr)); + g_menu_item_set_action_and_target_value(item.get(), "win.open-uri", target); + g_menu_append_item(menu, item.get()); } /* Test extra match API */ @@ -2464,6 +2494,7 @@ vteapp_window_constructed(GObject *object) GActionEntry const entries[] = { { "copy", window_action_copy_cb, "s", nullptr, nullptr }, { "copy-match", window_action_copy_match_cb, "s", nullptr, nullptr }, + { "open-uri", window_action_open_uri_cb, "s", nullptr, nullptr }, { "paste", window_action_paste_cb, nullptr, nullptr, nullptr }, { "reset", window_action_reset_cb, "b", nullptr, nullptr }, { "find", window_action_find_cb, nullptr, nullptr, nullptr }, @@ -2614,8 +2645,10 @@ vteapp_window_constructed(GObject *object) gtk_widget_set_opacity (GTK_WIDGET (window), options.get_alpha()); /* Dingus */ - if (!options.no_builtin_dingus) + if (!options.no_builtin_dingus) { vteapp_window_add_dingus(window, builtin_dingus); + vte_terminal_match_add_builtins(window->terminal); + } if (options.dingus != nullptr) vteapp_window_add_dingus(window, options.dingus); @@ -22,6 +22,7 @@ namespace vte { namespace base { class Pty; +class RegexBuiltins; } // namespace base diff --git a/src/meson.build b/src/meson.build index 89f52985..cbef4dc9 100644 --- a/src/meson.build +++ b/src/meson.build @@ -130,7 +130,12 @@ refptr_sources = files( regex_sources = files( 'regex.cc', - 'regex.hh' + 'regex.hh', + 'regex-builtins.cc', + 'regex-builtins.hh', + 'regex-builtins-patterns.hh', + 'vteregex.cc', + 'vteregexinternal.hh', ) sixel_parser_sources = files( @@ -183,6 +188,7 @@ libvte_common_sources = config_sources + debug_sources + glib_glue_sources + gtk 'drawing-cairo.hh', 'fonts-pangocairo.cc', 'fonts-pangocairo.hh', + 'fwd.hh', 'gobject-glue.hh', 'keymap.cc', 'keymap.h', @@ -205,8 +211,6 @@ libvte_common_sources = config_sources + debug_sources + glib_glue_sources + gtk 'vtegtk.cc', 'vtegtk.hh', 'vteinternal.hh', - 'vteregex.cc', - 'vteregexinternal.hh', 'vterowdata.cc', 'vterowdata.hh', 'vteseq.cc', @@ -587,6 +591,24 @@ test_refptr = executable( install: false, ) +test_regex_sources = regex_sources + glib_glue_sources + files( + 'regex-test.cc', +) + +test_regex = executable( + 'test-regex', + sources: test_regex_sources, + dependencies: [glib_dep, gobject_dep, pcre2_dep,], + cpp_args: ['-DVTE_COMPILATION',], + include_directories: top_inc, + install: false, +) + +test_tabstops_sources = files( + 'tabstops-test.cc', + 'tabstops.hh' +) + if get_option('sixel') fuzz_sixel_sources = config_sources + files( 'sixel-fuzzer.cc', @@ -685,6 +707,7 @@ test_units = [ ['pastify', test_pastify], ['reaper', test_reaper], ['refptr', test_refptr], + ['regex', test_regex], ['stream', test_stream], ['tabstops', test_tabstops], ['utf8', test_utf8], diff --git a/src/regex-builtins-patterns.hh b/src/regex-builtins-patterns.hh new file mode 100644 index 00000000..3df945b8 --- /dev/null +++ b/src/regex-builtins-patterns.hh @@ -0,0 +1,156 @@ +/* + * Copyright © 2015 Egmont Koblinger + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library. If not, see <https://www.gnu.org/licenses/>. + */ + +/* + * Mini style-guide: + * + * #define'd fragments should preferably have an outermost group, for the + * exact same reason as why usually in C/C++ #define's the values are enclosed + * in parentheses: that is, so that you don't get surprised when you use the + * macro and append a quantifier. + * + * For repeated fragments prefer regex-style (?(DEFINE)(?<NAME>(...))) and use + * as (?&NAME), so that the regex string and the compiled regex object is + * smaller. + * + * Build small blocks, comment and unittest them heavily. + * + * Use free-spacing mode for improved readability. The hardest to read is + * which additional characters belong to a "(?" prefix. To improve + * readability, place a space after this, and for symmetry, before the closing + * parenthesis. Also place a space around "|" characters. No space before + * quantifiers. Try to be consistent with the existing style (yes I know the + * existing style is not consistent either, but please do your best). + * + * See http://www.rexegg.com/regex-disambiguation.html for all the "(?" + * syntaxes. + */ + +#pragma once + +/* Lookbehind to see if there's a preceding apostrophe. + * Unlike the other *_DEF macros which define regex subroutines, + * this one is a named capture that defines APOS_START to either + * an apostrophe or the empty string, depending on the character + * preceding this APOS_START_DEF construct. + */ +#define APOS_START_DEF "(?<APOS_START>(?<='))?" + +#define SCHEME "(?ix: news | telnet | nntp | https? | ftps? | sftp | webcal )" + +#define USERCHARS "-+.[:alnum:]" +/* Nonempty username, e.g. "john.smith" */ +#define USER "[" USERCHARS "]+" + +#define PASSCHARS_CLASS "[-[:alnum:]\\Q,?;.:/!%$^*&~\"#'\\E]" +/* Optional colon-prefixed password. I guess empty password should be allowed, right? E.g. ":secret", ":", "" */ +#define PASS "(?x: :" PASSCHARS_CLASS "* )?" + +/* Optional at-terminated username (with perhaps a password too), e.g. "joe@", "pete:secret@", "" */ +#define USERPASS "(?:" USER PASS "@)?" + +/* S4: IPv4 segment (number between 0 and 255) with lookahead at the end so that we don't match "25" in the string "256". + The lookahead could go to the last segment of IPv4 only but this construct allows nicer unittesting. */ +#define S4_DEF "(?(DEFINE)(?<S4>(?x: (?: [0-9] | [1-9][0-9] | 1[0-9]{2} | 2[0-4][0-9] | 25[0-5] ) (?! [0-9] ) )))" + +/* IPV4: Decimal IPv4, e.g. "1.2.3.4", with lookahead (implemented in S4) at the end so that we don't match "192.168.1.123" in the string "192.168.1.1234". */ +#define IPV4_DEF S4_DEF "(?(DEFINE)(?<IPV4>(?x: (?: (?&S4) \\. ){3} (?&S4) )))" + +/* IPv6, including embedded IPv4, e.g. "::1", "dead:beef::1.2.3.4". + * Lookahead for the next char not being a dot or digit, so it doesn't get stuck matching "dead:beef::1" in "dead:beef::1.2.3.4". + * This is not required since the surrounding brackets would trigger backtracking, but it allows nicer unittesting. + * TODO: more strict check (right number of colons, etc.) + * TODO: add zone_id: RFC 4007 section 11, RFC 6874 */ + +/* S6: IPv6 segment, S6C: IPv6 segment followed by a comma, CS6: comma followed by an IPv6 segment */ +#define S6_DEF "(?(DEFINE)(?<S6>[[:xdigit:]]{1,4})(?<CS6>:(?&S6))(?<S6C>(?&S6):))" + +/* No :: shorthand */ +#define IPV6_FULL "(?x: (?&S6C){7} (?&S6) )" +/* Begins with :: */ +#define IPV6_LEFT "(?x: : (?&CS6){1,7} )" +/* :: somewhere in the middle - use negative lookahead to make sure there aren't too many colons in total */ +#define IPV6_MID "(?x: (?! (?: [[:xdigit:]]*: ){8} ) (?&S6C){1,6} (?&CS6){1,6} )" +/* Ends with :: */ +#define IPV6_RIGHT "(?x: (?&S6C){1,7} : )" +/* Is "::" and nothing more */ +#define IPV6_NULL "(?x: :: )" + +/* The same ones for IPv4-embedded notation, without the actual IPv4 part */ +#define IPV6V4_FULL "(?x: (?&S6C){6} )" +#define IPV6V4_LEFT "(?x: :: (?&S6C){0,5} )" /* includes "::<ipv4>" */ +#define IPV6V4_MID "(?x: (?! (?: [[:xdigit:]]*: ){7} ) (?&S6C){1,4} (?&CS6){1,4} ) :" +#define IPV6V4_RIGHT "(?x: (?&S6C){1,5} : )" + +/* IPV6: An IPv6 address (possibly with an embedded IPv4). + * This macro defines both IPV4 and IPV6, since the latter one requires the former. */ +#define IP_DEF IPV4_DEF S6_DEF "(?(DEFINE)(?<IPV6>(?x: (?: " IPV6_NULL " | " IPV6_LEFT " | " IPV6_MID " | " IPV6_RIGHT " | " IPV6_FULL " | (?: " IPV6V4_FULL " | " IPV6V4_LEFT " | " IPV6V4_MID " | " IPV6V4_RIGHT " ) (?&IPV4) ) (?! [.:[:xdigit:]] ) )))" + +/* Either an alphanumeric character or dash; or if [negative lookahead] not ASCII + * then any graphical Unicode character. + * A segment can consist entirely of numbers. + * (Note: PCRE doesn't support character class subtraction/intersection.) */ +#define HOSTNAMESEGMENTCHARS_CLASS "(?x: [-[:alnum:]] | (?! [[:ascii:]] ) [[:graph:]] )" + +/* A hostname of at least 1 component. The last component cannot be entirely numbers. + * E.g. "foo", "example.com", "1234.com", but not "foo.123" */ +#define HOSTNAME1 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\. )* " HOSTNAMESEGMENTCHARS_CLASS "* (?! [0-9] ) " HOSTNAMESEGMENTCHARS_CLASS "+ )" + +/* A hostname of at least 2 components. The last component cannot be entirely numbers. + * E.g. "example.com", "1234.com", but not "1234.56" */ +#define HOSTNAME2 "(?x: (?: " HOSTNAMESEGMENTCHARS_CLASS "+ \\.)+ " HOSTNAME1 " )" + +/* For URL: Hostname, IPv4, or bracket-enclosed IPv6, e.g. "example.com", "1.2.3.4", "[::1]" */ +#define URL_HOST "(?x: " HOSTNAME1 " | (?&IPV4) | \\[ (?&IPV6) \\] )" + +/* For e-mail: Hostname of at least two segments, or bracket-enclosed IPv4 or IPv6, e.g. "example.com", "[1.2.3.4]", "[::1]". + * Technically an e-mail with a single-component hostname might be valid on a local network, but let's avoid tons of false positives (e.g. in a typical shell prompt). */ +#define EMAIL_HOST "(?x: " HOSTNAME2 " | \\[ (?: (?&IPV4) | (?&IPV6) ) \\] )" + +/* Number between 1 and 65535, with lookahead at the end so that we don't match "6789" in the string "67890", + and in turn we don't eventually match "http://host:6789" in "http://host:67890". */ +#define N_1_65535 "(?x: (?: [1-9][0-9]{0,3} | [1-5][0-9]{4} | 6[0-4][0-9]{3} | 65[0-4][0-9]{2} | 655[0-2][0-9] | 6553[0-5] ) (?! [0-9] ) )" + +/* Optional colon-prefixed port, e.g. ":1080", "" */ +#define PORT "(?x: \\:" N_1_65535 " )?" + +/* Omit the parentheses, see below */ +#define PATHCHARS_CLASS "[-[:alnum:]\\Q_$.+!*,:;@&=?/~#|%'\\E]" +/* Chars to end a URL. Apostrophe only allowed if there wasn't one in front of the URL, see bug 448044 */ +#define PATHTERM_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%'\\E]" +#define PATHTERM_NOAPOS_CLASS "[-[:alnum:]\\Q_$+*:@&=/~#|%\\E]" + +/* Recursive definition of PATH that allows parentheses and square brackets only if balanced, see bug 763980. */ +#define PATH_INNER_DEF "(?(DEFINE)(?<PATH_INNER>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* " PATHCHARS_CLASS "* )))" +/* Same as above, but the last character (if exists and is not a parenthesis) must be from PATHTERM_CLASS. */ +#define PATH_DEF "(?(DEFINE)(?<PATH>(?x: (?: " PATHCHARS_CLASS "* (?: \\( (?&PATH_INNER) \\) | \\[ (?&PATH_INNER) \\] ) )* (?: " PATHCHARS_CLASS "* (?(<APOS_START>)" PATHTERM_NOAPOS_CLASS "|" PATHTERM_CLASS ") )? )))" + +#define URLPATH "(?x: [/?#](?&PATH) )?" +#define VOIP_PATH "(?x: [;?](?&PATH) )?" + +/* Now let's put these fragments together */ + +#define DEFS APOS_START_DEF IP_DEF PATH_INNER_DEF PATH_DEF + +#define REGEX_URL_AS_IS DEFS SCHEME "://" USERPASS URL_HOST PORT URLPATH +/* TODO: also support file:/etc/passwd */ +#define REGEX_URL_FILE DEFS "(?ix: file:/ (?: / (?: " HOSTNAME1 " )? / )? (?! / ) )(?&PATH)" +/* Lookbehind so that we don't catch "abc.www.foo.bar", bug 739757. Lookahead for www/ftp for convenience (so that we can reuse HOSTNAME1). */ +#define REGEX_URL_HTTP DEFS "(?<!(?:" HOSTNAMESEGMENTCHARS_CLASS "|[.]))(?=(?i:www|ftp))" HOSTNAME1 PORT URLPATH +#define REGEX_URL_VOIP DEFS "(?i:h323:|sips?:)" USERPASS URL_HOST PORT VOIP_PATH +#define REGEX_EMAIL DEFS "(?i:mailto:)?" USER "@" EMAIL_HOST +#define REGEX_NEWS_MAN "(?i:news:|man:|info:)[-[:alnum:]\\Q^_{|}~!\"#$%&'()*+,./;:=?`\\E]+" diff --git a/src/regex-builtins.cc b/src/regex-builtins.cc new file mode 100644 index 00000000..f748ca1a --- /dev/null +++ b/src/regex-builtins.cc @@ -0,0 +1,103 @@ +/* + * Copyright © 2019 Christian Persch + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library. If not, see <https://www.gnu.org/licenses/>. + */ + +#include "config.h" + +#include <glib.h> + +#include "regex.hh" +#include "regex-builtins.hh" +#include "regex-builtins-patterns.hh" + +#include "pcre2-glue.hh" + +namespace vte::base { + +RegexBuiltins::RegexBuiltins() +{ + m_builtins.reserve(8); + + compile_builtin(REGEX_URL_AS_IS, InternalBuiltinsTag::eURL); + compile_builtin(REGEX_URL_HTTP, InternalBuiltinsTag::eHTTP); + compile_builtin(REGEX_URL_FILE, InternalBuiltinsTag::eFILE); + compile_builtin(REGEX_URL_VOIP, InternalBuiltinsTag::eVOIP); + compile_builtin(REGEX_EMAIL, InternalBuiltinsTag::eEMAIL); + compile_builtin(REGEX_NEWS_MAN, InternalBuiltinsTag::eNEWS_MAN); +} + +void +RegexBuiltins::compile_builtin(std::string_view const& pattern, + InternalBuiltinsTag tag) noexcept +{ + GError* error{nullptr}; + auto regex = Regex::compile(Regex::Purpose::eMatch, + pattern, + PCRE2_UTF | PCRE2_UCP | PCRE2_NO_UTF_CHECK | PCRE2_MULTILINE, + &error); + if (error) { + g_printerr("Failed to compile builtin regex %d: %s\n", int(tag), error->message); + g_error_free(error); + return; + } + + regex->jit(PCRE2_JIT_COMPLETE, &error); + if (error) { + g_printerr("Failed to complete JIT compile builtin regex %d: %s\n", int(tag), error->message); + g_clear_error(&error); + } + + regex->jit(PCRE2_JIT_PARTIAL_SOFT, &error); + if (error) { + g_printerr("Failed to partial-soft JIT compile builtin regex %d: %s\n", int(tag), error->message); + g_clear_error(&error); + } + + m_builtins.emplace_back(take_ref(regex), int(tag)); +} + +int +RegexBuiltins::transform_match(char*& match, + int tag) const noexcept +{ + switch (InternalBuiltinsTag(tag)) { + case InternalBuiltinsTag::eURL: + case InternalBuiltinsTag::eFILE: + case InternalBuiltinsTag::eNEWS_MAN: + case InternalBuiltinsTag::eVOIP: + /* No transformation */ + return int(BuiltinsTag::eURI); + + case InternalBuiltinsTag::eHTTP: { + auto v = match; + match = g_strdup_printf("http://%s", match); + g_free(v); + return int(BuiltinsTag::eURI); + } + + case InternalBuiltinsTag::eEMAIL: + if (g_ascii_strncasecmp ("mailto:", match, 7) != 0) { + auto v = match; + match = g_strdup_printf ("mailto:%s", match); + g_free(v); + } + return int(BuiltinsTag::eURI); + } + + return -1; +} + +} // namespace vte::base diff --git a/src/regex-builtins.hh b/src/regex-builtins.hh new file mode 100644 index 00000000..9d201949 --- /dev/null +++ b/src/regex-builtins.hh @@ -0,0 +1,80 @@ +/* + * Copyright © 2019 Christian Persch + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library. If not, see <https://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "regex.hh" +#include "refptr.hh" + +namespace vte { + +namespace base { + +class RegexBuiltins { +private: + static inline std::weak_ptr<RegexBuiltins> s_weak_ptr{}; + + std::vector<std::pair<RefPtr<Regex>, int>> m_builtins{}; + + enum class InternalBuiltinsTag : int { + eURL = -2, + eHTTP = -3, + eFILE = -4, + eVOIP = -5, + eEMAIL = -6, + eNEWS_MAN = -7 + }; + + void compile_builtin(std::string_view const& pattern, + InternalBuiltinsTag tag) noexcept; + +public: + // these must have the same values as the public VteBuiltinMatchTag + enum class BuiltinsTag : int { + eURI = -2 + }; + + RegexBuiltins(); + ~RegexBuiltins() { } + RegexBuiltins(RegexBuiltins const&) = delete; + RegexBuiltins(RegexBuiltins&&) = delete; + + RegexBuiltins& operator= (RegexBuiltins const&) = delete; + RegexBuiltins& operator= (RegexBuiltins&&) = delete; + + inline constexpr auto const& builtins() const noexcept { return m_builtins; } + + int transform_match(char*& match, + int tag) const noexcept; + + static std::shared_ptr<RegexBuiltins> get() + { + auto inst = s_weak_ptr.lock(); + if (!inst) + s_weak_ptr = inst = std::make_shared<RegexBuiltins>(); + return inst; + } +}; + +} // namespace base + +} // namespace vte diff --git a/src/regex-test.cc b/src/regex-test.cc new file mode 100644 index 00000000..25195072 --- /dev/null +++ b/src/regex-test.cc @@ -0,0 +1,668 @@ +/* + * Copyright © 2015 Egmont Koblinger + * Copyright © 2019, 2020 Christian Persch + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library. If not, see <https://www.gnu.org/licenses/>. + */ + +#include "config.h" + +#include <glib.h> +#include <locale.h> + +#include <cstdint> +#include <cstdlib> + +#include <string> + +#include "glib-glue.hh" +#include "regex.hh" +#include "regex-builtins-patterns.hh" + +using namespace std::literals; + +auto pcre2test = bool{false}; +auto pcre2_atleast_10_35 = bool{false}; +FILE* pcre2test_in{nullptr}; +FILE* pcre2test_out{nullptr}; + +/* Shorthand for expecting the pattern to match the entire input string */ +#define ENTIRE ((char *) 1) + +static pcre2_match_context_8* +create_match_context() +{ + pcre2_match_context_8 *match_context; + + match_context = pcre2_match_context_create_8(nullptr /* general context */); + pcre2_set_match_limit_8(match_context, 65536); /* should be plenty */ + pcre2_set_recursion_limit_8(match_context, 64); /* should be plenty */ + + return match_context; +} + +static char* +get_match(decltype(&pcre2_match_8) match_fn, + vte::base::Regex const* regex, + uint32_t match_flags, + char const* subject) +{ + auto match_context = create_match_context(); + auto match_data = pcre2_match_data_create_8(256 /* should be plenty */, + nullptr /* general context */); + + auto r = match_fn(regex->code(), + (PCRE2_SPTR8)subject, + strlen(subject), + 0, /* start offset */ + match_flags | + PCRE2_NO_UTF_CHECK, + match_data, + match_context); + + char* match; + if (r == PCRE2_ERROR_NOMATCH) { + match = nullptr; + } else if (r < 0) { + /* Error */ + PCRE2_UCHAR8 buf[256]; + auto n = pcre2_get_error_message_8(r, buf, sizeof(buf)); + g_assert_true(n >= 0); + g_printerr("PCRE2 error %d: %s\n", r, buf); + + match = nullptr; + } else { + /* has match */ + auto const* ovector = pcre2_get_ovector_pointer_8(match_data); + auto const so = ovector[0]; + auto const eo = ovector[1]; + if (so == PCRE2_UNSET || eo == PCRE2_UNSET) + match = nullptr; + else + match = g_strndup(subject + so, eo - so); + } + + pcre2_match_data_free_8(match_data); + pcre2_match_context_free_8(match_context); + + return match; +} + +struct TestData { + char const* pattern; + char const* string; + char const* expected; + uint32_t match_flags; +}; + +static std::string +escape_slash(std::string str) +{ + auto escaped = std::string{}; + for (auto const c : str) { + if (c == '/') + escaped.append("\\/"); + else if (c == '\\') + escaped.append("\\\\"); + else + escaped.push_back(c); + } + + return escaped; +} + +static std::string +flags_to_string(uint32_t flags) +{ + auto str = std::string{}; + + if (flags & PCRE2_ANCHORED) + str.append("anchored,"); + + return str; +} + +static void +print_testdata(TestData* data, + int line) +{ + auto patstr = escape_slash(data->pattern); + auto flagstr = flags_to_string(data->match_flags); + + fprintf(pcre2test_in, + "# Line: %d\n" + "/%s/%s\n" + " %s\\=\n" + "\n", + line, + patstr.c_str(), flagstr.c_str(), + data->string); + fprintf(pcre2test_out, + "# Line: %d\n" + "/%s/%s\n" + " %s\\=\n" + "%s%s\n" + "\n", + line, + patstr.c_str(), flagstr.c_str(), + data->string, + data->expected ? " 0: " : "No match", + data->expected ? data->expected : ""); +} + +static void +assert_match_test(void const* ptr) +{ + auto data = reinterpret_cast<TestData const*>(ptr); + + auto error = vte::glib::Error{}; + auto regex = vte::base::Regex::compile(vte::base::Regex::Purpose::eMatch, + data->pattern, + PCRE2_UTF | PCRE2_NO_UTF_CHECK | + PCRE2_UCP | + PCRE2_MULTILINE | + /* Pass match_flags here as compile flags, since + * otherwise some JITed regex tests fail because + * ANCHORED is ignored when passed to + * pcre2_jit_match_8. + */ + data->match_flags, + error); + error.assert_no_error(); + g_assert_nonnull(regex); + + auto match = get_match(&pcre2_match_8, regex, data->match_flags, data->string); + + g_assert_cmpstr(match, ==, data->expected); + g_free(match); + + if (vte::base::Regex::check_pcre_config_jit()) { + regex->jit(PCRE2_JIT_COMPLETE, error); + error.assert_no_error(); + regex->jit(PCRE2_JIT_PARTIAL_SOFT, error); + error.assert_no_error(); + regex->jit(PCRE2_JIT_PARTIAL_HARD, error); + error.assert_no_error(); + + match = get_match(&pcre2_jit_match_8, regex, data->match_flags, data->string); + g_assert_cmpstr(match, ==, data->expected); + g_free(match); + } + + regex->unref(); +} + +static void +assert_match(char const* pattern, + char const* string, + char const* expected, + uint32_t match_flags = 0u, + int line = __builtin_LINE()) +{ + auto data = g_new(TestData, 1); + data->pattern = pattern; + data->string = string; + data->expected = expected == ENTIRE ? string : expected; + data->match_flags = match_flags; + + auto path = g_strdup_printf("/vte/regex/builtins/%d", line); + g_test_add_data_func_full(path, data, assert_match_test, (GDestroyNotify)g_free); + g_free(path); + + if (pcre2test) + print_testdata(data, line); +} + +static void +assert_match_anchored(char const* pattern, + char const* string, + char const* expected, + int line = __builtin_LINE()) +{ + assert_match(pattern, string, expected, PCRE2_ANCHORED, line); +} + +static void +setup_regex_builtins_tests(void) +{ + /* SCHEME is case insensitive */ + assert_match_anchored (SCHEME, "http", ENTIRE); + assert_match_anchored (SCHEME, "HTTPS", ENTIRE); + + /* USER is nonempty, alphanumeric, dot, plus and dash */ + assert_match_anchored (USER, "", nullptr); + assert_match_anchored (USER, "dr.john-smith", ENTIRE); + assert_match_anchored (USER, "abc+def@ghi", "abc+def"); + + /* PASS is optional colon-prefixed value, allowing quite some characters, but definitely not @ */ + assert_match_anchored (PASS, "", ENTIRE); + assert_match_anchored (PASS, "nocolon", ""); + assert_match_anchored (PASS, ":s3cr3T", ENTIRE); + assert_match_anchored (PASS, ":$?#@host", ":$?#"); + + /* Hostname of at least 1 component, containing at least one non-digit in at least one of the segments */ + assert_match_anchored (HOSTNAME1, "example.com", ENTIRE); + assert_match_anchored (HOSTNAME1, "a-b.c-d", ENTIRE); + assert_match_anchored (HOSTNAME1, "a_b", "a"); /* TODO: can/should we totally abort here? */ + assert_match_anchored (HOSTNAME1, "déjà-vu.com", ENTIRE); + assert_match_anchored (HOSTNAME1, "➡.ws", ENTIRE); + assert_match_anchored (HOSTNAME1, "cömbining-áccents", ENTIRE); + assert_match_anchored (HOSTNAME1, "12", nullptr); + assert_match_anchored (HOSTNAME1, "12.34", nullptr); + assert_match_anchored (HOSTNAME1, "12.ab", ENTIRE); + if (pcre2test) // unexplained failure + assert_match_anchored (HOSTNAME1, "ab.12", nullptr); /* errr... could we fail here?? */ + + /* Hostname of at least 2 components, containing at least one non-digit in at least one of the segments */ + assert_match_anchored (HOSTNAME2, "example.com", ENTIRE); + assert_match_anchored (HOSTNAME2, "example", nullptr); + assert_match_anchored (HOSTNAME2, "12", nullptr); + assert_match_anchored (HOSTNAME2, "12.34", nullptr); + assert_match_anchored (HOSTNAME2, "12.ab", ENTIRE); + assert_match_anchored (HOSTNAME2, "ab.12", nullptr); + if (pcre2test) // unexplained failure + assert_match_anchored (HOSTNAME2, "ab.cd.12", nullptr); /* errr... could we fail here?? */ + + /* IPv4 segment (number between 0 and 255) */ + assert_match_anchored (DEFS "(?&S4)", "0", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "1", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "9", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "10", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "99", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "100", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "200", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "250", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "255", ENTIRE); + assert_match_anchored (DEFS "(?&S4)", "256", nullptr); + assert_match_anchored (DEFS "(?&S4)", "260", nullptr); + assert_match_anchored (DEFS "(?&S4)", "300", nullptr); + assert_match_anchored (DEFS "(?&S4)", "1000", nullptr); + assert_match_anchored (DEFS "(?&S4)", "", nullptr); + assert_match_anchored (DEFS "(?&S4)", "a1b", nullptr); + + /* IPv4 addresses */ + assert_match_anchored (DEFS "(?&IPV4)", "11.22.33.44", ENTIRE); + assert_match_anchored (DEFS "(?&IPV4)", "0.1.254.255", ENTIRE); + assert_match_anchored (DEFS "(?&IPV4)", "75.150.225.300", nullptr); + assert_match_anchored (DEFS "(?&IPV4)", "1.2.3.4.5", "1.2.3.4"); /* we could also bail out and not match at all */ + + /* IPv6 addresses */ + assert_match_anchored (DEFS "(?&IPV6)", "11:::22", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:44::55:66", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "dead::beef", ENTIRE); + assert_match_anchored (DEFS "(?&IPV6)", "faded::bee", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "live::pork", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "::1", ENTIRE); + assert_match_anchored (DEFS "(?&IPV6)", "11::22:33::44", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:::33", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "dead:beef::192.168.1.1", ENTIRE); + assert_match_anchored (DEFS "(?&IPV6)", "192.168.1.1", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:87654", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22::33:45678", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.12345", nullptr); + + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77", nullptr); /* no :: */ + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88", ENTIRE); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88:99", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77", ENTIRE); /* :: at the start */ + assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:77:88", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77", ENTIRE); /* :: in the middle */ + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:77:88", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77::", ENTIRE); /* :: at the end */ + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:88::", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "::", ENTIRE); /* :: only */ + + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:192.168.1.1", nullptr); /* no :: */ + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:192.168.1.1", ENTIRE); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66:77:192.168.1.1", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:192.168.1.1", ENTIRE); /* :: at the start */ + assert_match_anchored (DEFS "(?&IPV6)", "::11:22:33:44:55:66:192.168.1.1", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:192.168.1.1", ENTIRE); /* :: in the imddle */ + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33::44:55:66:192.168.1.1", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55::192.168.1.1", ENTIRE); /* :: at the end(ish) */ + assert_match_anchored (DEFS "(?&IPV6)", "11:22:33:44:55:66::192.168.1.1", nullptr); + assert_match_anchored (DEFS "(?&IPV6)", "::192.168.1.1", ENTIRE); /* :: only(ish) */ + + /* URL_HOST is either a hostname, or an IPv4 address, or a bracket-enclosed IPv6 address */ + assert_match_anchored (DEFS URL_HOST, "example", ENTIRE); + assert_match_anchored (DEFS URL_HOST, "example.com", ENTIRE); + assert_match_anchored (DEFS URL_HOST, "11.22.33.44", ENTIRE); + assert_match_anchored (DEFS URL_HOST, "[11.22.33.44]", nullptr); + assert_match_anchored (DEFS URL_HOST, "dead::be:ef", "dead"); /* TODO: can/should we totally abort here? */ + assert_match_anchored (DEFS URL_HOST, "[dead::be:ef]", ENTIRE); + + /* EMAIL_HOST is either an at least two-component hostname, or a bracket-enclosed IPv[46] address */ + assert_match_anchored (DEFS EMAIL_HOST, "example", nullptr); + assert_match_anchored (DEFS EMAIL_HOST, "example.com", ENTIRE); + assert_match_anchored (DEFS EMAIL_HOST, "11.22.33.44", nullptr); + assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.44]", ENTIRE); + assert_match_anchored (DEFS EMAIL_HOST, "[11.22.33.456]", nullptr); + assert_match_anchored (DEFS EMAIL_HOST, "dead::be:ef", nullptr); + assert_match_anchored (DEFS EMAIL_HOST, "[dead::be:ef]", ENTIRE); + + /* Number between 1 and 65535 (helper for port) */ + assert_match_anchored (N_1_65535, "0", nullptr); + assert_match_anchored (N_1_65535, "1", ENTIRE); + assert_match_anchored (N_1_65535, "10", ENTIRE); + assert_match_anchored (N_1_65535, "100", ENTIRE); + assert_match_anchored (N_1_65535, "1000", ENTIRE); + assert_match_anchored (N_1_65535, "10000", ENTIRE); + assert_match_anchored (N_1_65535, "60000", ENTIRE); + assert_match_anchored (N_1_65535, "65000", ENTIRE); + assert_match_anchored (N_1_65535, "65500", ENTIRE); + assert_match_anchored (N_1_65535, "65530", ENTIRE); + assert_match_anchored (N_1_65535, "65535", ENTIRE); + assert_match_anchored (N_1_65535, "65536", nullptr); + assert_match_anchored (N_1_65535, "65540", nullptr); + assert_match_anchored (N_1_65535, "65600", nullptr); + assert_match_anchored (N_1_65535, "66000", nullptr); + assert_match_anchored (N_1_65535, "70000", nullptr); + assert_match_anchored (N_1_65535, "100000", nullptr); + assert_match_anchored (N_1_65535, "", nullptr); + assert_match_anchored (N_1_65535, "a1b", nullptr); + + /* PORT is an optional colon-prefixed value */ + assert_match_anchored (PORT, "", ENTIRE); + assert_match_anchored (PORT, ":1", ENTIRE); + assert_match_anchored (PORT, ":65535", ENTIRE); + assert_match_anchored (PORT, ":65536", ""); /* TODO: can/should we totally abort here? */ + + /* Parentheses are only allowed in matching pairs, see bug 763980. */ + /* TODO: add tests for PATHCHARS and PATHNONTERM; and/or URLPATH */ + assert_match_anchored (DEFS URLPATH, "/ab/cd", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/ab/cd.html.", "/ab/cd.html"); + assert_match_anchored (DEFS URLPATH, "/The_Offspring_(album)", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/The_Offspring)", "/The_Offspring"); + assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f))", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/a((b(c)d)e(f)))", "/a((b(c)d)e(f))"); + assert_match_anchored (DEFS URLPATH, "/a(b).(c).", "/a(b).(c)"); + assert_match_anchored (DEFS URLPATH, "/a.(b.(c.).).(d.(e.).).)", "/a.(b.(c.).).(d.(e.).)"); + assert_match_anchored (DEFS URLPATH, "/a)b(c", "/a"); + assert_match_anchored (DEFS URLPATH, "/.", "/"); + assert_match_anchored (DEFS URLPATH, "/(.", "/"); + assert_match_anchored (DEFS URLPATH, "/).", "/"); + assert_match_anchored (DEFS URLPATH, "/().", "/()"); + assert_match_anchored (DEFS URLPATH, "/", ENTIRE); + assert_match_anchored (DEFS URLPATH, "", ENTIRE); + assert_match_anchored (DEFS URLPATH, "?", ENTIRE); + assert_match_anchored (DEFS URLPATH, "?param=value", ENTIRE); + assert_match_anchored (DEFS URLPATH, "#", ENTIRE); + assert_match_anchored (DEFS URLPATH, "#anchor", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/php?param[]=value1¶m[]=value2", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/foo?param1[index1]=value1¶m2[index2]=value2", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/[[[]][]]", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/[([])]([()])", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/([()])[([])]", ENTIRE); + assert_match_anchored (DEFS URLPATH, "/[(])", "/"); + assert_match_anchored (DEFS URLPATH, "/([)]", "/"); + + + /* Put the components together and test the big picture */ + + assert_match (REGEX_URL_AS_IS, "There's no URL here http:/foo", nullptr); + assert_match (REGEX_URL_AS_IS, "Visit http://example.com for details", "http://example.com"); + assert_match (REGEX_URL_AS_IS, "Trailing dot http://foo/bar.html.", "http://foo/bar.html"); + assert_match (REGEX_URL_AS_IS, "Trailing ellipsis http://foo/bar.html...", "http://foo/bar.html"); + assert_match (REGEX_URL_AS_IS, "Trailing comma http://foo/bar,baz,", "http://foo/bar,baz"); + assert_match (REGEX_URL_AS_IS, "Trailing semicolon http://foo/bar;baz;", "http://foo/bar;baz"); + assert_match (REGEX_URL_AS_IS, "See <http://foo/bar>", "http://foo/bar"); + assert_match (REGEX_URL_AS_IS, "<http://foo.bar/asdf.qwer.html>", "http://foo.bar/asdf.qwer.html"); + assert_match (REGEX_URL_AS_IS, "Go to http://192.168.1.1.", "http://192.168.1.1"); + assert_match (REGEX_URL_AS_IS, "If not, see <http://www.gnu.org/licenses/>.", "http://www.gnu.org/licenses/"); + assert_match (REGEX_URL_AS_IS, "<a href=\"http://foo/bar\">foo</a>", "http://foo/bar"); + assert_match (REGEX_URL_AS_IS, "<a href='http://foo/bar'>foo</a>", "http://foo/bar"); + assert_match (REGEX_URL_AS_IS, "<url>http://foo/bar</url>", "http://foo/bar"); + + assert_match (REGEX_URL_AS_IS, "http://", nullptr); + assert_match (REGEX_URL_AS_IS, "http://a", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://aa.", "http://aa"); + assert_match (REGEX_URL_AS_IS, "http://aa.b", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://aa.bb", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://aa.bb/c", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://aa.bb/cc/", ENTIRE); + + assert_match (REGEX_URL_AS_IS, "HtTp://déjà-vu.com:10000/déjà/vu", ENTIRE); + assert_match (REGEX_URL_AS_IS, "HTTP://joe:sEcReT@➡.ws:1080", ENTIRE); + assert_match (REGEX_URL_AS_IS, "https://cömbining-áccents", ENTIRE); + + assert_match (REGEX_URL_AS_IS, "http://111.222.33.44", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://111.222.33.44/foo", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:5555/xyz", ENTIRE); + assert_match (REGEX_URL_AS_IS, "https://[dead::beef]:12345/ipv6", ENTIRE); + assert_match (REGEX_URL_AS_IS, "https://[dead::beef:11.22.33.44]", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:", "http://1.2.3.4"); /* TODO: can/should we totally abort here? */ + assert_match (REGEX_URL_AS_IS, "https://dead::beef/no-brackets-ipv6", "https://dead"); /* ditto */ + assert_match (REGEX_URL_AS_IS, "http://111.222.333.444/", nullptr); + assert_match (REGEX_URL_AS_IS, "http://1.2.3.4:70000", "http://1.2.3.4"); /* TODO: can/should we totally abort here? */ + assert_match (REGEX_URL_AS_IS, "http://[dead::beef:111.222.333.444]", nullptr); + + /* '?' or '#' without '/', GNOME/gnome-terminal#7888 */ + assert_match (REGEX_URL_AS_IS, "http://foo.bar?", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://foo.bar?param=value", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://foo.bar:12345?param=value", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://1.2.3.4?param=value", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://[dead::beef]?param=value", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://foo.bar#", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://foo.bar#anchor", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://foo.bar:12345#anchor", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://1.2.3.4#anchor", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://[dead::beef]#anchor", ENTIRE); + + /* Username, password */ + assert_match (REGEX_URL_AS_IS, "http://joe@example.com", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://user.name:sec.ret@host.name", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://joe:secret@[::1]", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://dudewithnopassword:@example.com", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://safeguy:!#$%^&*@host", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http://invalidusername!@host", "http://invalidusername"); + + assert_match (REGEX_URL_AS_IS, "http://ab.cd/ef?g=h&i=j|k=l#m=n:o=p", ENTIRE); + assert_match (REGEX_URL_AS_IS, "http:///foo", nullptr); + + /* Parentheses are only allowed in matching pairs, see bug 763980. */ + assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/The_Offspring_(album)", ENTIRE); + assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring)", "https://en.wikipedia.org/wiki/The_Offspring"); + assert_match (REGEX_URL_AS_IS, "[markdown](https://en.wikipedia.org/wiki/The_Offspring_(album))", "https://en.wikipedia.org/wiki/The_Offspring_(album)"); + assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/(a(b)c)d)e)f", "http://foo.bar/(a(b)c)d"); + assert_match (REGEX_URL_AS_IS, "[markdown](http://foo.bar/a)b(c", "http://foo.bar/a"); + + /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see bug 448044. */ + assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE); + assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">", "https://en.wikipedia.org/wiki/Moore's_law"); + assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE); + assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">", "https://en.wikipedia.org/wiki/Cryin'"); + assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>", "https://en.wikipedia.org/wiki/Aerosmith"); + + /* Apostrophes are allowed, except at trailing position if the URL is preceded by an apostrophe, see issue GNOME/gnome-terminal#5921 */ + assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Moore's_law", ENTIRE); + assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Moore's_law\">", "https://en.wikipedia.org/wiki/Moore's_law"); + assert_match (REGEX_URL_AS_IS, "https://en.wikipedia.org/wiki/Cryin'", ENTIRE); + assert_match (REGEX_URL_AS_IS, "<a href=\"https://en.wikipedia.org/wiki/Cryin'\">", "https://en.wikipedia.org/wiki/Cryin'"); + assert_match (REGEX_URL_AS_IS, "<a href='https://en.wikipedia.org/wiki/Aerosmith'>", "https://en.wikipedia.org/wiki/Aerosmith"); + + /* No scheme */ + /* These need PCRE2 10.35 to succeed; see issue GNOME/gnome-terminal#221 */ + if (pcre2_atleast_10_35 || pcre2test) { + assert_match (REGEX_URL_HTTP, "www.foo.bar/baz", ENTIRE); + assert_match (REGEX_URL_HTTP, "WWW3.foo.bar/baz", ENTIRE); + assert_match (REGEX_URL_HTTP, "FTP.FOO.BAR/BAZ", ENTIRE); /* FIXME if no scheme is given and url starts with ftp, can we make the protocol ftp instead of http? */ + assert_match (REGEX_URL_HTTP, "ftpxy.foo.bar/baz", ENTIRE); + if (pcre2test) // unexplained failure + assert_match (REGEX_URL_HTTP, "ftp.123/baz", nullptr); /* errr... could we fail here?? */ + } + assert_match (REGEX_URL_HTTP, "foo.bar/baz", nullptr); + assert_match (REGEX_URL_HTTP, "abc.www.foo.bar/baz", nullptr); + assert_match (REGEX_URL_HTTP, "uvwww.foo.bar/baz", nullptr); + assert_match (REGEX_URL_HTTP, "xftp.foo.bar/baz", nullptr); + + /* file:/ or file://(hostname)?/ */ + assert_match (REGEX_URL_FILE, "file:", nullptr); + assert_match (REGEX_URL_FILE, "file:/", ENTIRE); + assert_match (REGEX_URL_FILE, "file://", nullptr); + assert_match (REGEX_URL_FILE, "file:///", ENTIRE); + assert_match (REGEX_URL_FILE, "file:////", nullptr); + assert_match (REGEX_URL_FILE, "file:etc/passwd", nullptr); + assert_match (REGEX_URL_FILE, "File:/etc/passwd", ENTIRE); + assert_match (REGEX_URL_FILE, "FILE:///etc/passwd", ENTIRE); + assert_match (REGEX_URL_FILE, "file:////etc/passwd", nullptr); + assert_match (REGEX_URL_FILE, "file://host.name", nullptr); + assert_match (REGEX_URL_FILE, "file://host.name/", ENTIRE); + assert_match (REGEX_URL_FILE, "file://host.name/etc", ENTIRE); + + assert_match (REGEX_URL_FILE, "See file:/.", "file:/"); + assert_match (REGEX_URL_FILE, "See file:///.", "file:///"); + assert_match (REGEX_URL_FILE, "See file:/lost+found.", "file:/lost+found"); + assert_match (REGEX_URL_FILE, "See file:///lost+found.", "file:///lost+found"); + + /* Email */ + assert_match (REGEX_EMAIL, "Write to foo@bar.com.", "foo@bar.com"); + assert_match (REGEX_EMAIL, "Write to <foo@bar.com>", "foo@bar.com"); + assert_match (REGEX_EMAIL, "Write to mailto:foo@bar.com.", "mailto:foo@bar.com"); + assert_match (REGEX_EMAIL, "Write to MAILTO:FOO@BAR.COM.", "MAILTO:FOO@BAR.COM"); + assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.4]", "foo@[1.2.3.4]"); + assert_match (REGEX_EMAIL, "Write to foo@[1.2.3.456]", nullptr); + assert_match (REGEX_EMAIL, "Write to foo@[1::2345]", "foo@[1::2345]"); + assert_match (REGEX_EMAIL, "Write to foo@[dead::beef]", "foo@[dead::beef]"); + assert_match (REGEX_EMAIL, "Write to foo@1.2.3.4", nullptr); + assert_match (REGEX_EMAIL, "Write to foo@1.2.3.456", nullptr); + assert_match (REGEX_EMAIL, "Write to foo@1::2345", nullptr); + assert_match (REGEX_EMAIL, "Write to foo@dead::beef", nullptr); + assert_match (REGEX_EMAIL, "<baz email=\"foo@bar.com\"/>", "foo@bar.com"); + assert_match (REGEX_EMAIL, "<baz email='foo@bar.com'/>", "foo@bar.com"); + assert_match (REGEX_EMAIL, "<email>foo@bar.com</email>", "foo@bar.com"); + + /* Sip, examples from rfc 3261 */ + assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15", ENTIRE); + assert_match (REGEX_URL_VOIP, "sip:alice@atlanta.com", ENTIRE); + assert_match (REGEX_URL_VOIP, "sip:alice:secretword@atlanta.com;transport=tcp", ENTIRE); + assert_match (REGEX_URL_VOIP, "sips:alice@atlanta.com?subject=project%20x&priority=urgent", ENTIRE); + assert_match (REGEX_URL_VOIP, "sip:+1-212-555-1212:1234@gateway.com;user=phone", ENTIRE); + assert_match (REGEX_URL_VOIP, "sips:1212@gateway.com", ENTIRE); + assert_match (REGEX_URL_VOIP, "sip:alice@192.0.2.4", ENTIRE); + assert_match (REGEX_URL_VOIP, "sip:atlanta.com;method=REGISTER?to=alice%40atlanta.com", ENTIRE); + assert_match (REGEX_URL_VOIP, "SIP:alice;day=tuesday@atlanta.com", ENTIRE); + assert_match (REGEX_URL_VOIP, "Dial sip:alice@192.0.2.4.", "sip:alice@192.0.2.4"); + + /* Extremely long match, bug 770147 */ + assert_match (REGEX_URL_AS_IS, "http://www.example.com/ThisPathConsistsOfMoreThan1024Characters" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890", ENTIRE); +} + +static void +test_regex_unicode(void) +{ + auto error = vte::glib::Error{}; + g_assert_true(vte::base::Regex::check_pcre_config_unicode(error)); + error.assert_no_error(); +} + +static bool +parse_args(char*** argv, + int* argc, + GError** error) +{ + char* _pcre2test_filename{nullptr}; + GOptionEntry const entries[] = { + { "pcre2test", 0, 0, G_OPTION_ARG_FILENAME, &_pcre2test_filename, + "Print input and output of tests in pcre2test format to file", "FILENAME" }, + { nullptr } + }; + + auto context = g_option_context_new(nullptr); + g_option_context_set_help_enabled(context, false); + g_option_context_set_ignore_unknown_options(context, true); + g_option_context_add_main_entries(context, entries, nullptr); + + bool rv = g_option_context_parse(context, argc, argv, error); + g_option_context_free(context); + + pcre2test = _pcre2test_filename != nullptr; + if (rv && pcre2test) { + auto pcre2test_in_filename = std::string{_pcre2test_filename} + ".in"s; + auto pcre2test_out_filename = std::string{_pcre2test_filename} + ".out"s; + g_free(_pcre2test_filename); + _pcre2test_filename = nullptr; + + pcre2test_in = fopen(pcre2test_in_filename.c_str(), "wbe"); + if (pcre2test_in == nullptr) { + auto errsv = int{errno}; + g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_FAILED, + "Failed to open pcre2test input file: %s", + g_strerror(errsv)); + return false; + } + pcre2test_out = fopen(pcre2test_out_filename.c_str(), "wbe"); + if (pcre2test_out == nullptr) { + auto errsv = int{errno}; + g_set_error(error, G_OPTION_ERROR, G_OPTION_ERROR_FAILED, + "Failed to open pcre2test output file: %s", + g_strerror(errsv)); + fclose(pcre2test_in); + pcre2test_in = nullptr; + return false; + } + } + + return rv; +} + +int +main(int argc, + char* argv[]) +{ + setlocale(LC_ALL, ""); + + g_test_init(&argc, &argv, nullptr); + + auto err = vte::glib::Error{}; + if (!parse_args(&argv, &argc, err)) { + g_printerr("Failed to parse arguments: %s\n", err.message()); + return EXIT_FAILURE; + } + + auto version = vte::base::Regex::get_pcre_version(); + pcre2_atleast_10_35 = strverscmp(version.c_str(), "10.35") > 0; + + if (pcre2test) { + fprintf(pcre2test_in, "#pattern multiline,ucp,utf,no_utf_check\n\n"); + fprintf(pcre2test_out, "#pattern multiline,ucp,utf,no_utf_check\n\n"); + } + + /* Build test suites */ + + g_test_add_func("/vte/regex/unicode", test_regex_unicode); + + setup_regex_builtins_tests(); + + /* Run tests */ + + if (pcre2test) { + fclose(pcre2test_in); + fclose(pcre2test_out); + return EXIT_SUCCESS; + } + + return g_test_run(); +} diff --git a/src/regex.cc b/src/regex.cc index 20684f1c..914fa6a6 100644 --- a/src/regex.cc +++ b/src/regex.cc @@ -52,6 +52,17 @@ Regex::unref() noexcept delete this; } +std::string +Regex::get_pcre_version() +{ + auto v = std::string{}; + auto r = pcre2_config_8(PCRE2_CONFIG_VERSION, nullptr); + v.resize(r); + r = pcre2_config_8(PCRE2_CONFIG_VERSION, v.data()); + + return v; +} + bool Regex::check_pcre_config_unicode(GError** error) { diff --git a/src/regex.hh b/src/regex.hh index db5b70ad..b98be2c0 100644 --- a/src/regex.hh +++ b/src/regex.hh @@ -38,6 +38,7 @@ public: eSearch, }; + static std::string get_pcre_version(); static bool check_pcre_config_unicode(GError** error); static bool check_pcre_config_jit(void); static Regex* compile(Purpose purpose, @@ -73,6 +73,8 @@ #include "cxx-utils.hh" #include "gobject-glue.hh" +#include "regex-builtins.hh" + #ifdef WITH_A11Y #if VTE_GTK == 3 #include "vteaccess.h" @@ -81,6 +83,7 @@ #endif /* VTE_GTK == 3 */ #endif /* WITH_A11Y */ +#include <algorithm> #include <new> /* placement new */ using namespace std::literals; @@ -1107,6 +1110,28 @@ Terminal::regex_match_remove(int tag) noexcept match_regexes_writable().erase(i); } +void +Terminal::regex_match_add_builtins() noexcept +{ + auto& match_regexes = match_regexes_writable(); + if (!m_match_regex_builtins) + m_match_regex_builtins = vte::base::RegexBuiltins::get(); + for (auto const& [regex, tag] : m_match_regex_builtins->builtins()) { + match_regexes.emplace_back(make_ref(regex.get()), + 0 /* match flags */, + VTE_MATCH_BUILTINS_CURSOR, + tag); + } +} + +void +Terminal::regex_match_remove_builtins() noexcept +{ + auto& match_regexes = match_regexes_writable(); + std::remove_if(std::begin(match_regexes), std::end(match_regexes), + [](MatchRegex const& rem) { return rem.tag() < 0; }); +} + /* * match_rowcol_to_offset: * @terminal: @@ -1480,7 +1505,7 @@ Terminal::match_check_internal(vte::grid::column_t column, char* Terminal::regex_match_check(vte::grid::column_t column, vte::grid::row_t row, - int* tag) + int* tag_ptr) { /* Need to ensure the ringview is updated. */ ringview_update(); @@ -1506,8 +1531,16 @@ Terminal::regex_match_check(vte::grid::column_t column, _VTE_DEBUG_IF(VTE_DEBUG_EVENTS | VTE_DEBUG_REGEX) { if (ret != NULL) g_printerr("Matched `%s'.\n", ret); } - if (tag != nullptr) - *tag = (match != nullptr) ? match->tag() : -1; + + int tag = -1; + if (match != nullptr) { + tag = match->tag(); + if (tag < -1 && m_match_regex_builtins) + tag = m_match_regex_builtins->transform_match(ret, tag); + } + + if (tag_ptr != nullptr) + *tag_ptr = tag; return ret; } diff --git a/src/vte/vteenums.h b/src/vte/vteenums.h index 4082fe88..0eb77673 100644 --- a/src/vte/vteenums.h +++ b/src/vte/vteenums.h @@ -215,4 +215,18 @@ typedef enum { VTE_ALIGN_END = 3U, } VteAlign; +/* + * VteBuiltinMatchTag: + * @VTE_BUILTIN_MATCH_TAG_URI: the match is an URI as recognised by + * the expressions added with vte_terminal_match_add_uris() + * + * An enumeration that will be returned from vte_terminal_match_check_event() + * if a builtin expression matched. + * + * Since: 0.70 + */ +typedef enum { + VTE_BUILTIN_MATCH_TAG_URI = -2 +} VteBuiltinMatchTag; + G_END_DECLS diff --git a/src/vte/vtemacros.h b/src/vte/vtemacros.h index c1300998..873f12ad 100644 --- a/src/vte/vtemacros.h +++ b/src/vte/vtemacros.h @@ -21,6 +21,10 @@ #error "Only <vte/vte.h> can be included directly." #endif +#ifdef VTE_COMPILATION +#define _VTE_GTK VTE_GTK +#else + #include <gtk/gtk.h> #if GTK_CHECK_VERSION(4,0,0) @@ -33,6 +37,8 @@ #error gtk+ version unknown #endif +#endif /* VTE_COMPILATION */ + #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 6) #define _VTE_GNUC_PACKED __attribute__((__packed__)) #else diff --git a/src/vte/vteterminal.h b/src/vte/vteterminal.h index 0d3cf51c..1e287d0c 100644 --- a/src/vte/vteterminal.h +++ b/src/vte/vteterminal.h @@ -429,6 +429,9 @@ _VTE_PUBLIC int vte_terminal_match_add_regex(VteTerminal *terminal, VteRegex *regex, guint32 flags) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1, 2); +_VTE_PUBLIC +void vte_terminal_match_add_builtins(VteTerminal *terminal) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1); + /* Set the cursor to be used when the pointer is over a given match. */ _VTE_PUBLIC void vte_terminal_match_set_cursor_name(VteTerminal *terminal, @@ -438,6 +441,8 @@ _VTE_PUBLIC void vte_terminal_match_remove(VteTerminal *terminal, int tag) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1); _VTE_PUBLIC +void vte_terminal_match_remove_builtins(VteTerminal *terminal) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1); +_VTE_PUBLIC void vte_terminal_match_remove_all(VteTerminal *terminal) _VTE_CXX_NOEXCEPT _VTE_GNUC_NONNULL(1); /* Check if a given cell on the screen contains part of a matched string. If diff --git a/src/vtedefines.hh b/src/vtedefines.hh index 7f2e478a..a452ad8e 100644 --- a/src/vtedefines.hh +++ b/src/vtedefines.hh @@ -74,10 +74,13 @@ #define VTE_PALETTE_SIZE 263 #define VTE_SCROLLBACK_INIT 512 + #define VTE_DEFAULT_CURSOR std::string{"text"} #define VTE_MOUSING_CURSOR std::string{"default"} #define VTE_HYPERLINK_CURSOR std::string{"pointer"} #define VTE_HYPERLINK_CURSOR_DEBUG std::string{"crosshair"} +#define VTE_MATCH_BUILTINS_CURSOR std::string{"pointer"} + #define VTE_CHILD_INPUT_PRIORITY G_PRIORITY_DEFAULT_IDLE #define VTE_CHILD_OUTPUT_PRIORITY G_PRIORITY_HIGH #define VTE_MAX_INPUT_READ 0x1000 diff --git a/src/vtegtk.cc b/src/vtegtk.cc index f94b8121..33223a5e 100644 --- a/src/vtegtk.cc +++ b/src/vtegtk.cc @@ -2895,7 +2895,7 @@ vte_terminal_match_add_gregex(VteTerminal *terminal, * vte_terminal_match_add_regex: * @terminal: a #VteTerminal * @regex: (transfer none): a #VteRegex - * @flags: PCRE2 match flags, or 0 + * @flags: PCRE2 match flags, or 0 to use the default flags * * Adds the regular expression @regex to the list of matching expressions. When the * user moves the mouse cursor over a section of displayed text which matches @@ -2904,7 +2904,12 @@ vte_terminal_match_add_gregex(VteTerminal *terminal, * Note that @regex should have been created using the <literal>PCRE2_MULTILINE</literal> * flag. * - * Returns: an integer associated with this expression + * Note that the default flags only contain PCRE2_UTF (and some flags for internal use); + * if you want to match unicode properties, you need to pass PCRE2_UCP in @flags; and you + * must always use the %PCRE2_MULTILINE flag. + * See man:pcre2_compile(3) for more information on available flags. + * + * Returns: a nonnegative integer associated with this expression * * Since: 0.46 */ @@ -2932,6 +2937,35 @@ catch (...) } /** + * vte_terminal_match_add_builtins: + * @terminal: a #VteTerminal + * + * Adds regular expressions to recognise URIs to the list of matching expressions. + * When the user moves the mouse cursor over a section of displayed text which matches + * this expression, the text will be highlighted. + * + * When vte_terminal_match_check_event() returns a match for this regex, the + * returned tag will a value from #VteBuiltinMatchTag. + * + * Use vte_terminal_match_remove_builtins() or vte_terminal_match_remove_all() to remove + * the matching expressions added by this function. + * + * Since: 0.60 + */ +void +vte_terminal_match_add_builtins(VteTerminal *terminal) noexcept +try +{ + g_return_if_fail(VTE_IS_TERMINAL(terminal)); + + IMPL(terminal)->regex_match_add_builtins(); +} +catch (...) +{ + vte::log_exception(); +} + +/** * vte_terminal_match_check: * @terminal: a #VteTerminal * @column: the text column @@ -2978,12 +3012,16 @@ catch (...) * * Checks if the text in and around the position of the event matches any of the * regular expressions previously set using vte_terminal_match_add(). If a - * match exists, the text string is returned and if @tag is not %NULL, the number - * associated with the matched regular expression will be stored in @tag. + * match exists, the text string is returned. * - * If more than one regular expression has been set with - * vte_terminal_match_add(), then expressions are checked in the order in - * which they were added. + * If @tag is not %NULL, it will store the nonnegative integer associated with the + * matched regular expression, if it was added with vte_terminal_match_add_regex(), + * or a negative number from #VteBuiltinMatchTag if the matching regular expression + * is one added with vte_terminal_match_add_builtins() matched, or -1 if there is + * no match. + * + * Expressions are checked in the order in which they were added, returning the + * first match. * * Returns: (transfer full) (nullable): a newly allocated string which matches one of the previously * set regular expressions, or %NULL if there is no match @@ -3254,7 +3292,7 @@ catch (...) /** * vte_terminal_match_remove: * @terminal: a #VteTerminal - * @tag: the tag of the regex to remove + * @tag: the nonnegative tag of the regex to remove * * Removes the regular expression which is associated with the given @tag from * the list of expressions which the terminal will highlight when the user @@ -3274,6 +3312,26 @@ catch (...) } /** + * vte_terminal_match_remove_builtins: + * @terminal: a #VteTerminal + * + * Removes the regular expression added with vte_terminal_match_add_builtins(). + * + * Since: 0.60 + */ +void +vte_terminal_match_remove_builtins(VteTerminal *terminal) noexcept +try +{ + g_return_if_fail(VTE_IS_TERMINAL(terminal)); + IMPL(terminal)->regex_match_remove_builtins(); +} +catch (...) +{ + vte::log_exception(); +} + +/** * vte_terminal_match_remove_all: * @terminal: a #VteTerminal * diff --git a/src/vteinternal.hh b/src/vteinternal.hh index 54409499..19a1539a 100644 --- a/src/vteinternal.hh +++ b/src/vteinternal.hh @@ -55,8 +55,10 @@ #include "chunk.hh" #include "pty.hh" #include "utf8.hh" +#include "fwd.hh" #include <list> +#include <memory> #include <queue> #include <optional> #include <string> @@ -597,6 +599,10 @@ public: return match_regexes_writable().emplace_back(std::forward<Args>(args)...); } + std::shared_ptr<vte::base::RegexBuiltins> m_match_regex_builtins{}; + void regex_match_add_builtins() noexcept; + void regex_match_remove_builtins() noexcept; + char* m_match_contents; GArray* m_match_attributes; char* m_match; diff --git a/src/vteregex.cc b/src/vteregex.cc index e8deabf8..1dcf79c4 100644 --- a/src/vteregex.cc +++ b/src/vteregex.cc @@ -26,9 +26,9 @@ #include <exception> -#include "vtemacros.h" -#include "vteenums.h" -#include "vteregex.h" +#include "vte/vtemacros.h" +#include "vte/vteenums.h" +#include "vte/vteregex.h" #include "glib-glue.hh" #include "pcre2-glue.hh" |