diff options
author | Christian Hergert <chergert@redhat.com> | 2022-09-05 16:10:34 -0700 |
---|---|---|
committer | Christian Hergert <chergert@redhat.com> | 2022-09-05 16:11:49 -0700 |
commit | fd0128f0fd957ce64baeb95d071b39f2f3cc679d (patch) | |
tree | b64ea1822b94875ed18ee06440b164da138c8ee7 | |
parent | 0dae82fb1e21ce3dd2cf911d5b5e2318688754f8 (diff) | |
download | gtksourceview-wip/chergert/backport-implregex-to-gsv4.tar.gz |
regex: backport from GRegex to ImplRegex (branch: wip/chergert/backport-implregex-to-gsv4)
This uses the same regex abstraction from GSV 5.x to use PCRE2 directly
rather than indirectly through GRegex.
-rw-r--r-- | gtksourceview/gtksourceregex.c | 88 | ||||
-rw-r--r-- | gtksourceview/gtksourceregex.h | 70 | ||||
-rw-r--r-- | gtksourceview/gtksourcesearchcontext.c | 116 | ||||
-rw-r--r-- | gtksourceview/implregex-private.h | 89 | ||||
-rw-r--r-- | gtksourceview/implregex.c | 1141 | ||||
-rw-r--r-- | gtksourceview/meson.build | 2 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | subprojects/pcre2.wrap | 10 |
8 files changed, 1375 insertions, 143 deletions
diff --git a/gtksourceview/gtksourceregex.c b/gtksourceview/gtksourceregex.c index 77570837..ce7f9449 100644 --- a/gtksourceview/gtksourceregex.c +++ b/gtksourceview/gtksourceregex.c @@ -29,21 +29,23 @@ #include <glib/gi18n-lib.h> #include "gtksourceutils-private.h" +#include "implregex-private.h" + /* * GRegex wrapper which adds a few features needed for syntax highlighting, * in particular resolving "\%{...@start}" and forbidding the use of \C. */ /* Regex used to match "\%{...@start}". */ -static GRegex * +static ImplRegex * get_start_ref_regex (void) { - static GRegex *start_ref_regex = NULL; + static ImplRegex *start_ref_regex = NULL; if (start_ref_regex == NULL) { - start_ref_regex = g_regex_new ("(?<!\\\\)(\\\\\\\\)*\\\\%\\{(.*?)@start\\}", - G_REGEX_OPTIMIZE, 0, NULL); + start_ref_regex = impl_regex_new ("(?<!\\\\)(\\\\\\\\)*\\\\%\\{(.*?)@start\\}", + G_REGEX_OPTIMIZE, 0, NULL); } return start_ref_regex; @@ -57,8 +59,8 @@ struct _GtkSourceRegex GRegexCompileFlags flags; } info; struct { - GRegex *regex; - GMatchInfo *match; + ImplRegex *regex; + ImplMatchInfo *match; } regex; } u; @@ -105,16 +107,16 @@ find_single_byte_escape (const gchar *string) * gtk_source_regex_new: * @pattern: the regular expression. * @flags: compile options for @pattern. - * @error: location to store the error occuring, or %NULL to ignore errors. + * @error: location to store the error occurring, or %NULL to ignore errors. * * Creates a new regex. * * Returns: a newly-allocated #GtkSourceRegex. 
*/ GtkSourceRegex * -_gtk_source_regex_new (const gchar *pattern, - GRegexCompileFlags flags, - GError **error) +_gtk_source_regex_new (const gchar *pattern, + GRegexCompileFlags flags, + GError **error) { GtkSourceRegex *regex; @@ -132,7 +134,7 @@ _gtk_source_regex_new (const gchar *pattern, regex = g_slice_new0 (GtkSourceRegex); regex->ref_count = 1; - if (g_regex_match (get_start_ref_regex (), pattern, 0, NULL)) + if (impl_regex_match (get_start_ref_regex (), pattern, 0, NULL)) { regex->resolved = FALSE; regex->u.info.pattern = g_strdup (pattern); @@ -141,9 +143,9 @@ _gtk_source_regex_new (const gchar *pattern, else { regex->resolved = TRUE; - regex->u.regex.regex = g_regex_new (pattern, - flags | G_REGEX_OPTIMIZE | G_REGEX_NEWLINE_LF, 0, - error); + regex->u.regex.regex = impl_regex_new (pattern, + flags | G_REGEX_OPTIMIZE | G_REGEX_NEWLINE_LF, 0, + error); if (regex->u.regex.regex == NULL) { @@ -170,9 +172,9 @@ _gtk_source_regex_unref (GtkSourceRegex *regex) { if (regex->resolved) { - g_regex_unref (regex->u.regex.regex); + impl_regex_unref (regex->u.regex.regex); if (regex->u.regex.match) - g_match_info_free (regex->u.regex.match); + impl_match_info_free (regex->u.regex.match); } else { @@ -188,27 +190,25 @@ struct RegexResolveData { }; static gboolean -replace_start_regex (const GMatchInfo *match_info, - GString *expanded_regex, - gpointer user_data) +replace_start_regex (const ImplMatchInfo *match_info, + GString *expanded_regex, + gpointer user_data) { gchar *num_string, *subst, *subst_escaped, *escapes; gint num; struct RegexResolveData *data = user_data; - escapes = g_match_info_fetch (match_info, 1); - num_string = g_match_info_fetch (match_info, 2); + escapes = impl_match_info_fetch (match_info, 1); + num_string = impl_match_info_fetch (match_info, 2); num = _gtk_source_utils_string_to_int (num_string); if (num < 0) { - subst = g_match_info_fetch_named (data->start_regex->u.regex.match, - num_string); + subst = impl_match_info_fetch_named 
(data->start_regex->u.regex.match, num_string); } else { - subst = g_match_info_fetch (data->start_regex->u.regex.match, - num); + subst = impl_match_info_fetch (data->start_regex->u.regex.match, num); } if (subst != NULL) @@ -263,18 +263,18 @@ _gtk_source_regex_resolve (GtkSourceRegex *regex, data.start_regex = start_regex; data.matched_text = matched_text; - expanded_regex = g_regex_replace_eval (get_start_ref_regex (), - regex->u.info.pattern, - -1, 0, 0, - replace_start_regex, - &data, NULL); + expanded_regex = impl_regex_replace_eval (get_start_ref_regex (), + regex->u.info.pattern, + -1, 0, 0, + replace_start_regex, + &data, NULL); new_regex = _gtk_source_regex_new (expanded_regex, regex->u.info.flags, NULL); if (new_regex == NULL || !new_regex->resolved) { _gtk_source_regex_unref (new_regex); g_warning ("Regular expression %s cannot be expanded.", regex->u.info.pattern); - /* Returns a regex that nevers matches. */ + /* Returns a regex that never matches. */ new_regex = _gtk_source_regex_new ("$never-match^", 0, NULL); } @@ -301,14 +301,14 @@ _gtk_source_regex_match (GtkSourceRegex *regex, if (regex->u.regex.match) { - g_match_info_free (regex->u.regex.match); + impl_match_info_free (regex->u.regex.match); regex->u.regex.match = NULL; } - result = g_regex_match_full (regex->u.regex.regex, line, - byte_length, byte_pos, - 0, &regex->u.regex.match, - NULL); + result = impl_regex_match_full (regex->u.regex.regex, line, + byte_length, byte_pos, + 0, &regex->u.regex.match, + NULL); return result; } @@ -319,7 +319,7 @@ _gtk_source_regex_fetch (GtkSourceRegex *regex, { g_assert (regex->resolved); - return g_match_info_fetch (regex->u.regex.match, num); + return impl_match_info_fetch (regex->u.regex.match, num); } void @@ -333,8 +333,8 @@ _gtk_source_regex_fetch_pos (GtkSourceRegex *regex, g_assert (regex->resolved); - /* g_match_info_fetch_pos() can return TRUE with start_pos/end_pos set to -1 */ - if (!g_match_info_fetch_pos (regex->u.regex.match, num, &byte_start_pos, 
&byte_end_pos)) + /* impl_match_info_fetch_pos() can return TRUE with start_pos/end_pos set to -1 */ + if (!impl_match_info_fetch_pos (regex->u.regex.match, num, &byte_start_pos, &byte_end_pos)) { if (start_pos != NULL) *start_pos = -1; @@ -356,12 +356,12 @@ _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex, gint *start_pos_p, /* byte offsets */ gint *end_pos_p) /* byte offsets */ { - gint start_pos; - gint end_pos; + gint start_pos = -1; + gint end_pos = -1; g_assert (regex->resolved); - if (!g_match_info_fetch_pos (regex->u.regex.match, num, &start_pos, &end_pos)) + if (!impl_match_info_fetch_pos (regex->u.regex.match, num, &start_pos, &end_pos)) { start_pos = -1; end_pos = -1; @@ -384,7 +384,7 @@ _gtk_source_regex_fetch_named_pos (GtkSourceRegex *regex, g_assert (regex->resolved); - if (!g_match_info_fetch_named_pos (regex->u.regex.match, name, &byte_start_pos, &byte_end_pos)) + if (!impl_match_info_fetch_named_pos (regex->u.regex.match, name, &byte_start_pos, &byte_end_pos)) { if (start_pos != NULL) *start_pos = -1; @@ -405,6 +405,6 @@ _gtk_source_regex_get_pattern (GtkSourceRegex *regex) { g_assert (regex->resolved); - return g_regex_get_pattern (regex->u.regex.regex); + return impl_regex_get_pattern (regex->u.regex.regex); } diff --git a/gtksourceview/gtksourceregex.h b/gtksourceview/gtksourceregex.h index edf9d6b7..b70793f6 100644 --- a/gtksourceview/gtksourceregex.h +++ b/gtksourceview/gtksourceregex.h @@ -28,56 +28,46 @@ G_BEGIN_DECLS GTK_SOURCE_INTERNAL -GtkSourceRegex *_gtk_source_regex_new (const gchar *pattern, - GRegexCompileFlags flags, - GError **error); - +GtkSourceRegex *_gtk_source_regex_new (const gchar *pattern, + GRegexCompileFlags flags, + GError **error); GTK_SOURCE_INTERNAL -GtkSourceRegex *_gtk_source_regex_ref (GtkSourceRegex *regex); - +GtkSourceRegex *_gtk_source_regex_ref (GtkSourceRegex *regex); GTK_SOURCE_INTERNAL -void _gtk_source_regex_unref (GtkSourceRegex *regex); - +void _gtk_source_regex_unref (GtkSourceRegex *regex); 
GTK_SOURCE_INTERNAL -GtkSourceRegex *_gtk_source_regex_resolve (GtkSourceRegex *regex, - GtkSourceRegex *start_regex, - const gchar *matched_text); - +GtkSourceRegex *_gtk_source_regex_resolve (GtkSourceRegex *regex, + GtkSourceRegex *start_regex, + const gchar *matched_text); GTK_SOURCE_INTERNAL -gboolean _gtk_source_regex_is_resolved (GtkSourceRegex *regex); - +gboolean _gtk_source_regex_is_resolved (GtkSourceRegex *regex); GTK_SOURCE_INTERNAL -gboolean _gtk_source_regex_match (GtkSourceRegex *regex, - const gchar *line, - gint byte_length, - gint byte_pos); - +gboolean _gtk_source_regex_match (GtkSourceRegex *regex, + const gchar *line, + gint byte_length, + gint byte_pos); GTK_SOURCE_INTERNAL -gchar *_gtk_source_regex_fetch (GtkSourceRegex *regex, - gint num); - +gchar *_gtk_source_regex_fetch (GtkSourceRegex *regex, + gint num); GTK_SOURCE_INTERNAL -void _gtk_source_regex_fetch_pos (GtkSourceRegex *regex, - const gchar *text, - gint num, - gint *start_pos, /* character offsets */ - gint *end_pos); /* character offsets */ - +void _gtk_source_regex_fetch_pos (GtkSourceRegex *regex, + const gchar *text, + gint num, + gint *start_pos, + gint *end_pos); GTK_SOURCE_INTERNAL -void _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex, - gint num, - gint *start_pos_p, /* byte offsets */ - gint *end_pos_p); /* byte offsets */ - +void _gtk_source_regex_fetch_pos_bytes (GtkSourceRegex *regex, + gint num, + gint *start_pos_p, + gint *end_pos_p); GTK_SOURCE_INTERNAL -void _gtk_source_regex_fetch_named_pos (GtkSourceRegex *regex, - const gchar *text, - const gchar *name, - gint *start_pos, /* character offsets */ - gint *end_pos); /* character offsets */ - +void _gtk_source_regex_fetch_named_pos (GtkSourceRegex *regex, + const gchar *text, + const gchar *name, + gint *start_pos, + gint *end_pos); GTK_SOURCE_INTERNAL -const gchar *_gtk_source_regex_get_pattern (GtkSourceRegex *regex); +const gchar *_gtk_source_regex_get_pattern (GtkSourceRegex *regex); G_END_DECLS diff 
--git a/gtksourceview/gtksourcesearchcontext.c b/gtksourceview/gtksourcesearchcontext.c index 90811670..69a9c514 100644 --- a/gtksourceview/gtksourcesearchcontext.c +++ b/gtksourceview/gtksourcesearchcontext.c @@ -35,6 +35,8 @@ #include "gtksourceiter.h" #include "gtksource-enumtypes.h" +#include "implregex-private.h" + /** * SECTION:searchcontext * @Short_description: Search context @@ -343,7 +345,7 @@ struct _GtkSourceSearchContextPrivate */ gint text_nb_lines; - GRegex *regex; + ImplRegex *regex; GError *regex_error; gint occurrences_count; @@ -569,7 +571,7 @@ regex_search_get_real_start (GtkSourceSearchContext *search, GtkTextIter *real_start, gint *start_pos) { - gint max_lookbehind = g_regex_get_max_lookbehind (search->priv->regex); + gint max_lookbehind = impl_regex_get_max_lookbehind (search->priv->regex); gint i; gchar *text; @@ -614,35 +616,35 @@ regex_search_get_match_options (const GtkTextIter *real_start, } /* Get the @match_start and @match_end iters of the @match_info. - * g_match_info_fetch_pos() returns byte positions. To get the iters, we need to - * know the number of UTF-8 characters. A GMatchInfo can contain several matches - * (with g_match_info_next()). So instead of calling g_utf8_strlen() each time + * impl_match_info_fetch_pos() returns byte positions. To get the iters, we need to + * know the number of UTF-8 characters. A ImplMatchInfo can contain several matches + * (with impl_match_info_next()). So instead of calling g_utf8_strlen() each time * at the beginning of @subject, @iter and @iter_byte_pos are used to remember * where g_utf8_strlen() stopped. 
*/ static gboolean -regex_search_fetch_match (GMatchInfo *match_info, - const gchar *subject, - gssize subject_length, - GtkTextIter *iter, - gint *iter_byte_pos, - GtkTextIter *match_start, - GtkTextIter *match_end) -{ - gint start_byte_pos; - gint end_byte_pos; +regex_search_fetch_match (ImplMatchInfo *match_info, + const gchar *subject, + gssize subject_length, + GtkTextIter *iter, + gint *iter_byte_pos, + GtkTextIter *match_start, + GtkTextIter *match_end) +{ + gint start_byte_pos = 0; + gint end_byte_pos = 0; gint nb_chars; g_assert (*iter_byte_pos <= subject_length); g_assert (match_start != NULL); g_assert (match_end != NULL); - if (!g_match_info_matches (match_info)) + if (!impl_match_info_matches (match_info)) { return FALSE; } - if (!g_match_info_fetch_pos (match_info, 0, &start_byte_pos, &end_byte_pos)) + if (!impl_match_info_fetch_pos (match_info, 0, &start_byte_pos, &end_byte_pos)) { g_warning ("Impossible to fetch regex match position."); return FALSE; @@ -715,7 +717,7 @@ basic_forward_regex_search (GtkSourceSearchContext *search, GRegexMatchFlags match_options; gchar *subject; gssize subject_length; - GMatchInfo *match_info; + ImplMatchInfo *match_info; GtkTextIter iter; gint iter_byte_pos; GtkTextIter m_start; @@ -725,13 +727,13 @@ basic_forward_regex_search (GtkSourceSearchContext *search, subject = gtk_text_iter_get_visible_text (&real_start, &end); subject_length = strlen (subject); - g_regex_match_full (search->priv->regex, - subject, - subject_length, - start_pos, - match_options, - &match_info, - &search->priv->regex_error); + impl_regex_match_full (search->priv->regex, + subject, + subject_length, + start_pos, + match_options, + &match_info, + &search->priv->regex_error); iter = real_start; iter_byte_pos = 0; @@ -744,13 +746,13 @@ basic_forward_regex_search (GtkSourceSearchContext *search, &m_start, &m_end); - if (!found && g_match_info_is_partial_match (match_info)) + if (!found && impl_match_info_is_partial_match (match_info)) { 
gtk_text_iter_forward_lines (&end, nb_lines); nb_lines <<= 1; g_free (subject); - g_match_info_free (match_info); + impl_match_info_free (match_info); continue; } @@ -789,7 +791,7 @@ basic_forward_regex_search (GtkSourceSearchContext *search, } g_free (subject); - g_match_info_free (match_info); + impl_match_info_free (match_info); break; } @@ -1824,7 +1826,7 @@ regex_search_scan_segment (GtkSourceSearchContext *search, gchar *subject; gssize subject_length; GRegexMatchFlags match_options; - GMatchInfo *match_info; + ImplMatchInfo *match_info; GtkTextIter iter; gint iter_byte_pos; gboolean segment_finished; @@ -1887,13 +1889,13 @@ regex_search_scan_segment (GtkSourceSearchContext *search, g_free (subject_escaped); }); - g_regex_match_full (search->priv->regex, - subject, - subject_length, - start_pos, - match_options, - &match_info, - &search->priv->regex_error); + impl_regex_match_full (search->priv->regex, + subject, + subject_length, + start_pos, + match_options, + &match_info, + &search->priv->regex_error); iter = real_start; iter_byte_pos = 0; @@ -1921,7 +1923,7 @@ regex_search_scan_segment (GtkSourceSearchContext *search, search->priv->occurrences_count++; - g_match_info_next (match_info, &search->priv->regex_error); + impl_match_info_next (match_info, &search->priv->regex_error); } if (search->priv->regex_error != NULL) @@ -1929,7 +1931,7 @@ regex_search_scan_segment (GtkSourceSearchContext *search, g_object_notify (G_OBJECT (search), "regex-error"); } - if (g_match_info_is_partial_match (match_info)) + if (impl_match_info_is_partial_match (match_info)) { segment_finished = FALSE; @@ -1953,15 +1955,15 @@ regex_search_scan_segment (GtkSourceSearchContext *search, } g_free (subject); - g_match_info_free (match_info); + impl_match_info_free (match_info); return segment_finished; } static void regex_search_scan_chunk (GtkSourceSearchContext *search, - const GtkTextIter *chunk_start, - const GtkTextIter *chunk_end) + const GtkTextIter *chunk_start, + const 
GtkTextIter *chunk_end) { GtkTextIter segment_start = *chunk_start; @@ -2318,7 +2320,7 @@ update_regex (GtkSourceSearchContext *search) if (search->priv->regex != NULL) { - g_regex_unref (search->priv->regex); + impl_regex_unref (search->priv->regex); search->priv->regex = NULL; } @@ -2331,7 +2333,7 @@ update_regex (GtkSourceSearchContext *search) if (search_text != NULL && gtk_source_search_settings_get_regex_enabled (search->priv->settings)) { - GRegexCompileFlags compile_flags = G_REGEX_OPTIMIZE | G_REGEX_MULTILINE; + GRegexCompileFlags compile_flags = G_REGEX_MULTILINE; gchar *pattern = (gchar *)search_text; search->priv->text_nb_lines = 0; @@ -2346,10 +2348,10 @@ update_regex (GtkSourceSearchContext *search) pattern = g_strdup_printf ("\\b%s\\b", search_text); } - search->priv->regex = g_regex_new (pattern, - compile_flags, - G_REGEX_MATCH_NOTEMPTY, - &search->priv->regex_error); + search->priv->regex = impl_regex_new (pattern, + compile_flags, + G_REGEX_MATCH_NOTEMPTY, + &search->priv->regex_error); if (search->priv->regex_error != NULL) { @@ -2675,11 +2677,7 @@ gtk_source_search_context_finalize (GObject *object) { GtkSourceSearchContext *search = GTK_SOURCE_SEARCH_CONTEXT (object); - if (search->priv->regex != NULL) - { - g_regex_unref (search->priv->regex); - } - + g_clear_pointer (&search->priv->regex, impl_regex_unref); g_clear_error (&search->priv->regex_error); G_OBJECT_CLASS (gtk_source_search_context_parent_class)->finalize (object); @@ -3603,13 +3601,13 @@ regex_replace (GtkSourceSearchContext *search, match_options = regex_search_get_match_options (&real_start, &real_end); match_options |= G_REGEX_MATCH_ANCHORED; - subject_replaced = g_regex_replace (search->priv->regex, - subject, - -1, - start_pos, - replace, - match_options, - &tmp_error); + subject_replaced = impl_regex_replace (search->priv->regex, + subject, + -1, + start_pos, + replace, + match_options, + &tmp_error); if (tmp_error != NULL) { diff --git a/gtksourceview/implregex-private.h 
b/gtksourceview/implregex-private.h new file mode 100644 index 00000000..b0809414 --- /dev/null +++ b/gtksourceview/implregex-private.h @@ -0,0 +1,89 @@ +/* + * This file is part of GtkSourceView + * + * Copyright 2020 Christian Hergert <chergert@redhat.com> + * + * GtkSourceView is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * GtkSourceView is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, see <http://www.gnu.org/licenses/>. + * + * SPDX-License-Identifier: LGPL-2.1-or-later + */ + +#pragma once + +#include <glib.h> + +G_BEGIN_DECLS + +typedef struct _ImplRegex ImplRegex; +typedef struct _ImplMatchInfo ImplMatchInfo; + +typedef gboolean (*ImplRegexEvalCallback) (const ImplMatchInfo *match_info, + GString *result, + gpointer user_data); + + +ImplRegex *impl_regex_new (const char *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error); +gboolean impl_regex_match (const ImplRegex *regex, + const char *string, + GRegexMatchFlags match_options, + ImplMatchInfo **match_info); +ImplRegex *impl_regex_ref (ImplRegex *regex); +void impl_regex_unref (ImplRegex *regex); +void impl_match_info_free (ImplMatchInfo *match_info); +char *impl_match_info_fetch (const ImplMatchInfo *match_info, + int match_num); +char *impl_match_info_fetch_named (const ImplMatchInfo *match_info, + const char *name); +char *impl_regex_replace_eval (const ImplRegex *regex, + const char *string, + gssize string_len, + gsize start_position, + 
GRegexMatchFlags match_options, + ImplRegexEvalCallback eval, + gpointer user_data, + GError **error); +char *impl_regex_replace (const ImplRegex *regex, + const char *string, + gssize string_len, + int start_position, + const char *replacement, + GRegexMatchFlags match_options, + GError **error); +gboolean impl_regex_match_full (const ImplRegex *regex, + const char *string, + gssize string_len, + gsize start_position, + GRegexMatchFlags match_options, + ImplMatchInfo **match_info, + GError **error); +gboolean impl_match_info_fetch_pos (const ImplMatchInfo *match_info, + int match_num, + int *start_pos, + int *end_pos); +gboolean impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info, + const char *name, + int *start_pos, + int *end_pos); +gboolean impl_match_info_is_partial_match (const ImplMatchInfo *match_info); +gboolean impl_match_info_matches (const ImplMatchInfo *match_info); +gboolean impl_match_info_next (ImplMatchInfo *match_info, + GError **error); +int impl_match_info_get_match_count (const ImplMatchInfo *match_info); +const char *impl_regex_get_pattern (const ImplRegex *regex); +int impl_regex_get_max_lookbehind (const ImplRegex *regex); + +G_END_DECLS diff --git a/gtksourceview/implregex.c b/gtksourceview/implregex.c new file mode 100644 index 00000000..e524ac71 --- /dev/null +++ b/gtksourceview/implregex.c @@ -0,0 +1,1141 @@ +/* + * This file is part of GtkSourceView + * + * Copyright 1999, 2000 Scott Wimer + * Copyright 2004, Matthias Clasen <mclasen@redhat.com> + * Copyright 2005 - 2007, Marco Barisione <marco@barisione.org> + * Copyright 2020 Christian Hergert <chergert@redhat.com> + * + * GtkSourceView is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * GtkSourceView is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, see <http://www.gnu.org/licenses/>. + * + * SPDX-License-Identifier: LGPL-2.1-or-later + */ + +/* Some code in this file is based upon GRegex from GLib */ +/* GRegex -- regular expression API wrapper around PCRE. + * + * Copyright (C) 1999, 2000 Scott Wimer + * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> + * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "config.h" + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> + +#include <glib/gi18n.h> +#include <string.h> + +#include "implregex-private.h" + +struct _ImplRegex +{ + int ref_count; + char *pattern; + gsize compile_flags; + gsize match_flags; + pcre2_compile_context *context; + pcre2_code *code; + guint has_jit : 1; +}; + +struct _ImplMatchInfo +{ + gsize compile_flags; + gsize match_flags; + ImplRegex *regex; + const char *string; + gsize string_len; + pcre2_match_data *match_data; + PCRE2_SIZE *offsets; + int matches; + uint32_t n_subpatterns; + gssize pos; +}; + +/* if the string is in UTF-8 use g_utf8_ functions, else use use just +/- 1. */ +#define NEXT_CHAR(re, s) ((!((re)->compile_flags & PCRE2_UTF)) ? ((s) + 1) : g_utf8_next_char (s)) + +#define TAKE(f,gbit,pbit) \ + G_STMT_START { \ + if (f & gbit) \ + { \ + ret |= pbit; \ + f &= ~gbit; \ + } \ + } G_STMT_END + +static gsize +translate_compile_flags (GRegexCompileFlags flags) +{ + gsize ret = PCRE2_UCP; + + if ((flags & G_REGEX_RAW) == 0) + { + ret |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK); + flags &= ~G_REGEX_RAW; + } + + if (~flags & G_REGEX_BSR_ANYCRLF) + { + ret |= PCRE2_BSR_UNICODE; + flags &= ~G_REGEX_BSR_ANYCRLF; + } + + TAKE (flags, G_REGEX_ANCHORED, PCRE2_ANCHORED); + TAKE (flags, G_REGEX_CASELESS, PCRE2_CASELESS); + TAKE (flags, G_REGEX_EXTENDED, PCRE2_EXTENDED); + TAKE (flags, G_REGEX_DUPNAMES, PCRE2_DUPNAMES); + TAKE (flags, G_REGEX_MULTILINE, PCRE2_MULTILINE); + TAKE (flags, G_REGEX_NEWLINE_ANYCRLF, PCRE2_NEWLINE_ANYCRLF); + TAKE (flags, G_REGEX_NEWLINE_CR, PCRE2_NEWLINE_CR); + TAKE (flags, G_REGEX_NEWLINE_LF, PCRE2_NEWLINE_LF); + + flags &= ~G_REGEX_OPTIMIZE; + + g_assert (flags == 0); + + return ret; +} + +static gsize +translate_match_flags (GRegexMatchFlags flags) +{ + gsize ret = 0; + + TAKE (flags, G_REGEX_MATCH_ANCHORED, PCRE2_ANCHORED); + TAKE (flags, G_REGEX_MATCH_NOTBOL, PCRE2_NOTBOL); + TAKE (flags, G_REGEX_MATCH_NOTEOL, PCRE2_NOTEOL); + TAKE (flags, 
G_REGEX_MATCH_PARTIAL_SOFT, PCRE2_PARTIAL_SOFT); + TAKE (flags, G_REGEX_MATCH_PARTIAL_HARD, PCRE2_PARTIAL_HARD); + TAKE (flags, G_REGEX_MATCH_NOTEMPTY, PCRE2_NOTEMPTY); + + g_assert (flags == 0); + + return ret; +} + +static gboolean +set_regex_error (GError **error, + int rc) +{ + if (rc < PCRE2_ERROR_NOMATCH && rc != PCRE2_ERROR_PARTIAL) + { + if (error != NULL) + { + guchar errstr[128]; + + pcre2_get_error_message (rc, errstr, sizeof errstr - 1); + errstr[sizeof errstr - 1] = 0; + + g_set_error_literal (error, + G_REGEX_ERROR, + G_REGEX_ERROR_MATCH, + (const gchar *)errstr); + } + + return TRUE; + } + + return FALSE; +} + +ImplRegex * +impl_regex_new (const char *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error) +{ + pcre2_compile_context *context; + ImplRegex *regex; + PCRE2_SIZE erroffset; + int errnumber = -1; + + g_return_val_if_fail (pattern != NULL, NULL); + + context = pcre2_compile_context_create (NULL); + + regex = g_slice_new0 (ImplRegex); + regex->ref_count = 1; + regex->context = context; + regex->pattern = g_strdup (pattern); + regex->compile_flags = translate_compile_flags (compile_options); + regex->match_flags = translate_match_flags (match_options); + + if (compile_options & G_REGEX_NEWLINE_LF) + pcre2_set_newline (context, PCRE2_NEWLINE_LF); + else if (compile_options & G_REGEX_NEWLINE_CR) + pcre2_set_newline (context, PCRE2_NEWLINE_CR); + else if (compile_options & G_REGEX_NEWLINE_CRLF) + pcre2_set_newline (context, PCRE2_NEWLINE_CRLF); + else if (compile_options & G_REGEX_NEWLINE_ANYCRLF) + pcre2_set_newline (context, PCRE2_NEWLINE_ANYCRLF); + else + pcre2_set_newline (context, PCRE2_NEWLINE_ANY); + + regex->code = pcre2_compile ((PCRE2_SPTR)pattern, + PCRE2_ZERO_TERMINATED, + regex->compile_flags, + &errnumber, + &erroffset, + context); + + if (regex->code == NULL) + { + char errmsg[128]; + + pcre2_get_error_message (errnumber, (guchar *)errmsg, sizeof errmsg-1); + + g_set_error (error, + 
G_REGEX_ERROR, + G_REGEX_ERROR_COMPILE, + "%s: offset %d of pattern %s", + errmsg, + (int)erroffset, + pattern); + impl_regex_unref (regex); + return NULL; + } + + /* Now try to JIT the pattern for faster execution time */ + if (compile_options & G_REGEX_OPTIMIZE) + { + regex->has_jit = pcre2_jit_compile (regex->code, PCRE2_JIT_COMPLETE) == 0; + } + + return regex; +} + +const char * +impl_regex_get_pattern (const ImplRegex *regex) +{ + g_return_val_if_fail (regex != NULL, NULL); + + return regex->pattern; +} + +ImplRegex * +impl_regex_ref (ImplRegex *regex) +{ + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (regex->ref_count > 0, NULL); + + regex->ref_count++; + + return regex; +} + +void +impl_regex_unref (ImplRegex *regex) +{ + g_return_if_fail (regex != NULL); + g_return_if_fail (regex->ref_count > 0); + + regex->ref_count--; + + if (regex->ref_count == 0) + { + g_clear_pointer (&regex->pattern, g_free); + g_clear_pointer (&regex->code, pcre2_code_free); + g_clear_pointer (&regex->context, pcre2_compile_context_free); + g_slice_free (ImplRegex, regex); + } +} + +static ImplMatchInfo * +impl_match_info_new (ImplRegex *regex, + GRegexMatchFlags match_options, + const char *string, + gssize string_len, + gssize position) +{ + ImplMatchInfo *match_info; + + g_assert (regex != NULL); + g_assert (string != NULL); + g_assert (string_len <= strlen (string)); + + if (string_len < 0) + { + string_len = strlen (string); + } + + match_info = g_slice_new0 (ImplMatchInfo); + match_info->regex = impl_regex_ref (regex); + match_info->match_flags = regex->match_flags | translate_match_flags (match_options); + match_info->pos = MAX (0, position); + match_info->matches = PCRE2_ERROR_NOMATCH; + match_info->string = string; + match_info->string_len = string_len; + match_info->match_data = pcre2_match_data_create_from_pattern (regex->code, NULL); + + if (match_info->match_data == NULL) + g_error ("Failed to allocate match data"); + + pcre2_pattern_info (regex->code, 
PCRE2_INFO_CAPTURECOUNT, &match_info->n_subpatterns); + + match_info->offsets = pcre2_get_ovector_pointer (match_info->match_data); + match_info->offsets[0] = -1; + match_info->offsets[1] = -1; + + return match_info; +} + +void +impl_match_info_free (ImplMatchInfo *match_info) +{ + if (match_info != NULL) + { + g_clear_pointer (&match_info->match_data, pcre2_match_data_free); + g_clear_pointer (&match_info->regex, impl_regex_unref); + match_info->string = NULL; + match_info->string_len = 0; + match_info->compile_flags = 0; + match_info->match_flags = 0; + match_info->matches = 0; + match_info->pos = 0; + match_info->offsets = NULL; + g_slice_free (ImplMatchInfo, match_info); + } +} + +gboolean +impl_regex_match (const ImplRegex *regex, + const char *string, + GRegexMatchFlags match_options, + ImplMatchInfo **match_info) +{ + g_return_val_if_fail (regex != NULL, FALSE); + g_return_val_if_fail (regex->code != NULL, FALSE); + g_return_val_if_fail (string != NULL, FALSE); + + return impl_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL); +} + +char * +impl_match_info_fetch (const ImplMatchInfo *match_info, + int match_num) +{ + char *match = NULL; + int begin = -1; + int end = -1; + + g_return_val_if_fail (match_info != NULL, NULL); + g_return_val_if_fail (match_info->string != NULL, NULL); + g_return_val_if_fail (match_info->offsets != NULL, NULL); + g_return_val_if_fail (impl_match_info_matches (match_info), NULL); + g_return_val_if_fail (match_num >= 0, NULL); + + if (!impl_match_info_fetch_pos (match_info, match_num, &begin, &end)) + match = NULL; + else if (begin == -1) + match = g_strdup (""); + else + match = g_strndup (&match_info->string[begin], end - begin); + + return match; +} + +char * +impl_match_info_fetch_named (const ImplMatchInfo *match_info, + const char *name) +{ + int begin = -1; + int end = -1; + + g_return_val_if_fail (match_info != NULL, NULL); + + if (impl_match_info_fetch_named_pos (match_info, name, &begin, &end)) + { + 
if (begin >= 0 && end >= 0) + { + return g_strndup (match_info->string + begin, end - begin); + } + } + + return NULL; +} + +char * +impl_regex_replace_eval (const ImplRegex *regex, + const char *string, + gssize string_len, + gsize start_position, + GRegexMatchFlags match_options, + ImplRegexEvalCallback eval, + gpointer user_data, + GError **error) +{ + ImplMatchInfo *match_info; + GString *result; + gsize str_pos = 0; + gboolean done = FALSE; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (eval != NULL, NULL); + + if (string_len < 0) + { + string_len = strlen (string); + } + + result = g_string_sized_new (string_len); + + /* run down the string making matches. */ + impl_regex_match_full (regex, + string, + string_len, + start_position, + match_options, + &match_info, + &tmp_error); + + g_assert (match_info != NULL); + + while (!done && impl_match_info_matches (match_info)) + { + g_string_append_len (result, + string + str_pos, + match_info->offsets[0] - str_pos); + done = (*eval) (match_info, result, user_data); + str_pos = match_info->offsets[1]; + impl_match_info_next (match_info, &tmp_error); + + /* We already matched, so ignore future matches */ + if (g_error_matches (tmp_error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH)) + { + g_clear_error (&tmp_error); + break; + } + } + + impl_match_info_free (match_info); + + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + g_string_free (result, TRUE); + return NULL; + } + + g_string_append_len (result, string + str_pos, string_len - str_pos); + + return g_string_free (result, FALSE); +} + +gboolean +impl_regex_match_full (const ImplRegex *regex, + const char *string, + gssize string_len, + gsize start_position, + GRegexMatchFlags match_options, + ImplMatchInfo **match_info, + GError **error) +{ + ImplMatchInfo *local_match_info = NULL; + gboolean ret = FALSE; + + g_return_val_if_fail (regex != NULL, FALSE); 
+ g_return_val_if_fail (regex->code != NULL, FALSE); + g_return_val_if_fail (string != NULL, FALSE); + + if (string_len < 0) + { + string_len = strlen (string); + } + + local_match_info = impl_match_info_new ((ImplRegex *)regex, match_options, string, string_len, start_position); + + ret = impl_match_info_next (local_match_info, error); + + if (match_info != NULL) + { + *match_info = g_steal_pointer (&local_match_info); + } + else + { + impl_match_info_free (local_match_info); + } + + return ret; +} + +enum +{ + REPL_TYPE_STRING, + REPL_TYPE_CHARACTER, + REPL_TYPE_SYMBOLIC_REFERENCE, + REPL_TYPE_NUMERIC_REFERENCE, + REPL_TYPE_CHANGE_CASE +}; + +typedef enum +{ + CHANGE_CASE_NONE = 1 << 0, + CHANGE_CASE_UPPER = 1 << 1, + CHANGE_CASE_LOWER = 1 << 2, + CHANGE_CASE_UPPER_SINGLE = 1 << 3, + CHANGE_CASE_LOWER_SINGLE = 1 << 4, + CHANGE_CASE_SINGLE_MASK = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE, + CHANGE_CASE_LOWER_MASK = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE, + CHANGE_CASE_UPPER_MASK = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE +} ChangeCase; + +typedef struct _InterpolationData +{ + char *text; + int type; + int num; + char c; + ChangeCase change_case; +} InterpolationData; + +static void +free_interpolation_data (InterpolationData *data) +{ + g_free (data->text); + g_free (data); +} + +static const char * +expand_escape (const char *replacement, + const char *p, + InterpolationData *data, + GError **error) +{ + const char *q, *r; + int x, d, h, i; + const char *error_detail; + int base = 0; + GError *tmp_error = NULL; + + p++; + switch (*p) + { + case 't': + p++; + data->c = '\t'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'n': + p++; + data->c = '\n'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'v': + p++; + data->c = '\v'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'r': + p++; + data->c = '\r'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'f': + p++; + data->c = '\f'; + data->type = REPL_TYPE_CHARACTER; + break; + 
case 'a': + p++; + data->c = '\a'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'b': + p++; + data->c = '\b'; + data->type = REPL_TYPE_CHARACTER; + break; + case '\\': + p++; + data->c = '\\'; + data->type = REPL_TYPE_CHARACTER; + break; + case 'x': + p++; + x = 0; + if (*p == '{') + { + p++; + do + { + h = g_ascii_xdigit_value (*p); + if (h < 0) + { + error_detail = _("hexadecimal digit or “}” expected"); + goto error; + } + x = x * 16 + h; + p++; + } + while (*p != '}'); + p++; + } + else + { + for (i = 0; i < 2; i++) + { + h = g_ascii_xdigit_value (*p); + if (h < 0) + { + error_detail = _("hexadecimal digit expected"); + goto error; + } + x = x * 16 + h; + p++; + } + } + data->type = REPL_TYPE_STRING; + data->text = g_new0 (gchar, 8); + g_unichar_to_utf8 (x, data->text); + break; + case 'l': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_LOWER_SINGLE; + break; + case 'u': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_UPPER_SINGLE; + break; + case 'L': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_LOWER; + break; + case 'U': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_UPPER; + break; + case 'E': + p++; + data->type = REPL_TYPE_CHANGE_CASE; + data->change_case = CHANGE_CASE_NONE; + break; + case 'g': + p++; + if (*p != '<') + { + error_detail = _("missing “<” in symbolic reference"); + goto error; + } + q = p + 1; + do + { + p++; + if (!*p) + { + error_detail = _("unfinished symbolic reference"); + goto error; + } + } + while (*p != '>'); + if (p - q == 0) + { + error_detail = _("zero-length symbolic reference"); + goto error; + } + if (g_ascii_isdigit (*q)) + { + x = 0; + do + { + h = g_ascii_digit_value (*q); + if (h < 0) + { + error_detail = _("digit expected"); + p = q; + goto error; + } + x = x * 10 + h; + q++; + } + while (q != p); + data->num = x; + data->type = REPL_TYPE_NUMERIC_REFERENCE; + } + else + { + r = q; + do 
+ { + if (!g_ascii_isalnum (*r)) + { + error_detail = _("illegal symbolic reference"); + p = r; + goto error; + } + r++; + } + while (r != p); + data->text = g_strndup (q, p - q); + data->type = REPL_TYPE_SYMBOLIC_REFERENCE; + } + p++; + break; + case '0': + /* if \0 is followed by a number is an octal number representing a + * character, else it is a numeric reference. */ + if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) + { + base = 8; + p = g_utf8_next_char (p); + } + /* Fallthrough */ + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + x = 0; + d = 0; + for (i = 0; i < 3; i++) + { + h = g_ascii_digit_value (*p); + if (h < 0) + break; + if (h > 7) + { + if (base == 8) + break; + else + base = 10; + } + if (i == 2 && base == 10) + break; + x = x * 8 + h; + d = d * 10 + h; + p++; + } + if (base == 8 || i == 3) + { + data->type = REPL_TYPE_STRING; + data->text = g_new0 (gchar, 8); + g_unichar_to_utf8 (x, data->text); + } + else + { + data->type = REPL_TYPE_NUMERIC_REFERENCE; + data->num = d; + } + break; + case 0: + error_detail = _("stray final “\\”"); + goto error; + break; + default: + error_detail = _("unknown escape sequence"); + goto error; + } + + return p; + +error: + /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ + tmp_error = g_error_new (G_REGEX_ERROR, + G_REGEX_ERROR_REPLACE, + _("Error while parsing replacement " + "text “%s” at char %lu: %s"), + replacement, + (gulong)(p - replacement), + error_detail); + g_propagate_error (error, tmp_error); + + return NULL; +} + +static GList * +split_replacement (const gchar *replacement, + GError **error) +{ + GList *list = NULL; + InterpolationData *data; + const gchar *p, *start; + + start = p = replacement; + while (*p) + { + if (*p == '\\') + { + data = g_new0 (InterpolationData, 1); + start = p = expand_escape (replacement, p, data, error); + if (p == NULL) + { + g_list_free_full (list, (GDestroyNotify) free_interpolation_data); + 
free_interpolation_data (data); + + return NULL; + } + list = g_list_prepend (list, data); + } + else + { + p++; + if (*p == '\\' || *p == '\0') + { + if (p - start > 0) + { + data = g_new0 (InterpolationData, 1); + data->text = g_strndup (start, p - start); + data->type = REPL_TYPE_STRING; + list = g_list_prepend (list, data); + } + } + } + } + + return g_list_reverse (list); +} + +/* Change the case of c based on change_case. */ +#define CHANGE_CASE(c, change_case) \ + (((change_case) & CHANGE_CASE_LOWER_MASK) ? \ + g_unichar_tolower (c) : \ + g_unichar_toupper (c)) + +static void +string_append (GString *string, + const gchar *text, + ChangeCase *change_case) +{ + gunichar c; + + if (text[0] == '\0') + return; + + if (*change_case == CHANGE_CASE_NONE) + { + g_string_append (string, text); + } + else if (*change_case & CHANGE_CASE_SINGLE_MASK) + { + c = g_utf8_get_char (text); + g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); + g_string_append (string, g_utf8_next_char (text)); + *change_case = CHANGE_CASE_NONE; + } + else + { + while (*text != '\0') + { + c = g_utf8_get_char (text); + g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); + text = g_utf8_next_char (text); + } + } +} + +static gboolean +interpolate_replacement (const ImplMatchInfo *match_info, + GString *result, + gpointer data) +{ + GList *list; + InterpolationData *idata; + gchar *match; + ChangeCase change_case = CHANGE_CASE_NONE; + + for (list = data; list; list = list->next) + { + idata = list->data; + switch (idata->type) + { + case REPL_TYPE_STRING: + string_append (result, idata->text, &change_case); + break; + case REPL_TYPE_CHARACTER: + g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); + if (change_case & CHANGE_CASE_SINGLE_MASK) + change_case = CHANGE_CASE_NONE; + break; + case REPL_TYPE_NUMERIC_REFERENCE: + match = impl_match_info_fetch (match_info, idata->num); + if (match) + { + string_append (result, match, &change_case); + g_free (match); 
+ } + break; + case REPL_TYPE_SYMBOLIC_REFERENCE: + match = impl_match_info_fetch_named (match_info, idata->text); + if (match) + { + string_append (result, match, &change_case); + g_free (match); + } + break; + case REPL_TYPE_CHANGE_CASE: + change_case = idata->change_case; + break; + default: + g_warn_if_reached (); + break; + } + } + + return FALSE; +} + +char * +impl_regex_replace (const ImplRegex *regex, + const char *string, + gssize string_len, + int start_position, + const char *replacement, + GRegexMatchFlags match_options, + GError **error) +{ + char *result; + GList *list; + GError *tmp_error = NULL; + + g_return_val_if_fail (regex != NULL, NULL); + g_return_val_if_fail (string != NULL, NULL); + g_return_val_if_fail (start_position >= 0, NULL); + g_return_val_if_fail (replacement != NULL, NULL); + g_return_val_if_fail (error == NULL || *error == NULL, NULL); + + list = split_replacement (replacement, &tmp_error); + + if (tmp_error != NULL) + { + g_propagate_error (error, tmp_error); + return NULL; + } + + result = impl_regex_replace_eval (regex, + string, string_len, start_position, + match_options, + interpolate_replacement, + (gpointer)list, + &tmp_error); + + if (tmp_error != NULL) + g_propagate_error (error, tmp_error); + + g_list_free_full (list, (GDestroyNotify) free_interpolation_data); + + return result; +} + +gboolean +impl_match_info_fetch_pos (const ImplMatchInfo *match_info, + int match_num, + int *start_pos, + int *end_pos) +{ + g_return_val_if_fail (match_info != NULL, FALSE); + g_return_val_if_fail (match_info->match_data != NULL, FALSE); + g_return_val_if_fail (match_info->offsets != NULL, FALSE); + g_return_val_if_fail (match_num >= 0, FALSE); + + if (match_info->matches < 0) + return FALSE; + + /* make sure the sub expression number they're requesting is less than + * the total number of sub expressions in the regex. 
When matching all + * (g_regex_match_all()), also compare against the number of matches */ + if (match_num >= MAX (match_info->matches, match_info->n_subpatterns + 1)) + return FALSE; + + if (start_pos) + *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1; + + if (end_pos) + *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1; + + return TRUE; +} + +gboolean +impl_match_info_fetch_named_pos (const ImplMatchInfo *match_info, + const char *name, + int *start_pos, + int *end_pos) +{ + int num; + + g_return_val_if_fail (match_info != NULL, FALSE); + g_return_val_if_fail (match_info->match_data != NULL, FALSE); + g_return_val_if_fail (match_info->regex != NULL, FALSE); + g_return_val_if_fail (start_pos != NULL, FALSE); + g_return_val_if_fail (end_pos != NULL, FALSE); + + num = pcre2_substring_number_from_name (match_info->regex->code, (PCRE2_SPTR)name); + + if (num >= 0) + { + return impl_match_info_fetch_pos (match_info, num, start_pos, end_pos); + } + + return FALSE; +} + +gboolean +impl_match_info_matches (const ImplMatchInfo *match_info) +{ + g_return_val_if_fail (match_info != NULL, FALSE); + g_return_val_if_fail (match_info->matches != 0, FALSE); + + return match_info->matches >= 0; +} + +gboolean +impl_match_info_next (ImplMatchInfo *match_info, + GError **error) +{ + gssize prev_match_start; + gssize prev_match_end; + + g_return_val_if_fail (match_info != NULL, FALSE); + g_return_val_if_fail (error == NULL || *error == NULL, FALSE); + g_return_val_if_fail (match_info->pos >= 0, FALSE); + + prev_match_start = match_info->offsets[0]; + prev_match_end = match_info->offsets[1]; + + if (match_info->pos > match_info->string_len) + { + /* we have reached the end of the string */ + match_info->pos = -1; + match_info->matches = PCRE2_ERROR_NOMATCH; + return FALSE; + } + + if (match_info->regex->has_jit) + { + match_info->matches = pcre2_jit_match (match_info->regex->code, + 
(PCRE2_SPTR)match_info->string, + match_info->string_len, + match_info->pos, + match_info->match_flags, + match_info->match_data, + NULL); + } + else + { + gsize match_flags = match_info->regex->match_flags | match_info->match_flags; + + if (match_info->regex->compile_flags & PCRE2_UTF) + match_flags |= PCRE2_NO_UTF_CHECK; + + match_info->matches = pcre2_match (match_info->regex->code, + (PCRE2_SPTR)match_info->string, + match_info->string_len, + match_info->pos, + match_flags, + match_info->match_data, + NULL); + } + + if (set_regex_error (error, match_info->matches)) + return FALSE; + + /* avoid infinite loops if the pattern is an empty string or something + * equivalent */ + if (match_info->pos == match_info->offsets[1]) + { + if (match_info->pos > match_info->string_len) + { + /* we have reached the end of the string */ + match_info->pos = -1; + match_info->matches = PCRE2_ERROR_NOMATCH; + return FALSE; + } + + match_info->pos = NEXT_CHAR (match_info->regex, &match_info->string[match_info->pos]) - + match_info->string; + + + } + else + { + match_info->pos = match_info->offsets[1]; + } + + g_assert (match_info->matches <= (int)match_info->n_subpatterns + 1); + + /* it's possible to get two identical matches when we are matching + * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and + * the string is "RegExTest" we have: + * - search at position 0: match from 0 to 0 + * - search at position 1: match from 3 to 3 + * - search at position 3: match from 3 to 3 (duplicate) + * - search at position 4: match from 5 to 5 + * - search at position 5: match from 5 to 5 (duplicate) + * - search at position 6: no match -> stop + * so we have to ignore the duplicates. 
+ * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */ + if (match_info->matches >= 0 && + prev_match_start == match_info->offsets[0] && + prev_match_end == match_info->offsets[1]) + { + /* ignore this match and search the next one */ + return impl_match_info_next (match_info, error); + } + + return match_info->matches >= 0; +} + +int +impl_regex_get_max_lookbehind (const ImplRegex *regex) +{ + uint32_t value = 0; + + g_return_val_if_fail (regex != NULL, 0); + g_return_val_if_fail (regex->code != NULL, 0); + + pcre2_pattern_info (regex->code, PCRE2_INFO_MAXLOOKBEHIND, &value); + + return value; +} + +gboolean +impl_match_info_is_partial_match (const ImplMatchInfo *match_info) +{ + g_return_val_if_fail (match_info != NULL, FALSE); + + return match_info->matches == PCRE2_ERROR_PARTIAL; +} + +int +impl_match_info_get_match_count (const ImplMatchInfo *match_info) +{ + g_return_val_if_fail (match_info != NULL, 0); + + return MAX (0, match_info->matches); +} diff --git a/gtksourceview/meson.build b/gtksourceview/meson.build index b6192604..c4ddb4a8 100644 --- a/gtksourceview/meson.build +++ b/gtksourceview/meson.build @@ -103,6 +103,7 @@ core_private_c = files([ 'gtksourcepixbufhelper.c', 'gtksourceregex.c', 'gtksourceundomanagerdefault.c', + 'implregex.c', ]) core_c_args = [ @@ -119,6 +120,7 @@ core_deps = [ gio_dep, gtk_dep, libxml_dep, + pcre2_dep, ] if config_h.has('OS_OSX') diff --git a/meson.build b/meson.build index 039d8390..0b76dd7f 100644 --- a/meson.build +++ b/meson.build @@ -79,6 +79,7 @@ gladeui_req = '>= 3.9' introspection_req = '>= 1.42.0' gtk_doc_req = '>= 1.25' fribidi_req = '>= 0.19.7' +pcre2_req = '>= 10.21' glib_dep = dependency('glib-2.0', version: glib_req) gobject_dep = dependency('gobject-2.0', version: glib_req) @@ -86,6 +87,7 @@ gio_dep = dependency('gio-2.0', version: glib_req) gtk_dep = dependency('gtk+-3.0', version: gtk_req) libxml_dep = dependency('libxml-2.0', version: libxml_req, required: cc.get_id() != 'msvc') 
fribidi_dep = dependency('fribidi', version: fribidi_req) +pcre2_dep = dependency('libpcre2-8', version: pcre2_req, fallback : ['pcre2', 'libpcre2_8']) gtk_quartz_dep = dependency('gtk+-quartz-3.0', version: gtk_doc_req, required: false) diff --git a/subprojects/pcre2.wrap b/subprojects/pcre2.wrap new file mode 100644 index 00000000..65417c61 --- /dev/null +++ b/subprojects/pcre2.wrap @@ -0,0 +1,10 @@ +[wrap-file] +directory = pcre2-10.23 + +source_url = https://github.com/PhilipHazel/pcre2/releases/download/pcre2-10.23/pcre2-10.23.zip +source_filename = pcre2-10.23.zip +source_hash = 6301a525a8a7e63a5fac0c2fbfa0374d3eb133e511d886771e097e427707094a + +patch_url = https://wrapdb.mesonbuild.com/v1/projects/pcre2/10.23/1/get_zip +patch_filename = pcre2-10.23-1-wrap.zip +patch_hash = ad6b4f042a911d06805fbbeeb9ffed0a988b282561164d0624a3ce02e93d4e24 |