summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--runtime/doc/pattern.txt15
-rw-r--r--src/regexp.c21
-rw-r--r--src/regexp_nfa.c44
-rw-r--r--src/version.c2
4 files changed, 69 insertions, 13 deletions
diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt
index 80688f918..8bc621c9c 100644
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@@ -545,6 +545,7 @@ Character classes {not in Vi}: */character-classes*
|/\%u| \%u \%u match specified multibyte character (eg \%u20ac)
|/\%U| \%U \%U match specified large multibyte character (eg
\%U12345678)
+|/\%C| \%C \%C match any composing characters
Example matches ~
\<\I\i* or
@@ -1207,12 +1208,18 @@ will probably never match.
8. Composing characters *patterns-composing*
*/\Z*
-When "\Z" appears anywhere in the pattern, composing characters are ignored.
-Thus only the base characters need to match, the composing characters may be
-different and the number of composing characters may differ. Only relevant
-when 'encoding' is "utf-8".
+When "\Z" appears anywhere in the pattern, all composing characters are
+ignored. Thus only the base characters need to match, the composing
+characters may be different and the number of composing characters may differ.
+Only relevant when 'encoding' is "utf-8".
Exception: If the pattern starts with one or more composing characters, these
must match.
+ */\%C*
+Use "\%C" to skip any composing characters. For example, the pattern "a" does
+not match in "càt" (where the a has the composing character 0x0300), but
+"a\%C" does. Note that this does not match "cát" (where the á is character
+0xe1, it does not have a composing character). It does match "cat" (where
+the a is just an a).
When a composing character appears at the start of the pattern or after an
item that doesn't include the composing character, a match is found at any
diff --git a/src/regexp.c b/src/regexp.c
index d66cd2062..26fb813c8 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -244,6 +244,7 @@
#define RE_MARK 207 /* mark cmp Match mark position */
#define RE_VISUAL 208 /* Match Visual area */
+#define RE_COMPOSING 209 /* any composing characters */
/*
* Magic characters have a special meaning, they don't match literally.
@@ -2208,6 +2209,10 @@ regatom(flagp)
ret = regnode(RE_VISUAL);
break;
+ case 'C':
+ ret = regnode(RE_COMPOSING);
+ break;
+
/* \%[abc]: Emit as a list of branches, all ending at the last
* branch which matches nothing. */
case '[':
@@ -4710,11 +4715,13 @@ regmatch(scan)
status = RA_NOMATCH;
}
#ifdef FEAT_MBYTE
- /* Check for following composing character. */
+ /* Check for following composing character, unless %C
+ * follows (skips over all composing chars). */
if (status != RA_NOMATCH
&& enc_utf8
&& UTF_COMPOSINGLIKE(reginput, reginput + len)
- && !ireg_icombine)
+ && !ireg_icombine
+ && OP(next) != RE_COMPOSING)
{
/* raaron: This code makes a composing character get
* ignored, which is the correct behavior (sometimes)
@@ -4791,6 +4798,16 @@ regmatch(scan)
status = RA_NOMATCH;
break;
#endif
+ case RE_COMPOSING:
+#ifdef FEAT_MBYTE
+ if (enc_utf8)
+ {
+ /* Skip composing characters. */
+ while (utf_iscomposing(utf_ptr2char(reginput)))
+ mb_cptr_adv(reginput);
+ }
+#endif
+ break;
case NOTHING:
break;
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index 3873d9ab2..a7fbe7b25 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -81,6 +81,7 @@ enum
NFA_COMPOSING, /* Next nodes in NFA are part of the
composing multibyte char */
NFA_END_COMPOSING, /* End of a composing char in the NFA */
+ NFA_ANY_COMPOSING, /* \%C: Any composing characters. */
NFA_OPT_CHARS, /* \%[abc] */
/* The following are used only in the postfix form, not in the NFA */
@@ -1418,6 +1419,10 @@ nfa_regatom()
EMIT(NFA_VISUAL);
break;
+ case 'C':
+ EMIT(NFA_ANY_COMPOSING);
+ break;
+
case '[':
{
int n;
@@ -2429,6 +2434,7 @@ nfa_set_code(c)
case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
+ case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
@@ -2967,6 +2973,7 @@ nfa_max_width(startstate, depth)
case NFA_NLOWER_IC:
case NFA_UPPER_IC:
case NFA_NUPPER_IC:
+ case NFA_ANY_COMPOSING:
/* possibly non-ascii */
#ifdef FEAT_MBYTE
if (has_mbyte)
@@ -4152,6 +4159,7 @@ match_follows(startstate, depth)
continue;
case NFA_ANY:
+ case NFA_ANY_COMPOSING:
case NFA_IDENT:
case NFA_SIDENT:
case NFA_KWORD:
@@ -4395,7 +4403,7 @@ skip_add:
switch (state->c)
{
case NFA_MATCH:
- nfa_match = TRUE;
+// nfa_match = TRUE;
break;
case NFA_SPLIT:
@@ -5151,6 +5159,7 @@ failure_chance(state, depth)
case NFA_MATCH:
case NFA_MCLOSE:
+ case NFA_ANY_COMPOSING:
/* empty match works always */
return 0;
@@ -5573,6 +5582,12 @@ nfa_regmatch(prog, start, submatch, m)
{
case NFA_MATCH:
{
+#ifdef FEAT_MBYTE
+ /* If the match ends before a composing character and
+ * ireg_icombine is not set, that is not really a match. */
+ if (enc_utf8 && !ireg_icombine && utf_iscomposing(curc))
+ break;
+#endif
nfa_match = TRUE;
copy_sub(&submatch->norm, &t->subs.norm);
#ifdef FEAT_SYN_HL
@@ -6120,6 +6135,23 @@ nfa_regmatch(prog, start, submatch, m)
}
break;
+ case NFA_ANY_COMPOSING:
+ /* On a composing character skip over it. Otherwise do
+ * nothing. Always matches. */
+#ifdef FEAT_MBYTE
+ if (enc_utf8 && utf_iscomposing(curc))
+ {
+ add_off = clen;
+ }
+ else
+#endif
+ {
+ add_here = TRUE;
+ add_off = 0;
+ }
+ add_state = t->state->out;
+ break;
+
/*
* Character classes like \a for alpha, \d for digit etc.
*/
@@ -6484,12 +6516,10 @@ nfa_regmatch(prog, start, submatch, m)
if (!result && ireg_ic)
result = MB_TOLOWER(c) == MB_TOLOWER(curc);
#ifdef FEAT_MBYTE
- /* If there is a composing character which is not being
- * ignored there can be no match. Match with composing
- * character uses NFA_COMPOSING above. */
- if (result && enc_utf8 && !ireg_icombine
- && clen != utf_char2len(curc))
- result = FALSE;
+ /* If ireg_icombine is not set only skip over the character
+ * itself. When it is set skip over composing characters. */
+ if (result && enc_utf8 && !ireg_icombine)
+ clen = utf_char2len(curc);
#endif
ADD_STATE_IF_MATCH(t->state);
break;
diff --git a/src/version.c b/src/version.c
index 5efd62d2a..ed7289111 100644
--- a/src/version.c
+++ b/src/version.c
@@ -735,6 +735,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
+ 293,
+/**/
292,
/**/
291,