diff options
author | Bram Moolenaar <Bram@vim.org> | 2013-06-01 19:54:43 +0200 |
---|---|---|
committer | Bram Moolenaar <Bram@vim.org> | 2013-06-01 19:54:43 +0200 |
commit | 61602c5bfe7d4a4a0a6671b132f5b98d7d9da424 (patch) | |
tree | fda5a8fcf6a325647deaf4b25c5071bbe8b5357c | |
parent | 543b7ef7000d08d77409478315d68b607bb8bad8 (diff) | |
download | vim-git-61602c5bfe7d4a4a0a6671b132f5b98d7d9da424.tar.gz |
updated for version 7.3.1088v7.3.1088
Problem: New regexp engine: \@<= and \@<! are not implemented.
Solution: Implement look-behind matching. Fix off-by-one error in old
regexp engine.
-rw-r--r-- | src/regexp.c | 16 | ||||
-rw-r--r-- | src/regexp_nfa.c | 194 | ||||
-rw-r--r-- | src/testdir/test64.in | 17 | ||||
-rw-r--r-- | src/testdir/test64.ok | 11 | ||||
-rw-r--r-- | src/version.c | 2 |
5 files changed, 189 insertions, 51 deletions
diff --git a/src/regexp.c b/src/regexp.c index cfeee4c27..3eaf74dd9 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -5576,7 +5576,14 @@ regmatch(scan) limit = OPERAND_MIN(rp->rs_scan); if (REG_MULTI) { - if (rp->rs_un.regsave.rs_u.pos.col == 0) + if (limit > 0 + && ((rp->rs_un.regsave.rs_u.pos.lnum + < behind_pos.rs_u.pos.lnum + ? (colnr_T)STRLEN(regline) + : behind_pos.rs_u.pos.col) + - rp->rs_un.regsave.rs_u.pos.col >= limit)) + no = FAIL; + else if (rp->rs_un.regsave.rs_u.pos.col == 0) { if (rp->rs_un.regsave.rs_u.pos.lnum < behind_pos.rs_u.pos.lnum @@ -5601,13 +5608,6 @@ regmatch(scan) else #endif --rp->rs_un.regsave.rs_u.pos.col; - if (limit > 0 - && ((rp->rs_un.regsave.rs_u.pos.lnum - < behind_pos.rs_u.pos.lnum - ? (colnr_T)STRLEN(regline) - : behind_pos.rs_u.pos.col) - - rp->rs_un.regsave.rs_u.pos.col > limit)) - no = FAIL; } } else diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c index fffe4a0cc..17865d6d2 100644 --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -56,6 +56,7 @@ enum NFA_NOPEN, /* Start of subexpression marked with \%( */ NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ NFA_START_INVISIBLE, + NFA_START_INVISIBLE_BEFORE, NFA_END_INVISIBLE, NFA_COMPOSING, /* Next nodes in NFA are part of the composing multibyte char */ @@ -1369,34 +1370,43 @@ nfa_regpiece() break; case Magic('@'): + c2 = getdecchrs(); op = no_Magic(getchr()); + i = 0; switch(op) { case '=': - EMIT(NFA_PREV_ATOM_NO_WIDTH); + /* \@= */ + i = NFA_PREV_ATOM_NO_WIDTH; break; case '!': - EMIT(NFA_PREV_ATOM_NO_WIDTH_NEG); + /* \@! */ + i = NFA_PREV_ATOM_NO_WIDTH_NEG; break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': case '<': + op = no_Magic(getchr()); + if (op == '=') + /* \@<= */ + i = NFA_PREV_ATOM_JUST_BEFORE; + else if (op == '!') + /* \@<! */ + i = NFA_PREV_ATOM_JUST_BEFORE_NEG; + break; case '>': - /* Not supported yet */ - return FAIL; - default: - syntax_error = TRUE; - EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); + /* \@> Not supported yet */ + /* i = NFA_PREV_ATOM_LIKE_PATTERN; */ return FAIL; } + if (i == 0) + { + syntax_error = TRUE; + EMSGN(_("E869: (NFA) Unknown operator '\\@%c'"), op); + return FAIL; + } + EMIT(i); + if (i == NFA_PREV_ATOM_JUST_BEFORE + || i == NFA_PREV_ATOM_JUST_BEFORE_NEG) + EMIT(c2); break; case Magic('?'): @@ -1734,9 +1744,15 @@ nfa_set_code(c) STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break; case NFA_PREV_ATOM_NO_WIDTH_NEG: STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break; + case NFA_PREV_ATOM_JUST_BEFORE: + STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break; + case NFA_PREV_ATOM_JUST_BEFORE_NEG: + STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break; case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break; case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break; case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; + case NFA_START_INVISIBLE_BEFORE: + STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break; case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; @@ -2237,7 +2253,7 @@ post2nfa(postfix, end, nfa_calc_size) if (nfa_calc_size == FALSE) { /* Allocate space for the stack. Max states on the stack : nstate */ - stack = (Frag_T *) lalloc((nstate + 1) * sizeof(Frag_T), TRUE); + stack = (Frag_T *)lalloc((nstate + 1) * sizeof(Frag_T), TRUE); stackp = stack; stack_end = stack + (nstate + 1); } @@ -2370,8 +2386,12 @@ post2nfa(postfix, end, nfa_calc_size) case NFA_PREV_ATOM_NO_WIDTH: case NFA_PREV_ATOM_NO_WIDTH_NEG: + case NFA_PREV_ATOM_JUST_BEFORE: + case NFA_PREV_ATOM_JUST_BEFORE_NEG: /* The \@= operator: match the preceding atom with zero width. * The \@! operator: no match for the preceding atom. + * The \@<= operator: match for the preceding atom. + * The \@<! operator: no match for the preceding atom. * Surrounds the preceding atom with START_INVISIBLE and * END_INVISIBLE, similarly to MOPEN. */ @@ -2389,11 +2409,18 @@ post2nfa(postfix, end, nfa_calc_size) s = new_state(NFA_START_INVISIBLE, e.start, s1); if (s == NULL) goto theend; - if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG) + if (*p == NFA_PREV_ATOM_NO_WIDTH_NEG + || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG) { s->negated = TRUE; s1->negated = TRUE; } + if (*p == NFA_PREV_ATOM_JUST_BEFORE + || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG) + { + s->val = *++p; /* get the count */ + ++s->c; /* NFA_START_INVISIBLE -> NFA_START_INVISIBLE_BEFORE */ + } PUSH(frag(s, list1(&s1->out))); break; @@ -3307,21 +3334,24 @@ nfa_re_num_cmp(val, op, pos) return val == pos; } -static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m)); +static int nfa_regmatch __ARGS((nfa_state_T *start, regsub_T *submatch, regsub_T *m, save_se_T *endp)); /* * Main matching routine. * * Run NFA to determine whether it matches reginput. * + * When "endp" is not NULL it is a required end-of-match position. + * * Return TRUE if there is a match, FALSE otherwise. * Note: Caller must ensure that: start != NULL. */ static int -nfa_regmatch(start, submatch, m) +nfa_regmatch(start, submatch, m, endp) nfa_state_T *start; regsub_T *submatch; regsub_T *m; + save_se_T *endp; { int result; int size = 0; @@ -3532,16 +3562,42 @@ nfa_regmatch(start, submatch, m) } case NFA_END_INVISIBLE: - /* This is only encountered after a NFA_START_INVISIBLE node. - * They surround a zero-width group, used with "\@=" and "\&". + /* This is only encountered after a NFA_START_INVISIBLE or + * NFA_START_INVISIBLE_BEFORE node. + * They surround a zero-width group, used with "\@=", "\&", + * "\@!", "\@<=" and "\@<!". * If we got here, it means that the current "invisible" group * finished successfully, so return control to the parent * nfa_regmatch(). Submatches are stored in *m, and used in * the parent call. */ if (start->c == NFA_MOPEN + 0) + /* TODO: do we ever get here? */ addstate_here(thislist, t->state->out, &t->sub, &listidx); else { +#ifdef ENABLE_LOG + if (endp != NULL) + { + if (REG_MULTI) + fprintf(log_fd, "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n", + (int)reglnum, + (int)endp->se_u.pos.lnum, + (int)(reginput - regline), + endp->se_u.pos.col); + else + fprintf(log_fd, "Current col: %d, endp col: %d\n", + (int)(reginput - regline), + (int)(endp->se_u.ptr - reginput)); + } +#endif + /* It's only a match if it ends at "endp" */ + if (endp != NULL && (REG_MULTI + ? (reglnum != endp->se_u.pos.lnum + || (int)(reginput - regline) + != endp->se_u.pos.col) + : reginput != endp->se_u.ptr)) + break; + /* do not set submatches for \@! */ if (!t->state->negated) /* TODO: only copy positions in use. */ @@ -3551,11 +3607,70 @@ nfa_regmatch(start, submatch, m) break; case NFA_START_INVISIBLE: + case NFA_START_INVISIBLE_BEFORE: { - char_u *save_reginput = reginput; - char_u *save_regline = regline; - int save_reglnum = reglnum; - int save_nfa_match = nfa_match; + char_u *save_reginput = reginput; + char_u *save_regline = regline; + int save_reglnum = reglnum; + int save_nfa_match = nfa_match; + save_se_T endpos; + save_se_T *endposp = NULL; + + if (t->state->c == NFA_START_INVISIBLE_BEFORE) + { + /* The recursive match must end at the current position. */ + endposp = &endpos; + if (REG_MULTI) + { + endpos.se_u.pos.col = (int)(reginput - regline); + endpos.se_u.pos.lnum = reglnum; + } + else + endpos.se_u.ptr = reginput; + + /* Go back the specified number of bytes, or as far as the + * start of the previous line, to try matching "\@<=" or + * not matching "\@<!". */ + if (t->state->val <= 0) + { + if (REG_MULTI) + { + regline = reg_getline(--reglnum); + if (regline == NULL) + /* can't go before the first line */ + regline = reg_getline(++reglnum); + } + reginput = regline; + } + else + { + if (REG_MULTI + && (int)(reginput - regline) < t->state->val) + { + /* Not enough bytes in this line, go to end of + * previous line. */ + regline = reg_getline(--reglnum); + if (regline == NULL) + { + /* can't go before the first line */ + regline = reg_getline(++reglnum); + reginput = regline; + } + else + reginput = regline + STRLEN(regline); + } + if ((int)(reginput - regline) >= t->state->val) + { + reginput -= t->state->val; +#ifdef FEAT_MBYTE + if (has_mbyte) + reginput -= mb_head_off(regline, reginput); +#endif + } + else + reginput = regline; + } + } /* Call nfa_regmatch() to check if the current concat matches * at this position. The concat ends with the node @@ -3579,7 +3694,7 @@ nfa_regmatch(start, submatch, m) * recursion. */ nfa_save_listids(start, listids); nfa_set_null_listids(start); - result = nfa_regmatch(t->state->out, submatch, m); + result = nfa_regmatch(t->state->out, submatch, m, endposp); nfa_set_neg_listids(start); nfa_restore_listids(start, listids); @@ -4120,11 +4235,21 @@ nfa_regmatch(start, submatch, m) * matters! * Do not add the start state in recursive calls of nfa_regmatch(), * because recursive calls should only start in the first position. + * Unless "endp" is not NULL, then we match the end position. * Also don't start a match past the first line. */ - if (nfa_match == FALSE && start->c == NFA_MOPEN + 0 - && reglnum == 0 && clen != 0 - && (ireg_maxcol == 0 - || (colnr_T)(reginput - regline) < ireg_maxcol)) + if (nfa_match == FALSE + && ((start->c == NFA_MOPEN + 0 + && reglnum == 0 + && clen != 0 + && (ireg_maxcol == 0 + || (colnr_T)(reginput - regline) < ireg_maxcol)) + || (endp != NULL + && (REG_MULTI + ? (reglnum < endp->se_u.pos.lnum + || (reglnum == endp->se_u.pos.lnum + && (int)(reginput - regline) + < endp->se_u.pos.col)) + : reginput < endp->se_u.ptr)))) { #ifdef ENABLE_LOG fprintf(log_fd, "(---) STARTSTATE\n"); @@ -4148,7 +4273,8 @@ nextchar: * finish. */ if (clen != 0) reginput += clen; - else if (go_to_nextline) + else if (go_to_nextline || (endp != NULL && REG_MULTI + && reglnum < endp->se_u.pos.lnum)) reg_nextline(); else break; @@ -4225,7 +4351,7 @@ nfa_regtry(start, col) sub.in_use = 0; m.in_use = 0; - if (nfa_regmatch(start, &sub, &m) == FALSE) + if (nfa_regmatch(start, &sub, &m, NULL) == FALSE) return 0; cleanup_subexpr(); diff --git a/src/testdir/test64.in b/src/testdir/test64.in index 962f60ebe..78909246f 100644 --- a/src/testdir/test64.in +++ b/src/testdir/test64.in @@ -363,12 +363,13 @@ STARTTEST :call add(tl, [2, '\(a\)\(b\)\(c\)\(dd\)\(e\)\(f\)\(g\)\(h\)\(i\)\1\2\3\4\5\6\7\8\9', 'xabcddefghiabcddefghix', 'abcddefghiabcddefghi', 'a', 'b', 'c', 'dd', 'e', 'f', 'g', 'h', 'i']) :" :"""" Look-behind with limit -:call add(tl, [0, '<\@<=span.', 'xxspanxx<spanyyy', 'spany']) -:call add(tl, [0, '<\@1<=span.', 'xxspanxx<spanyyy', 'spany']) -:call add(tl, [0, '<\@2<=span.', 'xxspanxx<spanyyy', 'spany']) -:call add(tl, [0, '\(<<\)\@<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<']) -:call add(tl, [0, '\(<<\)\@1<=span.', 'xxspanxxxx<spanxx<<spanyyy']) -:call add(tl, [0, '\(<<\)\@2<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<']) +:call add(tl, [2, '<\@<=span.', 'xxspanxx<spanyyy', 'spany']) +:call add(tl, [2, '<\@1<=span.', 'xxspanxx<spanyyy', 'spany']) +:call add(tl, [2, '<\@2<=span.', 'xxspanxx<spanyyy', 'spany']) +:call add(tl, [2, '\(<<\)\@<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<']) +:call add(tl, [2, '\(<<\)\@1<=span.', 'xxspanxxxx<spanxx<<spanyyy']) +:call add(tl, [2, '\(<<\)\@2<=span.', 'xxspanxxxx<spanxx<<spanyyy', 'spany', '<<']) +:call add(tl, [2, '\(foo\)\@<!bar.', 'xx foobar1 xbar2 xx', 'bar2']) :" :"""" "\_" prepended negated collection matches EOL :call add(tl, [2, '\_[^8-9]\+', "asfi\n9888", "asfi\n"]) @@ -514,8 +515,8 @@ Behind: asdfasd<yyy xxstart1 asdfasd<yy -xxxxstart2 +xxxstart2 asdfasd<yy -xxxstart3 +xxstart3 Results of test64: diff --git a/src/testdir/test64.ok b/src/testdir/test64.ok index b014a16cb..31baa4f21 100644 --- a/src/testdir/test64.ok +++ b/src/testdir/test64.ok @@ -817,16 +817,25 @@ OK 1 - \(a\)\(b\)\(c\)\(dd\)\(e\)\(f\)\(g\)\(h\)\(i\)\1\2\3\4\5\6\7\8\9 OK 2 - \(a\)\(b\)\(c\)\(dd\)\(e\)\(f\)\(g\)\(h\)\(i\)\1\2\3\4\5\6\7\8\9 OK 0 - <\@<=span. OK 1 - <\@<=span. +OK 2 - <\@<=span. OK 0 - <\@1<=span. OK 1 - <\@1<=span. +OK 2 - <\@1<=span. OK 0 - <\@2<=span. OK 1 - <\@2<=span. +OK 2 - <\@2<=span. OK 0 - \(<<\)\@<=span. OK 1 - \(<<\)\@<=span. +OK 2 - \(<<\)\@<=span. OK 0 - \(<<\)\@1<=span. OK 1 - \(<<\)\@1<=span. +OK 2 - \(<<\)\@1<=span. OK 0 - \(<<\)\@2<=span. OK 1 - \(<<\)\@2<=span. +OK 2 - \(<<\)\@2<=span. +OK 0 - \(foo\)\@<!bar. +OK 1 - \(foo\)\@<!bar. +OK 2 - \(foo\)\@<!bar. OK 0 - \_[^8-9]\+ OK 1 - \_[^8-9]\+ OK 2 - \_[^8-9]\+ @@ -844,7 +853,7 @@ OK 2 - [0-9a-zA-Z]\{8}-\([0-9a-zA-Z]\{4}-\)\{3}[0-9a-zA-Z]\{12} <T="7">Ac 7</Title> ghi -xxxstart3 +xxstart3 -0- ffo bob diff --git a/src/version.c b/src/version.c index 641fb6793..0427db3f2 100644 --- a/src/version.c +++ b/src/version.c @@ -729,6 +729,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 1088, +/**/ 1087, /**/ 1086, |