diff options
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 392 |
1 files changed, 392 insertions, 0 deletions
@@ -1764,6 +1764,18 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ #define getGCB_VAL_UTF8(pos, strend) \ _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) +/* Returns the LB value for the input code point */ +#define getLB_VAL_CP(cp) \ + _generic_GET_BREAK_VAL_CP( \ + PL_LB_invlist, \ + _Perl_LB_invmap, \ + (cp)) + +/* Returns the LB value for the first code point in the UTF-8 encoded string + * bounded by pos and strend */ +#define getLB_VAL_UTF8(pos, strend) \ + _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend) + /* Returns the SB value for the input code point */ #define getSB_VAL_CP(cp) \ @@ -2115,6 +2127,63 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } break; + case LB_BOUND: + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) { + goto got_it; + } + s += (utf8_target) ? UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } + } + + if (utf8_target) { + LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); + if (to_complement ^ isLB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; + } + before = after; + s += UTF8SKIP(s); + } + } + else { /* Not utf8. 
*/ + LB_enum before = getLB_VAL_CP((U8) *(s -1)); + while (s < strend) { + LB_enum after = getLB_VAL_CP((U8) *s); + if (to_complement ^ isLB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; + } + before = after; + s++; + } + } + + if (reginfo->intuit || regtry(reginfo, &s)) { + goto got_it; + } + + break; + case SB_BOUND: if (s == reginfo->strbeg) { if (reginfo->intuit || regtry(reginfo, &s)) { @@ -4277,6 +4346,290 @@ S_isGCB(const GCB_enum before, const GCB_enum after) NOT_REACHED; /* NOTREACHED */ } +/* Combining marks attach to most classes that precede them, but this defines + * the exceptions (from TR14) */ +#define LB_CM_ATTACHES_TO(prev) ( ! ( (prev) == LB_EDGE \ + || (prev) == LB_Mandatory_Break \ + || (prev) == LB_Carriage_Return \ + || (prev) == LB_Line_Feed \ + || (prev) == LB_Next_Line \ + || (prev) == LB_Space \ + || (prev) == LB_ZWSpace)) + +STATIC bool +S_isLB(pTHX_ LB_enum before, + LB_enum after, + const U8 * const strbeg, + const U8 * const curpos, + const U8 * const strend, + const bool utf8_target) +{ + U8 * temp_pos = (U8 *) curpos; + LB_enum prev = before; + + /* Is the boundary between 'before' and 'after' line-breakable? + * Most of this is just a table lookup of a generated table from Unicode + * rules. But some rules require context to decide, and so have to be + * implemented in code */ + + PERL_ARGS_ASSERT_ISLB; + + /* Rule numbers in the comments below are as of Unicode 8.0 */ + + redo: + before = prev; + switch (LB_table[before][after]) { + case LB_BREAKABLE: + return TRUE; + + case LB_NOBREAK: + case LB_NOBREAK_EVEN_WITH_SP_BETWEEN: + return FALSE; + + case LB_SP_foo + LB_BREAKABLE: + case LB_SP_foo + LB_NOBREAK: + case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN: + + /* When we have something following a SP, we have to look at the + * context in order to know what to do. 
+ * + * SP SP should not reach here because LB7: Do not break before + * spaces. (For two spaces in a row there is nothing that + * overrides that) */ + assert(after != LB_Space); + + /* Here we have a space followed by a non-space. Mostly this is a + * case of LB18: "Break after spaces". But there are complications + * as the handling of spaces is somewhat tricky. They are in a + * number of rules, which have to be applied in priority order, but + * something earlier in the string can cause a rule to be skipped + * and a lower priority rule invoked. A prime example is LB7 which + * says don't break before a space. But rule LB8 (lower priority) + * says that the first break opportunity after a ZW is after any + * span of spaces immediately after it. If a ZW comes before a SP + * in the input, rule LB8 applies, and not LB7. Other such rules + * involve combining marks which are rules 9 and 10, but they may + * override higher priority rules if they come earlier in the + * string. Since we're doing random access into the middle of the + * string, we have to look for rules that should get applied based + * on both string position and priority. Combining marks do not + * attach to either ZW or SP, so we don't have to consider them + * until later. + * + * To check for LB8, we have to find the first non-space character + * before this span of spaces */ + do { + prev = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (prev == LB_Space); + + /* LB8 Break before any character following a zero-width space, + * even if one or more spaces intervene. + * ZW SP* ÷ + * So break if we have a ZW just before this span; to get here this + * is the final space in the span. */ + if (prev == LB_ZWSpace) { + return TRUE; + } + + /* Here, not ZW SP+. There are several rules that have higher + * priority than LB18 and can be resolved now, as they don't depend + * on anything earlier in the string (except ZW, which we have + * already handled). 
One of these rules is LB11 Do not break + * before Word joiner, but we have specially encoded that in the + * lookup table so it is caught by the single test below which + * catches the other ones. */ + if (LB_table[LB_Space][after] - LB_SP_foo + == LB_NOBREAK_EVEN_WITH_SP_BETWEEN) + { + return FALSE; + } + + /* If we get here, we have to XXX consider combining marks. */ + if (prev == LB_Combining_Mark) { + + /* What happens with these depends on the character they + * follow. */ + do { + prev = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (prev == LB_Combining_Mark); + + /* Most times these attach to and inherit the characteristics + * of that character, but not always, and when not, they are to + * be treated as AL by rule LB10. */ + if (! LB_CM_ATTACHES_TO(prev)) { + prev = LB_Alphabetic; + } + } + + /* Here, we have the character preceding the span of spaces all set + * up. We follow LB18: "Break after spaces" unless the table shows + * that is overridden */ + return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN; + + case LB_CM_foo: + + /* We don't know how to treat the CM except by looking at the first + * non-CM character preceding it */ + do { + prev = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (prev == LB_Combining_Mark); + + /* Here, 'prev' is that first earlier non-CM character. If the CM + * attaches to it, then it inherits the behavior of 'prev'. If it + * doesn't attach, it is to be treated as an AL */ + if (! LB_CM_ATTACHES_TO(prev)) { + prev = LB_Alphabetic; + } + + goto redo; + + case LB_HY_or_BA_then_foo + LB_BREAKABLE: + case LB_HY_or_BA_then_foo + LB_NOBREAK: + + /* LB21a Don't break after Hebrew + Hyphen. 
+ * HL (HY | BA) × */ + + if (backup_one_LB(strbeg, &temp_pos, utf8_target) + == LB_Hebrew_Letter) + { + return FALSE; + } + + return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE; + + case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE: + case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK: + + /* LB25a (PR | PO) × ( OP | HY )? NU */ + if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) { + return FALSE; + } + + return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY + == LB_BREAKABLE; + + case LB_SY_or_IS_then_various + LB_BREAKABLE: + case LB_SY_or_IS_then_various + LB_NOBREAK: + { + /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */ + + LB_enum temp = prev; + do { + temp = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric); + if (temp == LB_Numeric) { + return FALSE; + } + + return LB_table[prev][after] - LB_SY_or_IS_then_various + == LB_BREAKABLE; + } + + case LB_various_then_PO_or_PR + LB_BREAKABLE: + case LB_various_then_PO_or_PR + LB_NOBREAK: + { + /* LB25e NU (SY | IS)* (CL | CP)? 
× (PO | PR) */ + + LB_enum temp = prev; + if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis) + { + temp = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) { + temp = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + if (temp == LB_Numeric) { + return FALSE; + } + return LB_various_then_PO_or_PR; + } + + default: + break; + } + +#ifdef DEBUGGING + PerlIO_printf(Perl_error_log, "Unhandled LB pair: LB_table[%d, %d] = %d\n", + before, after, LB_table[before][after]); + assert(0); +#endif + return TRUE; +} + +STATIC LB_enum +S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) +{ + LB_enum lb; + + PERL_ARGS_ASSERT_ADVANCE_ONE_LB; + + if (*curpos >= strend) { + return LB_EDGE; + } + + if (utf8_target) { + *curpos += UTF8SKIP(*curpos); + if (*curpos >= strend) { + return LB_EDGE; + } + lb = getLB_VAL_UTF8(*curpos, strend); + } + else { + (*curpos)++; + if (*curpos >= strend) { + return LB_EDGE; + } + lb = getLB_VAL_CP(**curpos); + } + + return lb; +} + +STATIC LB_enum +S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) +{ + LB_enum lb; + + PERL_ARGS_ASSERT_BACKUP_ONE_LB; + + if (*curpos < strbeg) { + return LB_EDGE; + } + + if (utf8_target) { + U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); + U8 * prev_prev_char_pos; + + if (! 
prev_char_pos) { + return LB_EDGE; + } + + if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { + lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); + *curpos = prev_char_pos; + prev_char_pos = prev_prev_char_pos; + } + else { + *curpos = (U8 *) strbeg; + return LB_EDGE; + } + } + else { + if (*curpos - 2 < strbeg) { + *curpos = (U8 *) strbeg; + return LB_EDGE; + } + (*curpos)--; + lb = getLB_VAL_CP(*(*curpos - 1)); + } + + return lb; +} + #define SBcase(before, after) ((SB_ENUM_COUNT * before) + after) STATIC bool @@ -5700,6 +6053,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; + case LB_BOUND: + if (locinput == reginfo->strbeg) { + match = FALSE; + } + else if (NEXTCHR_IS_EOS) { + match = TRUE; + } + else { + match = isLB(getLB_VAL_UTF8( + reghop3((U8*)locinput, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend), + getLB_VAL_UTF8((U8*) locinput, + (U8*) reginfo->strend), + (U8*) reginfo->strbeg, + (U8*) locinput, + (U8*) reginfo->strend, + utf8_target); + } + break; + case SB_BOUND: /* Always matches at begin and end */ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { match = TRUE; @@ -5767,6 +6142,23 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; + case LB_BOUND: + if (locinput == reginfo->strbeg) { + match = FALSE; + } + else if (NEXTCHR_IS_EOS) { + match = TRUE; + } + else { + match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)), + getLB_VAL_CP(UCHARAT(locinput)), + (U8*) reginfo->strbeg, + (U8*) locinput, + (U8*) reginfo->strend, + utf8_target); + } + break; + case SB_BOUND: /* Always matches at begin and end */ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { match = TRUE; |