summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c392
1 files changed, 392 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index c057efe241..ffc1a62776 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1764,6 +1764,18 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
#define getGCB_VAL_UTF8(pos, strend) \
_generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
+/* Yields the Line Break (LB) property value of the code point 'cp', by
+ * handing the LB inversion list (PL_LB_invlist) and its inversion map
+ * (_Perl_LB_invmap) to the generic break-value lookup */
+#define getLB_VAL_CP(cp)                                                    \
+        _generic_GET_BREAK_VAL_CP(PL_LB_invlist, _Perl_LB_invmap, (cp))
+
+/* Yields the Line Break (LB) property value of the first code point of the
+ * UTF-8 encoded string that starts at 'pos' and ends at 'strend'; the
+ * per-code-point lookup is delegated to getLB_VAL_CP */
+#define getLB_VAL_UTF8(pos, strend)                                         \
+        _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
+
/* Returns the SB value for the input code point */
#define getSB_VAL_CP(cp) \
@@ -2115,6 +2127,63 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
break;
+ case LB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ LB_enum before = getLB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ LB_enum after = getLB_VAL_CP((U8) *s);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s++;
+ }
+ }
+
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+
+ break;
+
case SB_BOUND:
if (s == reginfo->strbeg) {
if (reginfo->intuit || regtry(reginfo, &s)) {
@@ -4277,6 +4346,290 @@ S_isGCB(const GCB_enum before, const GCB_enum after)
NOT_REACHED; /* NOTREACHED */
}
+/* Combining marks attach to most classes that precede them, but this defines
+ * the exceptions (from TR14).  Evaluates to true when a combining mark
+ * attaches to a preceding character whose Line Break class is 'prev', and to
+ * false when 'prev' is one of the classes listed below.
+ *
+ * 'prev' is evaluated several times, so the argument must not have side
+ * effects.  It is parenthesized at every use so that an expression argument
+ * (e.g. a conditional expression) binds correctly against '=='. */
+#define LB_CM_ATTACHES_TO(prev) ( ! (   (prev) == LB_EDGE                   \
+                                     || (prev) == LB_Mandatory_Break        \
+                                     || (prev) == LB_Carriage_Return        \
+                                     || (prev) == LB_Line_Feed              \
+                                     || (prev) == LB_Next_Line              \
+                                     || (prev) == LB_Space                  \
+                                     || (prev) == LB_ZWSpace))
+
+/* S_isLB: is the position 'curpos' a line break opportunity?
+ *
+ *  before       Line Break class of the character just before 'curpos'
+ *  after        Line Break class of the character at 'curpos'
+ *  strbeg       start of the string; backward scans stop here
+ *  curpos       the position being tested
+ *  strend       end of the string; forward scans stop here
+ *  utf8_target  TRUE if the string is UTF-8 encoded
+ *
+ * Returns TRUE if a line may be broken between 'before' and 'after', FALSE
+ * otherwise.  'curpos' itself is never modified; a private copy is moved when
+ * surrounding context has to be examined. */
+STATIC bool
+S_isLB(pTHX_ LB_enum before,
+             LB_enum after,
+             const U8 * const strbeg,
+             const U8 * const curpos,
+             const U8 * const strend,
+             const bool utf8_target)
+{
+    U8 * temp_pos = (U8 *) curpos;  /* Scratch cursor for context scans */
+    LB_enum prev = before;
+
+    /* Is the boundary between 'before' and 'after' line-breakable?
+     * Most of this is just a table lookup of a generated table from Unicode
+     * rules.  But some rules require context to decide, and so have to be
+     * implemented in code */
+
+    PERL_ARGS_ASSERT_ISLB;
+
+    /* Rule numbers in the comments below are as of Unicode 8.0 */
+
+  redo:
+    /* Re-entered from the LB_CM_foo case after 'prev' has been replaced by
+     * the class that the combining mark(s) should be treated as */
+    before = prev;
+    switch (LB_table[before][after]) {
+        case LB_BREAKABLE:
+            return TRUE;
+
+        case LB_NOBREAK:
+        case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
+            return FALSE;
+
+        case LB_SP_foo + LB_BREAKABLE:
+        case LB_SP_foo + LB_NOBREAK:
+        case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
+
+            /* When we have something following a SP, we have to look at the
+             * context in order to know what to do.
+             *
+             * SP SP should not reach here because LB7: Do not break before
+             * spaces.  (For two spaces in a row there is nothing that
+             * overrides that) */
+            assert(after != LB_Space);
+
+            /* Here we have a space followed by a non-space.  Mostly this is a
+             * case of LB18: "Break after spaces".  But there are complications
+             * as the handling of spaces is somewhat tricky.  They are in a
+             * number of rules, which have to be applied in priority order, but
+             * something earlier in the string can cause a rule to be skipped
+             * and a lower priority rule invoked.  A prime example is LB7 which
+             * says don't break before a space.  But rule LB8 (lower priority)
+             * says that the first break opportunity after a ZW is after any
+             * span of spaces immediately after it.  If a ZW comes before a SP
+             * in the input, rule LB8 applies, and not LB7.  Other such rules
+             * involve combining marks which are rules 9 and 10, but they may
+             * override higher priority rules if they come earlier in the
+             * string.  Since we're doing random access into the middle of the
+             * string, we have to look for rules that should get applied based
+             * on both string position and priority.  Combining marks do not
+             * attach to either ZW nor SP, so we don't have to consider them
+             * until later.
+             *
+             * To check for LB8, we have to find the first non-space character
+             * before this span of spaces */
+            do {
+                prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+            }
+            while (prev == LB_Space);
+
+            /* LB8 Break before any character following a zero-width space,
+             *     even if one or more spaces intervene.
+             *          ZW SP* ÷
+             * So if we have a ZW just before this span, and to get here this
+             * is the final space in the span. */
+            if (prev == LB_ZWSpace) {
+                return TRUE;
+            }
+
+            /* Here, not ZW SP+.  There are several rules that have higher
+             * priority than LB18 and can be resolved now, as they don't depend
+             * on anything earlier in the string (except ZW, which we have
+             * already handled).  One of these rules is LB11 Do not break
+             * before Word joiner, but we have specially encoded that in the
+             * lookup table so it is caught by the single test below which
+             * catches the other ones. */
+            if (LB_table[LB_Space][after] - LB_SP_foo
+                                            == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
+            {
+                return FALSE;
+            }
+
+            /* If we get here, we have to consider combining marks. */
+            if (prev == LB_Combining_Mark) {
+
+                /* What happens with these depends on the character they
+                 * follow.  */
+                do {
+                    prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+                }
+                while (prev == LB_Combining_Mark);
+
+                /* Most times these attach to and inherit the characteristics
+                 * of that character, but not always, and when not, they are to
+                 * be treated as AL by rule LB10. */
+                if (! LB_CM_ATTACHES_TO(prev)) {
+                    prev = LB_Alphabetic;
+                }
+            }
+
+            /* Here, we have the character preceding the span of spaces all set
+             * up.  We follow LB18: "Break after spaces" unless the table shows
+             * that is overridden */
+            return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
+
+        case LB_CM_foo:
+
+            /* We don't know how to treat the CM except by looking at the first
+             * non-CM character preceding it */
+            do {
+                prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+            }
+            while (prev == LB_Combining_Mark);
+
+            /* Here, 'prev' is that first earlier non-CM character.  If the CM
+             * attaches to it, then it inherits the behavior of 'prev'.  If it
+             * doesn't attach, it is to be treated as an AL */
+            if (! LB_CM_ATTACHES_TO(prev)) {
+                prev = LB_Alphabetic;
+            }
+
+            goto redo;
+
+        case LB_HY_or_BA_then_foo + LB_BREAKABLE:
+        case LB_HY_or_BA_then_foo + LB_NOBREAK:
+
+            /* LB21a Don't break after Hebrew + Hyphen.
+             * HL (HY | BA) × */
+
+            if (backup_one_LB(strbeg, &temp_pos, utf8_target)
+                                                          == LB_Hebrew_Letter)
+            {
+                return FALSE;
+            }
+
+            /* Otherwise the answer is whichever base value the table entry
+             * was built from */
+            return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
+
+        case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
+        case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
+
+            /* LB25a (PR | PO) × ( OP | HY )? NU */
+            if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
+                return FALSE;
+            }
+
+            return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
+                                                                == LB_BREAKABLE;
+
+        case LB_SY_or_IS_then_various + LB_BREAKABLE:
+        case LB_SY_or_IS_then_various + LB_NOBREAK:
+          {
+            /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
+
+            LB_enum temp = prev;
+            do {
+                temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+            }
+            while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
+            if (temp == LB_Numeric) {
+                return FALSE;
+            }
+
+            return LB_table[prev][after] - LB_SY_or_IS_then_various
+                                                               == LB_BREAKABLE;
+          }
+
+        case LB_various_then_PO_or_PR + LB_BREAKABLE:
+        case LB_various_then_PO_or_PR + LB_NOBREAK:
+          {
+            /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
+
+            LB_enum temp = prev;
+            if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
+            {
+                temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+            }
+            while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
+                temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+            }
+            if (temp == LB_Numeric) {
+                return FALSE;
+            }
+
+            /* The rule didn't apply, so fall back to the base value encoded
+             * in the table entry, as the other context-dependent cases do.
+             * (Returning the bare LB_various_then_PO_or_PR constant would
+             * always be true, even for the LB_NOBREAK variant.) */
+            return LB_table[prev][after] - LB_various_then_PO_or_PR
+                                                               == LB_BREAKABLE;
+          }
+
+        default:
+            break;
+    }
+
+    /* Only reached for a table value that no case above handles.  Debugging
+     * builds report it and assert; other builds allow the break. */
+#ifdef DEBUGGING
+    PerlIO_printf(Perl_error_log, "Unhandled LB pair: LB_table[%d, %d] = %d\n",
+                                  before, after, LB_table[before][after]);
+    assert(0);
+#endif
+    return TRUE;
+}
+
+/* S_advance_one_LB: step '*curpos' forward by one character (one UTF-8
+ * sequence if 'utf8_target', else one byte) and return the Line Break class
+ * of the character at the new position.
+ *
+ * Returns LB_EDGE, leaving '*curpos' untouched, if '*curpos' is already at or
+ * beyond 'strend'; also returns LB_EDGE if the step lands at or beyond
+ * 'strend', in which case '*curpos' has been advanced. */
+STATIC LB_enum
+S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{
+    PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
+
+    /* Nothing to advance over */
+    if (*curpos >= strend) {
+        return LB_EDGE;
+    }
+
+    /* Skip the current character */
+    *curpos += (utf8_target) ? UTF8SKIP(*curpos) : 1;
+
+    /* Ran off the end: there is no following character to classify */
+    if (*curpos >= strend) {
+        return LB_EDGE;
+    }
+
+    return (utf8_target)
+           ? getLB_VAL_UTF8(*curpos, strend)
+           : getLB_VAL_CP(**curpos);
+}
+
+/* S_backup_one_LB: step '*curpos' back by one character (one UTF-8 sequence
+ * if 'utf8_target', else one byte) and return the Line Break class of the
+ * character immediately BEFORE the new position.
+ *
+ * Returns LB_EDGE when there is no such character:
+ *   - '*curpos' is before 'strbeg', or (UTF-8 only) no character can be
+ *     hopped over backwards from it: '*curpos' is left unchanged;
+ *   - fewer than two characters lie between 'strbeg' and '*curpos':
+ *     '*curpos' is set to 'strbeg'. */
+STATIC LB_enum
+S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+    LB_enum lb;
+
+    PERL_ARGS_ASSERT_BACKUP_ONE_LB;
+
+    if (*curpos < strbeg) {
+        return LB_EDGE;
+    }
+
+    if (utf8_target) {
+        U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+        U8 * prev_prev_char_pos;
+
+        if (! prev_char_pos) {
+            return LB_EDGE;
+        }
+
+        /* Need a second hop to find the character whose class is wanted */
+        prev_prev_char_pos = reghopmaybe3(prev_char_pos, -1, strbeg);
+        if (! prev_prev_char_pos) {
+            *curpos = (U8 *) strbeg;
+            return LB_EDGE;
+        }
+
+        /* The character to classify occupies [prev_prev_char_pos,
+         * prev_char_pos) */
+        lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+        *curpos = prev_char_pos;
+    }
+    else {
+
+        /* Compare distances rather than computing '*curpos - 2', which could
+         * form a pointer before the start of the string */
+        if (*curpos - strbeg < 2) {
+            *curpos = (U8 *) strbeg;
+            return LB_EDGE;
+        }
+        (*curpos)--;
+        lb = getLB_VAL_CP(*(*curpos - 1));
+    }
+
+    return lb;
+}
+
#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
STATIC bool
@@ -5700,6 +6053,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
break;
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getLB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case SB_BOUND: /* Always matches at begin and end */
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
@@ -5767,6 +6142,23 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
break;
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
+ getLB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case SB_BOUND: /* Always matches at begin and end */
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;