diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-04-01 17:04:17 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-04-01 17:04:17 +0000 |
commit | f5d0082e0003890add3831d1a6aa1b9d94cc3c51 (patch) | |
tree | 99dce600871d11d26f7d0cff29588df6f0e20c23 | |
parent | 2124c9651464dc3c97b9d1cb9a13fe57bca181e4 (diff) | |
download | pcre-f5d0082e0003890add3831d1a6aa1b9d94cc3c51.tar.gz |
Auto-detect and optimize limited repetitions in JIT.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1306 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 2 | ||||
-rw-r--r-- | pcre_jit_compile.c | 560 | ||||
-rw-r--r-- | pcre_jit_test.c | 11 |
3 files changed, 347 insertions, 226 deletions
@@ -130,6 +130,8 @@ Version 8.33 xx-xxxx-201x 33. An opening parenthesis in a MARK/PRUNE/SKIP/THEN name in a pattern that contained a forward subroutine reference caused a compile error. +34. Auto-detect and optimize limited repetitions in JIT. + Version 8.32 30-November-2012 ----------------------------- diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 920be48..9b09268 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -313,7 +313,7 @@ typedef struct compiler_common { /* First byte code. */ pcre_uchar *start; /* Maps private data offset to each opcode. */ - int *private_data_ptrs; + sljit_si *private_data_ptrs; /* Tells whether the capturing bracket is optimized. */ pcre_uint8 *optimized_cbracket; /* Tells whether the starting offset is a target of then. */ @@ -534,7 +534,7 @@ return cc; /* Functions whose might need modification for all new supported opcodes: next_opcode - get_private_data_length + check_opcode_types set_private_data_ptrs get_framesize init_frame @@ -733,98 +733,15 @@ switch(*cc) } } -#define CASE_ITERATOR_PRIVATE_DATA_1 \ - case OP_MINSTAR: \ - case OP_MINPLUS: \ - case OP_QUERY: \ - case OP_MINQUERY: \ - case OP_MINSTARI: \ - case OP_MINPLUSI: \ - case OP_QUERYI: \ - case OP_MINQUERYI: \ - case OP_NOTMINSTAR: \ - case OP_NOTMINPLUS: \ - case OP_NOTQUERY: \ - case OP_NOTMINQUERY: \ - case OP_NOTMINSTARI: \ - case OP_NOTMINPLUSI: \ - case OP_NOTQUERYI: \ - case OP_NOTMINQUERYI: - -#define CASE_ITERATOR_PRIVATE_DATA_2A \ - case OP_STAR: \ - case OP_PLUS: \ - case OP_STARI: \ - case OP_PLUSI: \ - case OP_NOTSTAR: \ - case OP_NOTPLUS: \ - case OP_NOTSTARI: \ - case OP_NOTPLUSI: - -#define CASE_ITERATOR_PRIVATE_DATA_2B \ - case OP_UPTO: \ - case OP_MINUPTO: \ - case OP_UPTOI: \ - case OP_MINUPTOI: \ - case OP_NOTUPTO: \ - case OP_NOTMINUPTO: \ - case OP_NOTUPTOI: \ - case OP_NOTMINUPTOI: - -#define CASE_ITERATOR_TYPE_PRIVATE_DATA_1 \ - case OP_TYPEMINSTAR: \ - case OP_TYPEMINPLUS: \ - case OP_TYPEQUERY: \ - case OP_TYPEMINQUERY: - -#define CASE_ITERATOR_TYPE_PRIVATE_DATA_2A \ - case OP_TYPESTAR: \ - case OP_TYPEPLUS: - -#define CASE_ITERATOR_TYPE_PRIVATE_DATA_2B \ - case OP_TYPEUPTO: \ - case OP_TYPEMINUPTO: - -static int get_class_iterator_size(pcre_uchar *cc) +static BOOL check_opcode_types(compiler_common *common, pcre_uchar *cc, pcre_uchar *ccend) { -switch(*cc) - { - case OP_CRSTAR: - case OP_CRPLUS: - return 2; - - case OP_CRMINSTAR: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - return 1; - - case OP_CRRANGE: - case OP_CRMINRANGE: - if (GET2(cc, 1) == GET2(cc, 1 + IMM2_SIZE)) - return 0; - return 2; - - default: - return 0; - } -} - -static int get_private_data_length(compiler_common *common, pcre_uchar *cc, pcre_uchar *ccend) -{ -int private_data_length = 0; -pcre_uchar *alternative; pcre_uchar *name; -pcre_uchar *end = NULL; -int space, size, i; -pcre_uint32 bracketlen; +pcre_uchar *name2; +int i, cbra_index; /* Calculate important variables (like stack size) and checks whether all opcodes are supported. */ while (cc < ccend) { - space = 0; - size = 0; - bracketlen = 0; switch(*cc) { case OP_SET_SOM: @@ -838,24 +755,10 @@ while (cc < ccend) cc += 1 + IMM2_SIZE; break; - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ONCE: - case OP_ONCE_NC: - case OP_BRAPOS: - case OP_SBRA: - case OP_SBRAPOS: - private_data_length += sizeof(sljit_sw); - bracketlen = 1 + LINK_SIZE; - break; - case OP_CBRAPOS: case OP_SCBRAPOS: - private_data_length += sizeof(sljit_sw); common->optimized_cbracket[GET2(cc, 1 + LINK_SIZE)] = 0; - bracketlen = 1 + LINK_SIZE + IMM2_SIZE; + cc += 1 + LINK_SIZE + IMM2_SIZE; break; case OP_COND: @@ -863,18 +766,8 @@ while (cc < ccend) /* Only AUTO_CALLOUT can insert this opcode. We do not intend to support this case. */ if (cc[1 + LINK_SIZE] == OP_CALLOUT) - return -1; - - if (*cc == OP_COND) - { - /* Might be a hidden SCOND. */ - alternative = cc + GET(cc, 1); - if (*alternative == OP_KETRMAX || *alternative == OP_KETRMIN) - private_data_length += sizeof(sljit_sw); - } - else - private_data_length += sizeof(sljit_sw); - bracketlen = 1 + LINK_SIZE; + return FALSE; + cc += 1 + LINK_SIZE; break; case OP_CREF: @@ -884,80 +777,25 @@ while (cc < ccend) break; case OP_NCREF: - bracketlen = GET2(cc, 1); + cbra_index = GET2(cc, 1); name = (pcre_uchar *)common->name_table; - alternative = name; + name2 = name; for (i = 0; i < common->name_count; i++) { - if (GET2(name, 0) == bracketlen) break; + if (GET2(name, 0) == cbra_index) break; name += common->name_entry_size; } SLJIT_ASSERT(i != common->name_count); for (i = 0; i < common->name_count; i++) { - if (STRCMP_UC_UC(alternative + IMM2_SIZE, name + IMM2_SIZE) == 0) - common->optimized_cbracket[GET2(alternative, 0)] = 0; - alternative += common->name_entry_size; + if (STRCMP_UC_UC(name2 + IMM2_SIZE, name + IMM2_SIZE) == 0) + common->optimized_cbracket[GET2(name2, 0)] = 0; + name2 += common->name_entry_size; } - bracketlen = 0; cc += 1 + IMM2_SIZE; break; - case OP_BRA: - bracketlen = 1 + LINK_SIZE; - break; - - case OP_CBRA: - case OP_SCBRA: - bracketlen = 1 + LINK_SIZE + IMM2_SIZE; - break; - - CASE_ITERATOR_PRIVATE_DATA_1 - space = 1; - size = -2; - break; - - CASE_ITERATOR_PRIVATE_DATA_2A - space = 2; - size = -2; - break; - - CASE_ITERATOR_PRIVATE_DATA_2B - space = 2; - size = -(2 + IMM2_SIZE); - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_1 - space = 1; - size = 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2A - if (cc[1] != OP_ANYNL && cc[1] != OP_EXTUNI) - space = 2; - size = 1; - break; - - CASE_ITERATOR_TYPE_PRIVATE_DATA_2B - if (cc[1 + IMM2_SIZE] != OP_ANYNL && cc[1 + IMM2_SIZE] != OP_EXTUNI) - space = 2; - size = 1 + IMM2_SIZE; - break; - - case OP_CLASS: - case OP_NCLASS: - size += 1 + 32 / sizeof(pcre_uchar); - space = get_class_iterator_size(cc + size); - break; - -#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 - case OP_XCLASS: - size = GET(cc, 1); - space = get_class_iterator_size(cc + size); - break; -#endif - case OP_RECURSE: /* Set its value only once. */ if (common->recursive_head_ptr == 0) @@ -1015,45 +853,181 @@ while (cc < ccend) default: cc = next_opcode(common, cc); if (cc == NULL) - return -1; + return FALSE; break; } + } +return TRUE; +} - if (space > 0 && cc >= end) - private_data_length += sizeof(sljit_sw) * space; +static int get_class_iterator_size(pcre_uchar *cc) +{ +switch(*cc) + { + case OP_CRSTAR: + case OP_CRPLUS: + return 2; - if (size != 0) + case OP_CRMINSTAR: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + return 1; + + case OP_CRRANGE: + case OP_CRMINRANGE: + if (GET2(cc, 1) == GET2(cc, 1 + IMM2_SIZE)) + return 0; + return 2; + + default: + return 0; + } +} + +static BOOL detect_repeat(compiler_common *common, pcre_uchar *begin) +{ +pcre_uchar *end = bracketend(begin); +pcre_uchar *next; +pcre_uchar *next_end; +pcre_uchar *max_end; +pcre_uchar type; +sljit_uw length = end - begin; +int min, max, i; + +/* Detect fixed iterations first. */ +if (end[-(1 + LINK_SIZE)] != OP_KET) + return FALSE; + +/* Already detected repeat. */ +if (common->private_data_ptrs[end - common->start - LINK_SIZE] != 0) + return TRUE; + +next = end; +min = 1; +while (1) + { + if (*next != *begin) + break; + next_end = bracketend(next); + if (next_end - next != length || memcmp(begin, next, IN_UCHARS(length)) != 0) + break; + next = next_end; + min++; + } + +if (min == 2) + return FALSE; + +max = 0; +max_end = next; +if (*next == OP_BRAZERO || *next == OP_BRAMINZERO) + { + type = *next; + while (1) { - if (size < 0) - { - cc += -size; -#ifdef SUPPORT_UTF - if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); -#endif - } - else - cc += size; + if (next[0] != type || next[1] != OP_BRA || next[2 + LINK_SIZE] != *begin) + break; + next_end = bracketend(next + 2 + LINK_SIZE); + if (next_end - next != (length + 2 + LINK_SIZE) || memcmp(begin, next + 2 + LINK_SIZE, IN_UCHARS(length)) != 0) + break; + next = next_end; + max++; } - if (bracketlen != 0) + if (next[0] == type && next[1] == *begin && max >= 1) { - if (cc >= end) + next_end = bracketend(next + 1); + if (next_end - next == (length + 1) && memcmp(begin, next + 1, IN_UCHARS(length)) == 0) { - end = bracketend(cc); - if (end[-1 - LINK_SIZE] == OP_KET) - end = NULL; + for (i = 0; i < max; i++, next_end += 1 + LINK_SIZE) + if (*next_end != OP_KET) + break; + + if (i == max) + { + common->private_data_ptrs[max_end - common->start - LINK_SIZE] = next_end - max_end; + common->private_data_ptrs[max_end - common->start - LINK_SIZE + 1] = (type == OP_BRAZERO) ? OP_UPTO : OP_MINUPTO; + /* +2 the original and the last. */ + common->private_data_ptrs[max_end - common->start - LINK_SIZE + 2] = max + 2; + if (min == 1) + return TRUE; + min--; + max_end -= (1 + LINK_SIZE) + GET(max_end, -LINK_SIZE); + } } - cc += bracketlen; } } -return private_data_length; + +if (min >= 3) + { + common->private_data_ptrs[end - common->start - LINK_SIZE] = max_end - end; + common->private_data_ptrs[end - common->start - LINK_SIZE + 1] = OP_EXACT; + common->private_data_ptrs[end - common->start - LINK_SIZE + 2] = min; + return TRUE; + } + +return FALSE; } -static void set_private_data_ptrs(compiler_common *common, int private_data_ptr, pcre_uchar *ccend) +#define CASE_ITERATOR_PRIVATE_DATA_1 \ + case OP_MINSTAR: \ + case OP_MINPLUS: \ + case OP_QUERY: \ + case OP_MINQUERY: \ + case OP_MINSTARI: \ + case OP_MINPLUSI: \ + case OP_QUERYI: \ + case OP_MINQUERYI: \ + case OP_NOTMINSTAR: \ + case OP_NOTMINPLUS: \ + case OP_NOTQUERY: \ + case OP_NOTMINQUERY: \ + case OP_NOTMINSTARI: \ + case OP_NOTMINPLUSI: \ + case OP_NOTQUERYI: \ + case OP_NOTMINQUERYI: + +#define CASE_ITERATOR_PRIVATE_DATA_2A \ + case OP_STAR: \ + case OP_PLUS: \ + case OP_STARI: \ + case OP_PLUSI: \ + case OP_NOTSTAR: \ + case OP_NOTPLUS: \ + case OP_NOTSTARI: \ + case OP_NOTPLUSI: + +#define CASE_ITERATOR_PRIVATE_DATA_2B \ + case OP_UPTO: \ + case OP_MINUPTO: \ + case OP_UPTOI: \ + case OP_MINUPTOI: \ + case OP_NOTUPTO: \ + case OP_NOTMINUPTO: \ + case OP_NOTUPTOI: \ + case OP_NOTMINUPTOI: + +#define CASE_ITERATOR_TYPE_PRIVATE_DATA_1 \ + case OP_TYPEMINSTAR: \ + case OP_TYPEMINPLUS: \ + case OP_TYPEQUERY: \ + case OP_TYPEMINQUERY: + +#define CASE_ITERATOR_TYPE_PRIVATE_DATA_2A \ + case OP_TYPESTAR: \ + case OP_TYPEPLUS: + +#define CASE_ITERATOR_TYPE_PRIVATE_DATA_2B \ + case OP_TYPEUPTO: \ + case OP_TYPEMINUPTO: + +static void set_private_data_ptrs(compiler_common *common, int *private_data_start, pcre_uchar *ccend) { pcre_uchar *cc = common->start; pcre_uchar *alternative; pcre_uchar *end = NULL; +int private_data_ptr = *private_data_start; int space, size, bracketlen; while (cc < ccend) @@ -1061,8 +1035,30 @@ while (cc < ccend) space = 0; size = 0; bracketlen = 0; + if (private_data_ptr > SLJIT_MAX_LOCAL_SIZE) + return; + + if (*cc == OP_BRA || *cc == OP_CBRA || *cc == OP_ONCE || *cc == OP_ONCE_NC) + if (detect_repeat(common, cc)) + { + /* These brackets are converted to repeats, so no global + based single character repeat is allowed. */ + if (cc >= end) + end = bracketend(cc); + } + switch(*cc) { + case OP_KET: + if (common->private_data_ptrs[cc + 1 - common->start] != 0) + { + common->private_data_ptrs[cc - common->start] = private_data_ptr; + private_data_ptr += sizeof(sljit_sw); + cc += common->private_data_ptrs[cc + 1 - common->start]; + } + cc += 1 + LINK_SIZE; + break; + case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: @@ -1156,6 +1152,8 @@ while (cc < ccend) break; } + /* Character iterators, which are not inside a repeated bracket, + gets a private slot instead of allocating it on the stack. */ if (space > 0 && cc >= end) { common->private_data_ptrs[cc - common->start] = private_data_ptr; @@ -1186,6 +1184,7 @@ while (cc < ccend) cc += bracketlen; } } +*private_data_start = private_data_ptr; } /* Returns with a frame_types (always < 0) if no need for frame. */ @@ -6129,6 +6128,8 @@ pcre_uchar opcode; int private_data_ptr = 0; int offset = 0; int stacksize; +int repeat_ptr = 0, repeat_length = 0; +int repeat_type = 0, repeat_count = 0; pcre_uchar *ccbegin; pcre_uchar *matchingpath; pcre_uchar bra = OP_BRA; @@ -6152,16 +6153,29 @@ if (*cc == OP_BRAZERO || *cc == OP_BRAMINZERO) opcode = *cc; ccbegin = cc; -matchingpath = ccbegin + 1 + LINK_SIZE; +matchingpath = bracketend(cc) - 1 - LINK_SIZE; +ket = *matchingpath; +if (ket == OP_KET && PRIVATE_DATA(matchingpath) != 0) + { + repeat_ptr = PRIVATE_DATA(matchingpath); + repeat_length = PRIVATE_DATA(matchingpath + 1); + repeat_type = PRIVATE_DATA(matchingpath + 2); + repeat_count = PRIVATE_DATA(matchingpath + 3); + SLJIT_ASSERT(repeat_length != 0 && repeat_type != 0 && repeat_count != 0); + if (repeat_type == OP_UPTO) + ket = OP_KETRMAX; + if (repeat_type == OP_MINUPTO) + ket = OP_KETRMIN; + } if ((opcode == OP_COND || opcode == OP_SCOND) && cc[1 + LINK_SIZE] == OP_DEF) { /* Drop this bracket_backtrack. */ parent->top = backtrack->prev; - return bracketend(cc); + return matchingpath + 1 + LINK_SIZE + repeat_length; } -ket = *(bracketend(cc) - 1 - LINK_SIZE); +matchingpath = ccbegin + 1 + LINK_SIZE; SLJIT_ASSERT(ket == OP_KET || ket == OP_KETRMAX || ket == OP_KETRMIN); SLJIT_ASSERT(!((bra == OP_BRAZERO && ket == OP_KETRMIN) || (bra == OP_BRAMINZERO && ket == OP_KETRMAX))); cc += GET(cc, 1); @@ -6275,13 +6289,20 @@ if (bra == OP_BRAMINZERO) } } +if (repeat_type != 0) + { + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_IMM, repeat_count); + if (repeat_type == OP_EXACT) + rmaxlabel = LABEL(); + } + if (ket == OP_KETRMIN) BACKTRACK_AS(bracket_backtrack)->recursive_matchingpath = LABEL(); if (ket == OP_KETRMAX) { rmaxlabel = LABEL(); - if (has_alternatives && opcode != OP_ONCE && opcode < OP_SBRA) + if (has_alternatives && opcode != OP_ONCE && opcode < OP_SBRA && repeat_type == 0) BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = rmaxlabel; } @@ -6500,6 +6521,12 @@ if (opcode == OP_ONCE) match_once_common(common, ket, BACKTRACK_AS(bracket_backtrack)->u.framesize, private_data_ptr, has_alternatives, needs_control_head); stacksize = 0; +if (repeat_type == OP_MINUPTO) + { + /* We need to preserve the counter. TMP2 will be used below. */ + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr); + stacksize++; + } if (ket != OP_KET || bra != OP_BRA) stacksize++; if (offset != 0) @@ -6516,6 +6543,13 @@ if (stacksize > 0) allocate_stack(common, stacksize); stacksize = 0; +if (repeat_type == OP_MINUPTO) + { + /* TMP2 was set above. */ + OP2(SLJIT_SUB, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP2, 0, SLJIT_IMM, 1); + stacksize++; + } + if (ket != OP_KET || bra != OP_BRA) { if (ket != OP_KET) @@ -6545,7 +6579,17 @@ if (offset != 0 && common->optimized_cbracket[offset >> 1] != 0) if (ket == OP_KETRMAX) { - if (opcode == OP_ONCE || opcode >= OP_SBRA) + if (repeat_type != 0) + { + if (has_alternatives) + BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = LABEL(); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_IMM, 1); + JUMPTO(SLJIT_C_NOT_ZERO, rmaxlabel); + /* Drop STR_PTR for greedy plus quantifier. */ + if (opcode != OP_ONCE) + free_stack(common, 1); + } + else if (opcode == OP_ONCE || opcode >= OP_SBRA) { if (has_alternatives) BACKTRACK_AS(bracket_backtrack)->alternative_matchingpath = LABEL(); @@ -6566,6 +6610,19 @@ if (ket == OP_KETRMAX) BACKTRACK_AS(bracket_backtrack)->recursive_matchingpath = LABEL(); } +if (repeat_type == OP_EXACT) + { + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_IMM, 1); + JUMPTO(SLJIT_C_NOT_ZERO, rmaxlabel); + } +else if (repeat_type == OP_UPTO) + { + /* We need to preserve the counter. */ + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr); + allocate_stack(common, 1); + OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), STACK(0), TMP2, 0); + } + if (bra == OP_BRAZERO) BACKTRACK_AS(bracket_backtrack)->zero_matchingpath = LABEL(); @@ -6601,7 +6658,7 @@ cc += 1 + LINK_SIZE; /* Temporarily encoding the needs_control_head in framesize. */ if (opcode == OP_ONCE) BACKTRACK_AS(bracket_backtrack)->u.framesize = (BACKTRACK_AS(bracket_backtrack)->u.framesize << 1) | (needs_control_head ? 1 : 0); -return cc; +return cc + repeat_length; } static pcre_uchar *compile_bracketpos_matchingpath(compiler_common *common, pcre_uchar *cc, backtrack_common *parent) @@ -7888,11 +7945,10 @@ if (bra == OP_BRAZERO) static void compile_bracket_backtrackingpath(compiler_common *common, struct backtrack_common *current) { DEFINE_COMPILER; -int opcode; +int opcode, stacksize, count; int offset = 0; int private_data_ptr = CURRENT_AS(bracket_backtrack)->private_data_ptr; -int stacksize; -int count; +int repeat_ptr = 0, repeat_type = 0, repeat_count = 0; pcre_uchar *cc = current->cc; pcre_uchar *ccbegin; pcre_uchar *ccprev; @@ -7907,6 +7963,7 @@ struct sljit_jump *brazero = NULL; struct sljit_jump *once = NULL; struct sljit_jump *cond = NULL; struct sljit_label *rminlabel = NULL; +struct sljit_label *exact_label = NULL; if (*cc == OP_BRAZERO || *cc == OP_BRAMINZERO) { @@ -7915,8 +7972,20 @@ if (*cc == OP_BRAZERO || *cc == OP_BRAMINZERO) } opcode = *cc; +ccbegin = bracketend(cc) - 1 - LINK_SIZE; +ket = *ccbegin; +if (ket == OP_KET && PRIVATE_DATA(ccbegin) != 0) + { + repeat_ptr = PRIVATE_DATA(ccbegin); + repeat_type = PRIVATE_DATA(ccbegin + 2); + repeat_count = PRIVATE_DATA(ccbegin + 3); + SLJIT_ASSERT(repeat_type != 0 && repeat_count != 0); + if (repeat_type == OP_UPTO) + ket = OP_KETRMAX; + if (repeat_type == OP_MINUPTO) + ket = OP_KETRMIN; + } ccbegin = cc; -ket = *(bracketend(ccbegin) - 1 - LINK_SIZE); cc += GET(cc, 1); has_alternatives = *cc == OP_ALT; if (SLJIT_UNLIKELY(opcode == OP_COND) || SLJIT_UNLIKELY(opcode == OP_SCOND)) @@ -7935,6 +8004,17 @@ if (opcode == OP_ONCE) CURRENT_AS(bracket_backtrack)->u.framesize >>= 1; } +if (ket != OP_KET && repeat_type != 0) + { + /* TMP1 is used in OP_KETRMIN below. */ + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); + free_stack(common, 1); + if (repeat_type == OP_UPTO) + OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, TMP1, 0, SLJIT_IMM, 1); + else + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, TMP1, 0); + } + if (ket == OP_KETRMAX) { if (bra == OP_BRAZERO) @@ -7949,7 +8029,15 @@ else if (ket == OP_KETRMIN) if (bra != OP_BRAMINZERO) { OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); - if (opcode >= OP_SBRA || opcode == OP_ONCE) + if (repeat_type != 0) + { + /* TMP1 was set a few lines above. */ + CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); + /* Drop STR_PTR for non-greedy plus quantifier. */ + if (opcode != OP_ONCE) + free_stack(common, 1); + } + else if (opcode >= OP_SBRA || opcode == OP_ONCE) { /* Checking zero-length iteration. */ if (opcode != OP_ONCE || CURRENT_AS(bracket_backtrack)->u.framesize < 0) @@ -7959,6 +8047,7 @@ else if (ket == OP_KETRMIN) OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), private_data_ptr); CMPTO(SLJIT_C_NOT_EQUAL, STR_PTR, 0, SLJIT_MEM1(TMP1), (CURRENT_AS(bracket_backtrack)->u.framesize + 1) * sizeof(sljit_sw), CURRENT_AS(bracket_backtrack)->recursive_matchingpath); } + /* Drop STR_PTR for non-greedy plus quantifier. */ if (opcode != OP_ONCE) free_stack(common, 1); } @@ -7966,6 +8055,8 @@ else if (ket == OP_KETRMIN) JUMPTO(SLJIT_JUMP, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); } rminlabel = LABEL(); + if (repeat_type != 0) + OP2(SLJIT_ADD, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_IMM, 1); } else if (bra == OP_BRAZERO) { @@ -7973,6 +8064,11 @@ else if (bra == OP_BRAZERO) free_stack(common, 1); brazero = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0); } +else if (repeat_type == OP_EXACT) + { + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_IMM, 1); + exact_label = LABEL(); + } if (offset != 0) { @@ -8120,6 +8216,12 @@ if (has_alternatives) match_once_common(common, ket, CURRENT_AS(bracket_backtrack)->u.framesize, private_data_ptr, has_alternatives, needs_control_head); stacksize = 0; + if (repeat_type == OP_MINUPTO) + { + /* We need to preserve the counter. TMP2 will be used below. */ + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr); + stacksize++; + } if (ket != OP_KET || bra != OP_BRA) stacksize++; if (offset != 0) @@ -8133,18 +8235,16 @@ if (has_alternatives) stacksize++; if (stacksize > 0) + allocate_stack(common, stacksize); + + stacksize = 0; + if (repeat_type == OP_MINUPTO) { - if (opcode != OP_ONCE || CURRENT_AS(bracket_backtrack)->u.framesize >= 0) - allocate_stack(common, stacksize); - else - { - /* We know we have place at least for one item on the top of the stack. */ - SLJIT_ASSERT(stacksize == 1); - OP2(SLJIT_ADD, STACK_TOP, 0, STACK_TOP, 0, SLJIT_IMM, sizeof(sljit_sw)); - } + /* TMP2 was set above. */ + OP2(SLJIT_SUB, SLJIT_MEM1(STACK_TOP), STACK(stacksize), TMP2, 0, SLJIT_IMM, 1); + stacksize++; } - stacksize = 0; if (ket != OP_KET || bra != OP_BRA) { if (ket != OP_KET) @@ -8255,11 +8355,18 @@ else if (opcode == OP_ONCE) } } -if (ket == OP_KETRMAX) +if (repeat_type == OP_EXACT) + { + OP2(SLJIT_ADD, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, SLJIT_IMM, 1); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), repeat_ptr, TMP1, 0); + CMPTO(SLJIT_C_LESS_EQUAL, TMP1, 0, SLJIT_IMM, repeat_count, exact_label); + } +else if (ket == OP_KETRMAX) { OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(STACK_TOP), STACK(0)); if (bra != OP_BRAZERO) free_stack(common, 1); + CMPTO(SLJIT_C_NOT_EQUAL, STR_PTR, 0, SLJIT_IMM, 0, CURRENT_AS(bracket_backtrack)->recursive_matchingpath); if (bra == OP_BRAZERO) { @@ -8863,8 +8970,7 @@ SLJIT_ASSERT(*rootbacktrack.cc == OP_BRA && ccend[-(1 + LINK_SIZE)] == OP_KET); common->capture_last_ptr = common->ovector_start; common->ovector_start += sizeof(sljit_sw); #endif -private_data_size = get_private_data_length(common, rootbacktrack.cc, ccend); -if (private_data_size < 0) +if (!check_opcode_types(common, rootbacktrack.cc, ccend)) { SLJIT_FREE(common->optimized_cbracket); return; @@ -8926,21 +9032,23 @@ if (common->capture_last_ptr != 0) SLJIT_ASSERT(!(common->req_char_ptr != 0 && common->start_used_ptr != 0)); common->cbra_ptr = OVECTOR_START + (re->top_bracket + 1) * 2 * sizeof(sljit_sw); -private_data_size += common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw); -if (private_data_size > SLJIT_MAX_LOCAL_SIZE) + +common->private_data_ptrs = (int *)SLJIT_MALLOC((ccend - rootbacktrack.cc) * sizeof(sljit_si)); +if (!common->private_data_ptrs) { SLJIT_FREE(common->optimized_cbracket); return; } +memset(common->private_data_ptrs, 0, (ccend - rootbacktrack.cc) * sizeof(int)); -common->private_data_ptrs = (int *)SLJIT_MALLOC((ccend - rootbacktrack.cc) * sizeof(int)); -if (!common->private_data_ptrs) +private_data_size = common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw); +set_private_data_ptrs(common, &private_data_size, ccend); +if (private_data_size > SLJIT_MAX_LOCAL_SIZE) { + SLJIT_FREE(common->private_data_ptrs); SLJIT_FREE(common->optimized_cbracket); return; } -memset(common->private_data_ptrs, 0, (ccend - rootbacktrack.cc) * sizeof(int)); -set_private_data_ptrs(common, common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw), ccend); if (common->has_then) { diff --git a/pcre_jit_test.c b/pcre_jit_test.c index 1860323..c8daffb 100644 --- a/pcre_jit_test.c +++ b/pcre_jit_test.c @@ -308,6 +308,17 @@ static struct regression_test_case regression_test_cases[] = { { MUA, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" }, { MUA, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" }, + /* Bracket repeats with limit. */ + { MUA, 0, "(?:(ab){2}){5}M", "abababababababababababM" }, + { MUA, 0, "(?:ab|abab){1,5}M", "abababababababababababM" }, + { MUA, 0, "(?>ab|abab){1,5}M", "abababababababababababM" }, + { MUA, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" }, + { MUA, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" }, + { MUA, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" }, + { MUA, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" }, + { MUA, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" }, + { MUA, 0, "(ab){4,6}?M", "abababababababM" }, + /* Basic character sets. */ { MUA, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " }, { MUA, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" }, |