summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@src.gnome.org>2006-03-21 23:17:57 +0000
committerDaniel Veillard <veillard@src.gnome.org>2006-03-21 23:17:57 +0000
commit54eb0243c442292953b4a3df39568735039ebc82 (patch)
treee0cbf73653c5d83a9d6148eba287f25537b213af
parentaac7c68e87d00732b319698723de1ec43252fb01 (diff)
downloadlibxml2-54eb0243c442292953b4a3df39568735039ebc82.tar.gz
applied patch from Youri Golovanov fixing bug #316338 and adding a couple
* xmlregexp.c: applied patch from Youri Golovanov fixing bug #316338 and adding a couple of optimizations in the regexp compilation engine. * test/regexp/bug316338 result/regexp/bug316338: added regression tests based on the examples provided in the bug report. Daniel
-rw-r--r--ChangeLog8
-rw-r--r--result/regexp/bug31633820
-rw-r--r--test/regexp/bug31633820
-rw-r--r--xmlregexp.c52
4 files changed, 82 insertions, 18 deletions
diff --git a/ChangeLog b/ChangeLog
index cca2cb31..9dbf8f97 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Wed Mar 22 00:14:34 CET 2006 Daniel Veillard <daniel@veillard.com>
+
+ * xmlregexp.c: applied patch from Youri Golovanov fixing bug
+ #316338 and adding a couple of optimizations in the regexp
+ compilation engine.
+ * test/regexp/bug316338 result/regexp/bug316338: added regression
+ tests based on the examples provided in the bug report.
+
Fri Mar 10 08:40:55 EST 2006 Daniel Veillard <daniel@veillard.com>
* c14n.c encoding.c xmlschemas.c xpath.c xpointer.c: fix a few
diff --git a/result/regexp/bug316338 b/result/regexp/bug316338
new file mode 100644
index 00000000..1cd1ac0e
--- /dev/null
+++ b/result/regexp/bug316338
@@ -0,0 +1,20 @@
+Regexp: (((C|c)(([\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+
+C 433: Ok
+C 433 12: Fail
+C 433 123: Ok
+C 433 123 456: Ok
+C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail
+Regexp: (((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+
+C 433: Fail
+C 433 12: Fail
+C 433 123: Fail
+C 433 123 456: Ok
+C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail
+Regexp: (((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?){3})+[\s]*))+
+C 433: Fail
+C 433 12: Fail
+C 433 123: Fail
+C 433 123 456: Fail
+C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail
+Regexp: (((M|m)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)[\s]*)|((L|l)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((H|h)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((V|v)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*)|((Q|q)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((S|s)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((A|a)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]+[0-1][\s]+[0-1][\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((Z|z)[\s]*))*
+M 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12: Fail
diff --git a/test/regexp/bug316338 b/test/regexp/bug316338
new file mode 100644
index 00000000..98c78385
--- /dev/null
+++ b/test/regexp/bug316338
@@ -0,0 +1,20 @@
+=>(((C|c)(([\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+
+C 433
+C 433 12
+C 433 123
+C 433 123 456
+C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12
+=>(((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*))+
+C 433
+C 433 12
+C 433 123
+C 433 123 456
+C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12
+=>(((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?){3})+[\s]*))+
+C 433
+C 433 12
+C 433 123
+C 433 123 456
+C 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12
+=>(((M|m)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)[\s]*)|((L|l)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((H|h)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((V|v)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((C|c)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){3})+[\s]*)|((Q|q)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((S|s)(([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?){2})+[\s]*)|((A|a)([\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]+[0-1][\s]+[0-1][\s]+\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?[\s]*,[\s]*\-?[0-9]+(\.[0-9]+)?((e|E)\-?[0-9]+)?)+[\s]*)|((Z|z)[\s]*))*
+M 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12 724.64,433.12 718.08,433.12 711.52,433.12
diff --git a/xmlregexp.c b/xmlregexp.c
index 8d7ee97f..58f480da 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -1479,7 +1479,13 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
switch (atom->quant) {
case XML_REGEXP_QUANT_OPT:
atom->quant = XML_REGEXP_QUANT_ONCE;
- xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
+ /*
+ * transition done to the state after end of atom.
+ * 1. set transition from atom start to new state
+ * 2. set transition from atom end to this state.
+ */
+ xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
+ xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state);
break;
case XML_REGEXP_QUANT_MULT:
atom->quant = XML_REGEXP_QUANT_ONCE;
@@ -1522,8 +1528,6 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
atom->min = 0;
atom->max = 0;
atom->quant = XML_REGEXP_QUANT_ONCE;
- xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
- atom->start, counter);
if (to != NULL) {
newstate = to;
} else {
@@ -1533,6 +1537,13 @@ xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
ctxt->state = newstate;
xmlFAGenerateCountedTransition(ctxt, atom->stop,
newstate, counter);
+
+ /*
+ * first check count and if OK jump forward;
+ * if checking fail increment count and jump back
+ */
+ xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
+ atom->start, counter);
}
default:
break;
@@ -4299,6 +4310,15 @@ xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
if (exec->state->nbTrans > exec->transno + 1) {
xmlFARegExecSave(exec);
}
+ /*
+ * restart count for expressions like this ((abc){2})*
+ */
+ if (trans->count >= 0) {
+#ifdef DEBUG_REGEXP_EXEC
+ printf("Reset count %d\n", trans->count);
+#endif
+ exec->counts[trans->count] = 0;
+ }
if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
printf("Increasing count %d\n", trans->counter);
@@ -5112,19 +5132,23 @@ xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
/**
* xmlFAParseBranch:
* @ctxt: a regexp parser context
+ * @to: optional target to the end of the branch
+ *
+ * @to is used to optimize by removing duplicate path in automata
+ * in expressions like (a|b)(c|d)
*
* [2] branch ::= piece*
- 8
*/
static int
-xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
+xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
xmlRegStatePtr previous;
int ret;
previous = ctxt->state;
ret = xmlFAParsePiece(ctxt);
if (ret != 0) {
- if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
+ if (xmlFAGenerateTransitions(ctxt, previous,
+ (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
return(-1);
previous = ctxt->state;
ctxt->atom = NULL;
@@ -5132,8 +5156,8 @@ xmlFAParseBranch(xmlRegParserCtxtPtr ctxt) {
while ((ret != 0) && (ctxt->error == 0)) {
ret = xmlFAParsePiece(ctxt);
if (ret != 0) {
- if (xmlFAGenerateTransitions(ctxt, previous, NULL,
- ctxt->atom) < 0)
+ if (xmlFAGenerateTransitions(ctxt, previous,
+ (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0)
return(-1);
previous = ctxt->state;
ctxt->atom = NULL;
@@ -5156,7 +5180,7 @@ xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
/* if not top start should have been generated by an epsilon trans */
start = ctxt->state;
ctxt->end = NULL;
- xmlFAParseBranch(ctxt);
+ xmlFAParseBranch(ctxt, NULL);
if (top) {
#ifdef DEBUG_REGEXP_GRAPH
printf("State %d is final\n", ctxt->state->no);
@@ -5172,15 +5196,7 @@ xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
NEXT;
ctxt->state = start;
ctxt->end = NULL;
- xmlFAParseBranch(ctxt);
- if (top) {
- ctxt->state->type = XML_REGEXP_FINAL_STATE;
-#ifdef DEBUG_REGEXP_GRAPH
- printf("State %d is final\n", ctxt->state->no);
-#endif
- } else {
- xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, end);
- }
+ xmlFAParseBranch(ctxt, end);
}
if (!top) {
ctxt->state = end;