summaryrefslogtreecommitdiff
path: root/HTMLparser.c
diff options
context:
space:
mode:
authorNick Wellnhofer <wellnhofer@aevum.de>2021-03-04 18:46:11 +0100
committerNick Wellnhofer <wellnhofer@aevum.de>2021-03-04 19:22:35 +0100
commit8095365b77ca820dc565f3c37165d320db917ee1 (patch)
tree1ff350d5d2bfa5d0262491813734a38eac4ca7c6 /HTMLparser.c
parentb25acce858d4eea49b02b5d4e32708b914107b29 (diff)
downloadlibxml2-8095365b77ca820dc565f3c37165d320db917ee1.tar.gz
Speed up htmlCheckAutoClose
Switch to binary search.
Diffstat (limited to 'HTMLparser.c')
-rw-r--r--HTMLparser.c416
1 files changed, 280 insertions, 136 deletions
diff --git a/HTMLparser.c b/HTMLparser.c
index 376fbd71..e63e9b78 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -1072,102 +1072,266 @@ html40ElementTable[] = {
}
};
+typedef struct {
+ const char *oldTag;
+ const char *newTag;
+} htmlStartCloseEntry;
+
/*
* start tags that imply the end of current element
*/
-static const char * const htmlStartClose[] = {
-"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
- "dl", "ul", "ol", "menu", "dir", "address", "pre",
- "listing", "xmp", "head", NULL,
-"head", "p", NULL,
-"title", "p", NULL,
-"body", "head", "style", "link", "title", "p", NULL,
-"frameset", "head", "style", "link", "title", "p", NULL,
-"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
- "pre", "listing", "xmp", "head", "li", NULL,
-"hr", "p", "head", NULL,
-"h1", "p", "head", NULL,
-"h2", "p", "head", NULL,
-"h3", "p", "head", NULL,
-"h4", "p", "head", NULL,
-"h5", "p", "head", NULL,
-"h6", "p", "head", NULL,
-"dir", "p", "head", NULL,
-"address", "p", "head", "ul", NULL,
-"pre", "p", "head", "ul", NULL,
-"listing", "p", "head", NULL,
-"xmp", "p", "head", NULL,
-"blockquote", "p", "head", NULL,
-"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
- "xmp", "head", NULL,
-"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
- "head", "dd", NULL,
-"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
- "head", "dt", NULL,
-"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
- "listing", "xmp", NULL,
-"ol", "p", "head", "ul", NULL,
-"menu", "p", "head", "ul", NULL,
-"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
-"div", "p", "head", NULL,
-"noscript", "script", NULL,
-"center", "font", "b", "i", "p", "head", NULL,
-"a", "a", "head", NULL,
-"caption", "p", NULL,
-"colgroup", "caption", "colgroup", "col", "p", NULL,
-"col", "caption", "col", "p", NULL,
-"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
- "listing", "xmp", "a", NULL,
-"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
-"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
-"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
-"thead", "caption", "col", "colgroup", NULL,
-"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
- "tbody", "p", NULL,
-"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
- "tfoot", "tbody", "p", NULL,
-"optgroup", "option", NULL,
-"option", "option", NULL,
-"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
- "pre", "listing", "xmp", "a", NULL,
-/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
-"tt", "head", NULL,
-"i", "head", NULL,
-"b", "head", NULL,
-"u", "head", NULL,
-"s", "head", NULL,
-"strike", "head", NULL,
-"big", "head", NULL,
-"small", "head", NULL,
-
-"em", "head", NULL,
-"strong", "head", NULL,
-"dfn", "head", NULL,
-"code", "head", NULL,
-"samp", "head", NULL,
-"kbd", "head", NULL,
-"var", "head", NULL,
-"cite", "head", NULL,
-"abbr", "head", NULL,
-"acronym", "head", NULL,
-
-/* "a" */
-"img", "head", NULL,
-/* "applet" */
-/* "embed" */
-/* "object" */
-"font", "head", NULL,
-/* "basefont" */
-"br", "head", NULL,
-/* "script" */
-"map", "head", NULL,
-"q", "head", NULL,
-"sub", "head", NULL,
-"sup", "head", NULL,
-"span", "head", NULL,
-"bdo", "head", NULL,
-"iframe", "head", NULL,
-NULL
+static const htmlStartCloseEntry htmlStartClose[] = {
+ { "a", "a" },
+ { "a", "fieldset" },
+ { "a", "table" },
+ { "a", "td" },
+ { "a", "th" },
+ { "address", "dd" },
+ { "address", "dl" },
+ { "address", "dt" },
+ { "address", "form" },
+ { "address", "li" },
+ { "address", "ul" },
+ { "b", "center" },
+ { "b", "p" },
+ { "b", "td" },
+ { "b", "th" },
+ { "big", "p" },
+ { "caption", "col" },
+ { "caption", "colgroup" },
+ { "caption", "tbody" },
+ { "caption", "tfoot" },
+ { "caption", "thead" },
+ { "caption", "tr" },
+ { "col", "col" },
+ { "col", "colgroup" },
+ { "col", "tbody" },
+ { "col", "tfoot" },
+ { "col", "thead" },
+ { "col", "tr" },
+ { "colgroup", "colgroup" },
+ { "colgroup", "tbody" },
+ { "colgroup", "tfoot" },
+ { "colgroup", "thead" },
+ { "colgroup", "tr" },
+ { "dd", "dt" },
+ { "dir", "dd" },
+ { "dir", "dl" },
+ { "dir", "dt" },
+ { "dir", "form" },
+ { "dir", "ul" },
+ { "dl", "form" },
+ { "dl", "li" },
+ { "dt", "dd" },
+ { "dt", "dl" },
+ { "font", "center" },
+ { "font", "td" },
+ { "font", "th" },
+ { "form", "form" },
+ { "h1", "fieldset" },
+ { "h1", "form" },
+ { "h1", "li" },
+ { "h1", "p" },
+ { "h1", "table" },
+ { "h2", "fieldset" },
+ { "h2", "form" },
+ { "h2", "li" },
+ { "h2", "p" },
+ { "h2", "table" },
+ { "h3", "fieldset" },
+ { "h3", "form" },
+ { "h3", "li" },
+ { "h3", "p" },
+ { "h3", "table" },
+ { "h4", "fieldset" },
+ { "h4", "form" },
+ { "h4", "li" },
+ { "h4", "p" },
+ { "h4", "table" },
+ { "h5", "fieldset" },
+ { "h5", "form" },
+ { "h5", "li" },
+ { "h5", "p" },
+ { "h5", "table" },
+ { "h6", "fieldset" },
+ { "h6", "form" },
+ { "h6", "li" },
+ { "h6", "p" },
+ { "h6", "table" },
+ { "head", "a" },
+ { "head", "abbr" },
+ { "head", "acronym" },
+ { "head", "address" },
+ { "head", "b" },
+ { "head", "bdo" },
+ { "head", "big" },
+ { "head", "blockquote" },
+ { "head", "body" },
+ { "head", "br" },
+ { "head", "center" },
+ { "head", "cite" },
+ { "head", "code" },
+ { "head", "dd" },
+ { "head", "dfn" },
+ { "head", "dir" },
+ { "head", "div" },
+ { "head", "dl" },
+ { "head", "dt" },
+ { "head", "em" },
+ { "head", "fieldset" },
+ { "head", "font" },
+ { "head", "form" },
+ { "head", "frameset" },
+ { "head", "h1" },
+ { "head", "h2" },
+ { "head", "h3" },
+ { "head", "h4" },
+ { "head", "h5" },
+ { "head", "h6" },
+ { "head", "hr" },
+ { "head", "i" },
+ { "head", "iframe" },
+ { "head", "img" },
+ { "head", "kbd" },
+ { "head", "li" },
+ { "head", "listing" },
+ { "head", "map" },
+ { "head", "menu" },
+ { "head", "ol" },
+ { "head", "p" },
+ { "head", "pre" },
+ { "head", "q" },
+ { "head", "s" },
+ { "head", "samp" },
+ { "head", "small" },
+ { "head", "span" },
+ { "head", "strike" },
+ { "head", "strong" },
+ { "head", "sub" },
+ { "head", "sup" },
+ { "head", "table" },
+ { "head", "tt" },
+ { "head", "u" },
+ { "head", "ul" },
+ { "head", "var" },
+ { "head", "xmp" },
+ { "hr", "form" },
+ { "i", "center" },
+ { "i", "p" },
+ { "i", "td" },
+ { "i", "th" },
+ { "legend", "fieldset" },
+ { "li", "li" },
+ { "link", "body" },
+ { "link", "frameset" },
+ { "listing", "dd" },
+ { "listing", "dl" },
+ { "listing", "dt" },
+ { "listing", "fieldset" },
+ { "listing", "form" },
+ { "listing", "li" },
+ { "listing", "table" },
+ { "listing", "ul" },
+ { "menu", "dd" },
+ { "menu", "dl" },
+ { "menu", "dt" },
+ { "menu", "form" },
+ { "menu", "ul" },
+ { "ol", "form" },
+ { "ol", "ul" },
+ { "option", "optgroup" },
+ { "option", "option" },
+ { "p", "address" },
+ { "p", "blockquote" },
+ { "p", "body" },
+ { "p", "caption" },
+ { "p", "center" },
+ { "p", "col" },
+ { "p", "colgroup" },
+ { "p", "dd" },
+ { "p", "dir" },
+ { "p", "div" },
+ { "p", "dl" },
+ { "p", "dt" },
+ { "p", "fieldset" },
+ { "p", "form" },
+ { "p", "frameset" },
+ { "p", "h1" },
+ { "p", "h2" },
+ { "p", "h3" },
+ { "p", "h4" },
+ { "p", "h5" },
+ { "p", "h6" },
+ { "p", "head" },
+ { "p", "hr" },
+ { "p", "li" },
+ { "p", "listing" },
+ { "p", "menu" },
+ { "p", "ol" },
+ { "p", "p" },
+ { "p", "pre" },
+ { "p", "table" },
+ { "p", "tbody" },
+ { "p", "td" },
+ { "p", "tfoot" },
+ { "p", "th" },
+ { "p", "title" },
+ { "p", "tr" },
+ { "p", "ul" },
+ { "p", "xmp" },
+ { "pre", "dd" },
+ { "pre", "dl" },
+ { "pre", "dt" },
+ { "pre", "fieldset" },
+ { "pre", "form" },
+ { "pre", "li" },
+ { "pre", "table" },
+ { "pre", "ul" },
+ { "s", "p" },
+ { "script", "noscript" },
+ { "small", "p" },
+ { "span", "td" },
+ { "span", "th" },
+ { "strike", "p" },
+ { "style", "body" },
+ { "style", "frameset" },
+ { "tbody", "tbody" },
+ { "tbody", "tfoot" },
+ { "td", "tbody" },
+ { "td", "td" },
+ { "td", "tfoot" },
+ { "td", "th" },
+ { "td", "tr" },
+ { "tfoot", "tbody" },
+ { "th", "tbody" },
+ { "th", "td" },
+ { "th", "tfoot" },
+ { "th", "th" },
+ { "th", "tr" },
+ { "thead", "tbody" },
+ { "thead", "tfoot" },
+ { "title", "body" },
+ { "title", "frameset" },
+ { "tr", "tbody" },
+ { "tr", "tfoot" },
+ { "tr", "tr" },
+ { "tt", "p" },
+ { "u", "p" },
+ { "u", "td" },
+ { "u", "th" },
+ { "ul", "address" },
+ { "ul", "form" },
+ { "ul", "menu" },
+ { "ul", "ol" },
+ { "ul", "pre" },
+ { "xmp", "dd" },
+ { "xmp", "dl" },
+ { "xmp", "dt" },
+ { "xmp", "fieldset" },
+ { "xmp", "form" },
+ { "xmp", "li" },
+ { "xmp", "table" },
+ { "xmp", "ul" }
};
/*
@@ -1237,9 +1401,6 @@ static const elementPriority htmlEndPriority[] = {
{NULL, 100} /* Default priority */
};
-static const char** htmlStartCloseIndex[100];
-static int htmlStartCloseIndexinitialized = 0;
-
/************************************************************************
* *
* functions to handle HTML specific data *
@@ -1249,24 +1410,10 @@ static int htmlStartCloseIndexinitialized = 0;
/**
* htmlInitAutoClose:
*
- * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
- * This is not reentrant. Call xmlInitParser() once before processing in
- * case of use in multithreaded programs.
+ * This is a no-op now.
*/
void
htmlInitAutoClose(void) {
- int indx, i = 0;
-
- if (htmlStartCloseIndexinitialized) return;
-
- for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
- indx = 0;
- while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
- htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
- while (htmlStartClose[i] != NULL) i++;
- i++;
- }
- htmlStartCloseIndexinitialized = 1;
}
static int
@@ -1313,6 +1460,19 @@ htmlGetEndPriority (const xmlChar *name) {
}
+static int
+htmlCompareStartClose(const void *vkey, const void *member) {
+ const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
+ const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
+ int ret;
+
+ ret = strcmp(key->oldTag, entry->oldTag);
+ if (ret == 0)
+ ret = strcmp(key->newTag, entry->newTag);
+
+ return(ret);
+}
+
/**
* htmlCheckAutoClose:
* @newtag: The new tag name
@@ -1320,37 +1480,21 @@ htmlGetEndPriority (const xmlChar *name) {
*
* Checks whether the new tag is one of the registered valid tags for
* closing old.
- * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
*
* Returns 0 if no, 1 if yes.
*/
static int
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
{
- int i, indx;
- const char **closed = NULL;
-
- if (htmlStartCloseIndexinitialized == 0)
- htmlInitAutoClose();
-
- /* inefficient, but not a big deal */
- for (indx = 0; indx < 100; indx++) {
- closed = htmlStartCloseIndex[indx];
- if (closed == NULL)
- return (0);
- if (xmlStrEqual(BAD_CAST * closed, newtag))
- break;
- }
-
- i = closed - htmlStartClose;
- i++;
- while (htmlStartClose[i] != NULL) {
- if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
- return (1);
- }
- i++;
- }
- return (0);
+ htmlStartCloseEntry key;
+ void *res;
+
+ key.oldTag = (const char *) oldtag;
+ key.newTag = (const char *) newtag;
+ res = bsearch(&key, htmlStartClose,
+ sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
+ sizeof(htmlStartCloseEntry), htmlCompareStartClose);
+ return(res != NULL);
}
/**