From 8095365b77ca820dc565f3c37165d320db917ee1 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Thu, 4 Mar 2021 18:46:11 +0100 Subject: Speed up htmlCheckAutoClose Switch to binary search. --- HTMLparser.c | 416 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 280 insertions(+), 136 deletions(-) (limited to 'HTMLparser.c') diff --git a/HTMLparser.c b/HTMLparser.c index 376fbd71..e63e9b78 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -1072,102 +1072,266 @@ html40ElementTable[] = { } }; +typedef struct { + const char *oldTag; + const char *newTag; +} htmlStartCloseEntry; + /* * start tags that imply the end of current element */ -static const char * const htmlStartClose[] = { -"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", - "dl", "ul", "ol", "menu", "dir", "address", "pre", - "listing", "xmp", "head", NULL, -"head", "p", NULL, -"title", "p", NULL, -"body", "head", "style", "link", "title", "p", NULL, -"frameset", "head", "style", "link", "title", "p", NULL, -"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", - "pre", "listing", "xmp", "head", "li", NULL, -"hr", "p", "head", NULL, -"h1", "p", "head", NULL, -"h2", "p", "head", NULL, -"h3", "p", "head", NULL, -"h4", "p", "head", NULL, -"h5", "p", "head", NULL, -"h6", "p", "head", NULL, -"dir", "p", "head", NULL, -"address", "p", "head", "ul", NULL, -"pre", "p", "head", "ul", NULL, -"listing", "p", "head", NULL, -"xmp", "p", "head", NULL, -"blockquote", "p", "head", NULL, -"dl", "p", "dt", "menu", "dir", "address", "pre", "listing", - "xmp", "head", NULL, -"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", - "head", "dd", NULL, -"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", - "head", "dt", NULL, -"ul", "p", "head", "ol", "menu", "dir", "address", "pre", - "listing", "xmp", NULL, -"ol", "p", "head", "ul", NULL, -"menu", "p", "head", "ul", NULL, -"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, -"div", "p", "head", NULL, -"noscript", "script", NULL, -"center", "font", "b", "i", "p", "head", NULL, -"a", "a", "head", NULL, -"caption", "p", NULL, -"colgroup", "caption", "colgroup", "col", "p", NULL, -"col", "caption", "col", "p", NULL, -"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", - "listing", "xmp", "a", NULL, -"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, -"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, -"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, -"thead", "caption", "col", "colgroup", NULL, -"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", - "tbody", "p", NULL, -"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", - "tfoot", "tbody", "p", NULL, -"optgroup", "option", NULL, -"option", "option", NULL, -"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", - "pre", "listing", "xmp", "a", NULL, -/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close */ -"tt", "head", NULL, -"i", "head", NULL, -"b", "head", NULL, -"u", "head", NULL, -"s", "head", NULL, -"strike", "head", NULL, -"big", "head", NULL, -"small", "head", NULL, - -"em", "head", NULL, -"strong", "head", NULL, -"dfn", "head", NULL, -"code", "head", NULL, -"samp", "head", NULL, -"kbd", "head", NULL, -"var", "head", NULL, -"cite", "head", NULL, -"abbr", "head", NULL, -"acronym", "head", NULL, - -/* "a" */ -"img", "head", NULL, -/* "applet" */ -/* "embed" */ -/* "object" */ -"font", "head", NULL, -/* "basefont" */ -"br", "head", NULL, -/* "script" */ -"map", "head", NULL, -"q", "head", NULL, -"sub", "head", NULL, -"sup", "head", NULL, -"span", "head", NULL, -"bdo", "head", NULL, -"iframe", "head", NULL, -NULL +static const htmlStartCloseEntry htmlStartClose[] = { + { "a", "a" }, + { "a", "fieldset" }, + { "a", "table" }, + { "a", "td" }, + { "a", "th" }, + { "address", "dd" }, + { "address", "dl" }, + { "address", "dt" }, + { "address", "form" }, + { "address", "li" }, + { "address", "ul" }, + { "b", "center" }, + { "b", "p" }, + { "b", "td" }, + { "b", "th" }, + { "big", "p" }, + { "caption", "col" }, + { "caption", "colgroup" }, + { "caption", "tbody" }, + { "caption", "tfoot" }, + { "caption", "thead" }, + { "caption", "tr" }, + { "col", "col" }, + { "col", "colgroup" }, + { "col", "tbody" }, + { "col", "tfoot" }, + { "col", "thead" }, + { "col", "tr" }, + { "colgroup", "colgroup" }, + { "colgroup", "tbody" }, + { "colgroup", "tfoot" }, + { "colgroup", "thead" }, + { "colgroup", "tr" }, + { "dd", "dt" }, + { "dir", "dd" }, + { "dir", "dl" }, + { "dir", "dt" }, + { "dir", "form" }, + { "dir", "ul" }, + { "dl", "form" }, + { "dl", "li" }, + { "dt", "dd" }, + { "dt", "dl" }, + { "font", "center" }, + { "font", "td" }, + { "font", "th" }, + { "form", "form" }, + { "h1", "fieldset" }, + { "h1", "form" }, + { "h1", "li" }, + { "h1", "p" }, + { "h1", "table" }, + { "h2", "fieldset" }, + { "h2", "form" }, + { "h2", "li" }, + { "h2", "p" }, + { "h2", "table" }, + { "h3", "fieldset" }, + { "h3", "form" }, + { "h3", "li" }, + { "h3", "p" }, + { "h3", "table" }, + { "h4", "fieldset" }, + { "h4", "form" }, + { "h4", "li" }, + { "h4", "p" }, + { "h4", "table" }, + { "h5", "fieldset" }, + { "h5", "form" }, + { "h5", "li" }, + { "h5", "p" }, + { "h5", "table" }, + { "h6", "fieldset" }, + { "h6", "form" }, + { "h6", "li" }, + { "h6", "p" }, + { "h6", "table" }, + { "head", "a" }, + { "head", "abbr" }, + { "head", "acronym" }, + { "head", "address" }, + { "head", "b" }, + { "head", "bdo" }, + { "head", "big" }, + { "head", "blockquote" }, + { "head", "body" }, + { "head", "br" }, + { "head", "center" }, + { "head", "cite" }, + { "head", "code" }, + { "head", "dd" }, + { "head", "dfn" }, + { "head", "dir" }, + { "head", "div" }, + { "head", "dl" }, + { "head", "dt" }, + { "head", "em" }, + { "head", "fieldset" }, + { "head", "font" }, + { "head", "form" }, + { "head", "frameset" }, + { "head", "h1" }, + { "head", "h2" }, + { "head", "h3" }, + { "head", "h4" }, + { "head", "h5" }, + { "head", "h6" }, + { "head", "hr" }, + { "head", "i" }, + { "head", "iframe" }, + { "head", "img" }, + { "head", "kbd" }, + { "head", "li" }, + { "head", "listing" }, + { "head", "map" }, + { "head", "menu" }, + { "head", "ol" }, + { "head", "p" }, + { "head", "pre" }, + { "head", "q" }, + { "head", "s" }, + { "head", "samp" }, + { "head", "small" }, + { "head", "span" }, + { "head", "strike" }, + { "head", "strong" }, + { "head", "sub" }, + { "head", "sup" }, + { "head", "table" }, + { "head", "tt" }, + { "head", "u" }, + { "head", "ul" }, + { "head", "var" }, + { "head", "xmp" }, + { "hr", "form" }, + { "i", "center" }, + { "i", "p" }, + { "i", "td" }, + { "i", "th" }, + { "legend", "fieldset" }, + { "li", "li" }, + { "link", "body" }, + { "link", "frameset" }, + { "listing", "dd" }, + { "listing", "dl" }, + { "listing", "dt" }, + { "listing", "fieldset" }, + { "listing", "form" }, + { "listing", "li" }, + { "listing", "table" }, + { "listing", "ul" }, + { "menu", "dd" }, + { "menu", "dl" }, + { "menu", "dt" }, + { "menu", "form" }, + { "menu", "ul" }, + { "ol", "form" }, + { "ol", "ul" }, + { "option", "optgroup" }, + { "option", "option" }, + { "p", "address" }, + { "p", "blockquote" }, + { "p", "body" }, + { "p", "caption" }, + { "p", "center" }, + { "p", "col" }, + { "p", "colgroup" }, + { "p", "dd" }, + { "p", "dir" }, + { "p", "div" }, + { "p", "dl" }, + { "p", "dt" }, + { "p", "fieldset" }, + { "p", "form" }, + { "p", "frameset" }, + { "p", "h1" }, + { "p", "h2" }, + { "p", "h3" }, + { "p", "h4" }, + { "p", "h5" }, + { "p", "h6" }, + { "p", "head" }, + { "p", "hr" }, + { "p", "li" }, + { "p", "listing" }, + { "p", "menu" }, + { "p", "ol" }, + { "p", "p" }, + { "p", "pre" }, + { "p", "table" }, + { "p", "tbody" }, + { "p", "td" }, + { "p", "tfoot" }, + { "p", "th" }, + { "p", "title" }, + { "p", "tr" }, + { "p", "ul" }, + { "p", "xmp" }, + { "pre", "dd" }, + { "pre", "dl" }, + { "pre", "dt" }, + { "pre", "fieldset" }, + { "pre", "form" }, + { "pre", "li" }, + { "pre", "table" }, + { "pre", "ul" }, + { "s", "p" }, + { "script", "noscript" }, + { "small", "p" }, + { "span", "td" }, + { "span", "th" }, + { "strike", "p" }, + { "style", "body" }, + { "style", "frameset" }, + { "tbody", "tbody" }, + { "tbody", "tfoot" }, + { "td", "tbody" }, + { "td", "td" }, + { "td", "tfoot" }, + { "td", "th" }, + { "td", "tr" }, + { "tfoot", "tbody" }, + { "th", "tbody" }, + { "th", "td" }, + { "th", "tfoot" }, + { "th", "th" }, + { "th", "tr" }, + { "thead", "tbody" }, + { "thead", "tfoot" }, + { "title", "body" }, + { "title", "frameset" }, + { "tr", "tbody" }, + { "tr", "tfoot" }, + { "tr", "tr" }, + { "tt", "p" }, + { "u", "p" }, + { "u", "td" }, + { "u", "th" }, + { "ul", "address" }, + { "ul", "form" }, + { "ul", "menu" }, + { "ul", "ol" }, + { "ul", "pre" }, + { "xmp", "dd" }, + { "xmp", "dl" }, + { "xmp", "dt" }, + { "xmp", "fieldset" }, + { "xmp", "form" }, + { "xmp", "li" }, + { "xmp", "table" }, + { "xmp", "ul" } }; /* @@ -1237,9 +1401,6 @@ static const elementPriority htmlEndPriority[] = { {NULL, 100} /* Default priority */ }; -static const char** htmlStartCloseIndex[100]; -static int htmlStartCloseIndexinitialized = 0; - /************************************************************************ * * * functions to handle HTML specific data * @@ -1249,24 +1410,10 @@ static int htmlStartCloseIndexinitialized = 0; /** * htmlInitAutoClose: * - * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. - * This is not reentrant. Call xmlInitParser() once before processing in - * case of use in multithreaded programs. + * This is a no-op now. */ void htmlInitAutoClose(void) { - int indx, i = 0; - - if (htmlStartCloseIndexinitialized) return; - - for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; - indx = 0; - while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { - htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; - while (htmlStartClose[i] != NULL) i++; - i++; - } - htmlStartCloseIndexinitialized = 1; } static int @@ -1313,6 +1460,19 @@ htmlGetEndPriority (const xmlChar *name) { } +static int +htmlCompareStartClose(const void *vkey, const void *member) { + const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey; + const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member; + int ret; + + ret = strcmp(key->oldTag, entry->oldTag); + if (ret == 0) + ret = strcmp(key->newTag, entry->newTag); + + return(ret); +} + /** * htmlCheckAutoClose: * @newtag: The new tag name @@ -1320,37 +1480,21 @@ htmlGetEndPriority (const xmlChar *name) { * * Checks whether the new tag is one of the registered valid tags for * closing old. - * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. * * Returns 0 if no, 1 if yes. */ static int htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) { - int i, indx; - const char **closed = NULL; - - if (htmlStartCloseIndexinitialized == 0) - htmlInitAutoClose(); - - /* inefficient, but not a big deal */ - for (indx = 0; indx < 100; indx++) { - closed = htmlStartCloseIndex[indx]; - if (closed == NULL) - return (0); - if (xmlStrEqual(BAD_CAST * closed, newtag)) - break; - } - - i = closed - htmlStartClose; - i++; - while (htmlStartClose[i] != NULL) { - if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { - return (1); - } - i++; - } - return (0); + htmlStartCloseEntry key; + void *res; + + key.oldTag = (const char *) oldtag; + key.newTag = (const char *) newtag; + res = bsearch(&key, htmlStartClose, + sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry), + sizeof(htmlStartCloseEntry), htmlCompareStartClose); + return(res != NULL); } /** -- cgit v1.2.1