From 432340af8dca83a4ddb5d4599f2ae8276669ab1c Mon Sep 17 00:00:00 2001 From: Koert van der Veer Date: Thu, 16 Mar 2017 10:07:38 +0100 Subject: Build a retry mechanism around html5lib's unpredictable useChardet support Closes LP1654544 --- src/lxml/html/html5parser.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'src/lxml/html/html5parser.py') diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py index 7188c7ea..3524535c 100644 --- a/src/lxml/html/html5parser.py +++ b/src/lxml/html/html5parser.py @@ -7,9 +7,8 @@ import string from html5lib import HTMLParser as _HTMLParser from html5lib.treebuilders.etree_lxml import TreeBuilder - from lxml import etree -from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element +from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag # python3 compatibility try: @@ -25,12 +24,41 @@ try: except ImportError: from urllib.parse import urlparse +def _dodgeUseChardet(method_name): + # html5lib does not accept useChardet as an argument, if it + # detected the html argument would produce unicode objects. + # However, there is no reasonable way to predict if html5lib will + # detect the argument to be unicode (all of that code is private), + # so we'll have to settle for a retry. + + # this function creates a wrapper around the specified object, which + # retries when html5lib complains about the useChardet argument + + def inner(self, *args, **kwargs): + callee = getattr(super(type(self), self), method_name) + try: + return callee(*args, **kwargs) + except TypeError: + exception = sys.exc_info()[1] + if "'useChardet'" not in str(exception): + # Some other issue caused the exception. 
Tell the caller + raise + kwargs.pop('useChardet') + return callee(*args, **kwargs) + inner.__name__ = method_name + return inner + + + class HTMLParser(_HTMLParser): """An html5lib HTML parser with lxml as tree.""" def __init__(self, strict=False, **kwargs): _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) + parse = _dodgeUseChardet('parse') + parseFragment = _dodgeUseChardet('parseFragment') + try: from html5lib import XHTMLParser as _XHTMLParser @@ -43,6 +71,9 @@ else: def __init__(self, strict=False, **kwargs): _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) + parse = _dodgeUseChardet('parse') + parseFragment = _dodgeUseChardet('parseFragment') + xhtml_parser = XHTMLParser() -- cgit v1.2.1 From 656984a011298de68703e87cfb828878d809c882 Mon Sep 17 00:00:00 2001 From: Koert van der Veer Date: Thu, 16 Mar 2017 10:08:58 +0100 Subject: Make sure the html5lib tests are included in CI --- src/lxml/html/html5parser.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'src/lxml/html/html5parser.py') diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py index 3524535c..3ef7915f 100644 --- a/src/lxml/html/html5parser.py +++ b/src/lxml/html/html5parser.py @@ -1,7 +1,7 @@ """ An interface to html5lib that mimics the lxml.html interface. """ - +import functools import sys import string @@ -24,56 +24,58 @@ try: except ImportError: from urllib.parse import urlparse -def _dodgeUseChardet(method_name): + +def _dodgeUseChardet(fn): # html5lib does not accept useChardet as an argument, if it # detected the html argument would produce unicode objects. # However, there is no reasonable way to predict if html5lib will # detect the argument to be unicode (all of that code is private), # so we'll have to settle for a retry. 
- # this function creates a wrapper around the specified object, which - # retries when html5lib complains about the useChardet argument - - def inner(self, *args, **kwargs): - callee = getattr(super(type(self), self), method_name) + # this decorator wraps around a method, which is retried when html5lib + # complains about the useChardet argument + @functools.wraps(fn) + def inner(*args, **kwargs): try: - return callee(*args, **kwargs) - except TypeError: - exception = sys.exc_info()[1] + return fn(*args, **kwargs) + except TypeError as exception: if "'useChardet'" not in str(exception): # Some other issue caused the exception. Tell the caller raise kwargs.pop('useChardet') - return callee(*args, **kwargs) - inner.__name__ = method_name + return fn(*args, **kwargs) return inner +class _DodgeUseChardetMixin: + + @_dodgeUseChardet + def parse(self, *args, **kwargs): + return super(_DodgeUseChardetMixin, self).parse(*args, **kwargs) -class HTMLParser(_HTMLParser): + @_dodgeUseChardet + def parseFragment(self, *args, **kwargs): + return super(_DodgeUseChardetMixin, self).parseFragment(*args, **kwargs) + + +class HTMLParser(_DodgeUseChardetMixin, _HTMLParser): """An html5lib HTML parser with lxml as tree.""" def __init__(self, strict=False, **kwargs): _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) - parse = _dodgeUseChardet('parse') - parseFragment = _dodgeUseChardet('parseFragment') - try: from html5lib import XHTMLParser as _XHTMLParser except ImportError: pass else: - class XHTMLParser(_XHTMLParser): + class XHTMLParser(_DodgeUseChardetMixin, _XHTMLParser): """An html5lib XHTML Parser with lxml as tree.""" def __init__(self, strict=False, **kwargs): _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs) - parse = _dodgeUseChardet('parse') - parseFragment = _dodgeUseChardet('parseFragment') - xhtml_parser = XHTMLParser() -- cgit v1.2.1