diff options
| author | Mathieu Pillard <m@virgule.net> | 2014-05-20 00:27:54 +0200 |
|---|---|---|
| committer | Mathieu Pillard <m@virgule.net> | 2014-05-20 00:27:54 +0200 |
| commit | e747dce3d7e04fe595bbfed54f9554c2725eb757 (patch) | |
| tree | f2c2e310e48062893f8224e08364bbff043afd84 /compressor/parser | |
| parent | 772ecd1ef2ce021d05cbc44eb8602f7d59db2c52 (diff) | |
| parent | 804c302495bd9d043f830ed012c76183eb5a1e2d (diff) | |
| download | django-compressor-1.4.tar.gz | |
Merge branch 'release/1.4' (1.4)
Diffstat (limited to 'compressor/parser')
| -rw-r--r-- | compressor/parser/__init__.py | 6 | ||||
| -rw-r--r-- | compressor/parser/beautifulsoup.py | 24 | ||||
| -rw-r--r-- | compressor/parser/default_htmlparser.py | 14 | ||||
| -rw-r--r-- | compressor/parser/html5lib.py | 37 | ||||
| -rw-r--r-- | compressor/parser/lxml.py | 54 |
5 files changed, 87 insertions, 48 deletions
diff --git a/compressor/parser/__init__.py b/compressor/parser/__init__.py index bc8c18c..a3fe78f 100644 --- a/compressor/parser/__init__.py +++ b/compressor/parser/__init__.py @@ -1,3 +1,4 @@ +from django.utils import six from django.utils.functional import LazyObject from django.utils.importlib import import_module @@ -11,8 +12,9 @@ from compressor.parser.html5lib import Html5LibParser # noqa class AutoSelectParser(LazyObject): options = ( - ('lxml.html', LxmlParser), # lxml, extremely fast - ('HTMLParser', HtmlParser), # fast and part of the Python stdlib + # TODO: make lxml.html parser first again + (six.moves.html_parser.__name__, HtmlParser), # fast and part of the Python stdlib + ('lxml.html', LxmlParser), # lxml, extremely fast ) def __init__(self, content): diff --git a/compressor/parser/beautifulsoup.py b/compressor/parser/beautifulsoup.py index 498cde8..d143df4 100644 --- a/compressor/parser/beautifulsoup.py +++ b/compressor/parser/beautifulsoup.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from django.core.exceptions import ImproperlyConfigured -from django.utils.encoding import smart_unicode +from django.utils import six +from django.utils.encoding import smart_text from compressor.exceptions import ParserError from compressor.parser import ParserBase @@ -12,18 +13,27 @@ class BeautifulSoupParser(ParserBase): @cached_property def soup(self): try: - from BeautifulSoup import BeautifulSoup + if six.PY3: + from bs4 import BeautifulSoup + else: + from BeautifulSoup import BeautifulSoup return BeautifulSoup(self.content) - except ImportError, err: + except ImportError as err: raise ImproperlyConfigured("Error while importing BeautifulSoup: %s" % err) - except Exception, err: + except Exception as err: raise ParserError("Error while initializing Parser: %s" % err) def css_elems(self): - return self.soup.findAll({'link': True, 'style': True}) + if six.PY3: + return self.soup.find_all({'link': True, 'style': True}) + else: + return 
self.soup.findAll({'link': True, 'style': True}) def js_elems(self): - return self.soup.findAll('script') + if six.PY3: + return self.soup.find_all('script') + else: + return self.soup.findAll('script') def elem_attribs(self, elem): return dict(elem.attrs) @@ -35,4 +45,4 @@ class BeautifulSoupParser(ParserBase): return elem.name def elem_str(self, elem): - return smart_unicode(elem) + return smart_text(elem) diff --git a/compressor/parser/default_htmlparser.py b/compressor/parser/default_htmlparser.py index 8425d77..80272cb 100644 --- a/compressor/parser/default_htmlparser.py +++ b/compressor/parser/default_htmlparser.py @@ -1,13 +1,13 @@ -from HTMLParser import HTMLParser -from django.utils.encoding import smart_unicode +from django.utils import six +from django.utils.encoding import smart_text + from compressor.exceptions import ParserError from compressor.parser import ParserBase -class DefaultHtmlParser(ParserBase, HTMLParser): - +class DefaultHtmlParser(ParserBase, six.moves.html_parser.HTMLParser): def __init__(self, content): - HTMLParser.__init__(self) + six.moves.html_parser.HTMLParser.__init__(self) self.content = content self._css_elems = [] self._js_elems = [] @@ -15,7 +15,7 @@ class DefaultHtmlParser(ParserBase, HTMLParser): try: self.feed(self.content) self.close() - except Exception, err: + except Exception as err: lineno = err.lineno line = self.content.splitlines()[lineno] raise ParserError("Error while initializing HtmlParser: %s (line: %s)" % (err, repr(line))) @@ -65,7 +65,7 @@ class DefaultHtmlParser(ParserBase, HTMLParser): return elem['attrs_dict'] def elem_content(self, elem): - return smart_unicode(elem['text']) + return smart_text(elem['text']) def elem_str(self, elem): tag = {} diff --git a/compressor/parser/html5lib.py b/compressor/parser/html5lib.py index 7fee590..b1d0948 100644 --- a/compressor/parser/html5lib.py +++ b/compressor/parser/html5lib.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from django.utils.encoding 
import smart_unicode from django.core.exceptions import ImproperlyConfigured +from django.utils.encoding import smart_text from compressor.exceptions import ParserError from compressor.parser import ParserBase @@ -15,42 +15,45 @@ class Html5LibParser(ParserBase): self.html5lib = html5lib def _serialize(self, elem): - fragment = self.html5lib.treebuilders.simpletree.DocumentFragment() - fragment.appendChild(elem) - return self.html5lib.serialize(fragment, - quote_attr_values=True, omit_optional_tags=False) + return self.html5lib.serialize( + elem, tree="etree", quote_attr_values=True, + omit_optional_tags=False, use_trailing_solidus=True, + ) def _find(self, *names): - for node in self.html.childNodes: - if node.type == 5 and node.name in names: - yield node + for elem in self.html: + if elem.tag in names: + yield elem @cached_property def html(self): try: - return self.html5lib.parseFragment(self.content) - except ImportError, err: + return self.html5lib.parseFragment(self.content, treebuilder="etree") + except ImportError as err: raise ImproperlyConfigured("Error while importing html5lib: %s" % err) - except Exception, err: + except Exception as err: raise ParserError("Error while initializing Parser: %s" % err) def css_elems(self): - return self._find('style', 'link') + return self._find('{http://www.w3.org/1999/xhtml}link', + '{http://www.w3.org/1999/xhtml}style') def js_elems(self): - return self._find('script') + return self._find('{http://www.w3.org/1999/xhtml}script') def elem_attribs(self, elem): - return elem.attributes + return elem.attrib def elem_content(self, elem): - return elem.childNodes[0].value + return smart_text(elem.text) def elem_name(self, elem): - return elem.name + if '}' in elem.tag: + return elem.tag.split('}')[1] + return elem.tag def elem_str(self, elem): # This method serializes HTML in a way that does not pass all tests. # However, this method is only called in tests anyway, so it doesn't # really matter. 
- return smart_unicode(self._serialize(elem)) + return smart_text(self._serialize(elem)) diff --git a/compressor/parser/lxml.py b/compressor/parser/lxml.py index 7bbb561..64a8fcb 100644 --- a/compressor/parser/lxml.py +++ b/compressor/parser/lxml.py @@ -1,6 +1,8 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals + from django.core.exceptions import ImproperlyConfigured -from django.utils.encoding import smart_unicode +from django.utils import six +from django.utils.encoding import smart_text from compressor.exceptions import ParserError from compressor.parser import ParserBase @@ -8,28 +10,50 @@ from compressor.utils.decorators import cached_property class LxmlParser(ParserBase): - + """ + LxmlParser will use `lxml.html` parser to parse rendered contents of + {% compress %} tag. Under python 2 it will also try to use beautiful + soup parser in case of any problems with encoding. + """ def __init__(self, content): try: - from lxml.html import fromstring, soupparser + from lxml.html import fromstring from lxml.etree import tostring - self.fromstring = fromstring - self.soupparser = soupparser - self.tostring = tostring - except ImportError, err: + except ImportError as err: raise ImproperlyConfigured("Error while importing lxml: %s" % err) - except Exception, err: - raise ParserError("Error while initializing Parser: %s" % err) + except Exception as err: + raise ParserError("Error while initializing parser: %s" % err) + + if not six.PY3: + # soupparser uses Beautiful Soup 3 which does not run on python 3.x + try: + from lxml.html import soupparser + except ImportError as err: + soupparser = None + except Exception as err: + raise ParserError("Error while initializing parser: %s" % err) + else: + soupparser = None + + self.soupparser = soupparser + self.fromstring = fromstring + self.tostring = tostring super(LxmlParser, self).__init__(content) @cached_property def tree(self): + """ + Document tree. 
+ """ content = '<root>%s</root>' % self.content tree = self.fromstring(content) try: - self.tostring(tree, encoding=unicode) + self.tostring(tree, encoding=six.text_type) except UnicodeDecodeError: - tree = self.soupparser.fromstring(content) + if self.soupparser: # use soup parser on python 2 + tree = self.soupparser.fromstring(content) + else: # raise an error on python 3 + raise return tree def css_elems(self): @@ -43,14 +67,14 @@ class LxmlParser(ParserBase): return elem.attrib def elem_content(self, elem): - return smart_unicode(elem.text) + return smart_text(elem.text) def elem_name(self, elem): return elem.tag def elem_str(self, elem): - elem_as_string = smart_unicode( - self.tostring(elem, method='html', encoding=unicode)) + elem_as_string = smart_text( + self.tostring(elem, method='html', encoding=six.text_type)) if elem.tag == 'link': # This makes testcases happy return elem_as_string.replace('>', ' />') |
