summary | refs | log | tree | commit | diff
path: root/compressor/parser
diff options
context:
space:
mode:
Diffstat (limited to 'compressor/parser')
-rw-r--r-- compressor/parser/__init__.py 6
-rw-r--r-- compressor/parser/beautifulsoup.py 24
-rw-r--r-- compressor/parser/default_htmlparser.py 14
-rw-r--r-- compressor/parser/html5lib.py 37
-rw-r--r-- compressor/parser/lxml.py 54
5 files changed, 87 insertions, 48 deletions
diff --git a/compressor/parser/__init__.py b/compressor/parser/__init__.py
index bc8c18c..a3fe78f 100644
--- a/compressor/parser/__init__.py
+++ b/compressor/parser/__init__.py
@@ -1,3 +1,4 @@
+from django.utils import six
from django.utils.functional import LazyObject
from django.utils.importlib import import_module
@@ -11,8 +12,9 @@ from compressor.parser.html5lib import Html5LibParser # noqa
class AutoSelectParser(LazyObject):
options = (
- ('lxml.html', LxmlParser), # lxml, extremely fast
- ('HTMLParser', HtmlParser), # fast and part of the Python stdlib
+ # TODO: make lxml.html parser first again
+ (six.moves.html_parser.__name__, HtmlParser), # fast and part of the Python stdlib
+ ('lxml.html', LxmlParser), # lxml, extremely fast
)
def __init__(self, content):
diff --git a/compressor/parser/beautifulsoup.py b/compressor/parser/beautifulsoup.py
index 498cde8..d143df4 100644
--- a/compressor/parser/beautifulsoup.py
+++ b/compressor/parser/beautifulsoup.py
@@ -1,6 +1,7 @@
from __future__ import absolute_import
from django.core.exceptions import ImproperlyConfigured
-from django.utils.encoding import smart_unicode
+from django.utils import six
+from django.utils.encoding import smart_text
from compressor.exceptions import ParserError
from compressor.parser import ParserBase
@@ -12,18 +13,27 @@ class BeautifulSoupParser(ParserBase):
@cached_property
def soup(self):
try:
- from BeautifulSoup import BeautifulSoup
+ if six.PY3:
+ from bs4 import BeautifulSoup
+ else:
+ from BeautifulSoup import BeautifulSoup
return BeautifulSoup(self.content)
- except ImportError, err:
+ except ImportError as err:
raise ImproperlyConfigured("Error while importing BeautifulSoup: %s" % err)
- except Exception, err:
+ except Exception as err:
raise ParserError("Error while initializing Parser: %s" % err)
def css_elems(self):
- return self.soup.findAll({'link': True, 'style': True})
+ if six.PY3:
+ return self.soup.find_all({'link': True, 'style': True})
+ else:
+ return self.soup.findAll({'link': True, 'style': True})
def js_elems(self):
- return self.soup.findAll('script')
+ if six.PY3:
+ return self.soup.find_all('script')
+ else:
+ return self.soup.findAll('script')
def elem_attribs(self, elem):
return dict(elem.attrs)
@@ -35,4 +45,4 @@ class BeautifulSoupParser(ParserBase):
return elem.name
def elem_str(self, elem):
- return smart_unicode(elem)
+ return smart_text(elem)
diff --git a/compressor/parser/default_htmlparser.py b/compressor/parser/default_htmlparser.py
index 8425d77..80272cb 100644
--- a/compressor/parser/default_htmlparser.py
+++ b/compressor/parser/default_htmlparser.py
@@ -1,13 +1,13 @@
-from HTMLParser import HTMLParser
-from django.utils.encoding import smart_unicode
+from django.utils import six
+from django.utils.encoding import smart_text
+
from compressor.exceptions import ParserError
from compressor.parser import ParserBase
-class DefaultHtmlParser(ParserBase, HTMLParser):
-
+class DefaultHtmlParser(ParserBase, six.moves.html_parser.HTMLParser):
def __init__(self, content):
- HTMLParser.__init__(self)
+ six.moves.html_parser.HTMLParser.__init__(self)
self.content = content
self._css_elems = []
self._js_elems = []
@@ -15,7 +15,7 @@ class DefaultHtmlParser(ParserBase, HTMLParser):
try:
self.feed(self.content)
self.close()
- except Exception, err:
+ except Exception as err:
lineno = err.lineno
line = self.content.splitlines()[lineno]
raise ParserError("Error while initializing HtmlParser: %s (line: %s)" % (err, repr(line)))
@@ -65,7 +65,7 @@ class DefaultHtmlParser(ParserBase, HTMLParser):
return elem['attrs_dict']
def elem_content(self, elem):
- return smart_unicode(elem['text'])
+ return smart_text(elem['text'])
def elem_str(self, elem):
tag = {}
diff --git a/compressor/parser/html5lib.py b/compressor/parser/html5lib.py
index 7fee590..b1d0948 100644
--- a/compressor/parser/html5lib.py
+++ b/compressor/parser/html5lib.py
@@ -1,6 +1,6 @@
from __future__ import absolute_import
-from django.utils.encoding import smart_unicode
from django.core.exceptions import ImproperlyConfigured
+from django.utils.encoding import smart_text
from compressor.exceptions import ParserError
from compressor.parser import ParserBase
@@ -15,42 +15,45 @@ class Html5LibParser(ParserBase):
self.html5lib = html5lib
def _serialize(self, elem):
- fragment = self.html5lib.treebuilders.simpletree.DocumentFragment()
- fragment.appendChild(elem)
- return self.html5lib.serialize(fragment,
- quote_attr_values=True, omit_optional_tags=False)
+ return self.html5lib.serialize(
+ elem, tree="etree", quote_attr_values=True,
+ omit_optional_tags=False, use_trailing_solidus=True,
+ )
def _find(self, *names):
- for node in self.html.childNodes:
- if node.type == 5 and node.name in names:
- yield node
+ for elem in self.html:
+ if elem.tag in names:
+ yield elem
@cached_property
def html(self):
try:
- return self.html5lib.parseFragment(self.content)
- except ImportError, err:
+ return self.html5lib.parseFragment(self.content, treebuilder="etree")
+ except ImportError as err:
raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
- except Exception, err:
+ except Exception as err:
raise ParserError("Error while initializing Parser: %s" % err)
def css_elems(self):
- return self._find('style', 'link')
+ return self._find('{http://www.w3.org/1999/xhtml}link',
+ '{http://www.w3.org/1999/xhtml}style')
def js_elems(self):
- return self._find('script')
+ return self._find('{http://www.w3.org/1999/xhtml}script')
def elem_attribs(self, elem):
- return elem.attributes
+ return elem.attrib
def elem_content(self, elem):
- return elem.childNodes[0].value
+ return smart_text(elem.text)
def elem_name(self, elem):
- return elem.name
+ if '}' in elem.tag:
+ return elem.tag.split('}')[1]
+ return elem.tag
def elem_str(self, elem):
# This method serializes HTML in a way that does not pass all tests.
# However, this method is only called in tests anyway, so it doesn't
# really matter.
- return smart_unicode(self._serialize(elem))
+ return smart_text(self._serialize(elem))
diff --git a/compressor/parser/lxml.py b/compressor/parser/lxml.py
index 7bbb561..64a8fcb 100644
--- a/compressor/parser/lxml.py
+++ b/compressor/parser/lxml.py
@@ -1,6 +1,8 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
+
from django.core.exceptions import ImproperlyConfigured
-from django.utils.encoding import smart_unicode
+from django.utils import six
+from django.utils.encoding import smart_text
from compressor.exceptions import ParserError
from compressor.parser import ParserBase
@@ -8,28 +10,50 @@ from compressor.utils.decorators import cached_property
class LxmlParser(ParserBase):
-
+ """
+ LxmlParser will use `lxml.html` parser to parse rendered contents of
+ {% compress %} tag. Under python 2 it will also try to use beautiful
+ soup parser in case of any problems with encoding.
+ """
def __init__(self, content):
try:
- from lxml.html import fromstring, soupparser
+ from lxml.html import fromstring
from lxml.etree import tostring
- self.fromstring = fromstring
- self.soupparser = soupparser
- self.tostring = tostring
- except ImportError, err:
+ except ImportError as err:
raise ImproperlyConfigured("Error while importing lxml: %s" % err)
- except Exception, err:
- raise ParserError("Error while initializing Parser: %s" % err)
+ except Exception as err:
+ raise ParserError("Error while initializing parser: %s" % err)
+
+ if not six.PY3:
+ # soupparser uses Beautiful Soup 3 which does not run on python 3.x
+ try:
+ from lxml.html import soupparser
+ except ImportError as err:
+ soupparser = None
+ except Exception as err:
+ raise ParserError("Error while initializing parser: %s" % err)
+ else:
+ soupparser = None
+
+ self.soupparser = soupparser
+ self.fromstring = fromstring
+ self.tostring = tostring
super(LxmlParser, self).__init__(content)
@cached_property
def tree(self):
+ """
+ Document tree.
+ """
content = '<root>%s</root>' % self.content
tree = self.fromstring(content)
try:
- self.tostring(tree, encoding=unicode)
+ self.tostring(tree, encoding=six.text_type)
except UnicodeDecodeError:
- tree = self.soupparser.fromstring(content)
+ if self.soupparser: # use soup parser on python 2
+ tree = self.soupparser.fromstring(content)
+ else: # raise an error on python 3
+ raise
return tree
def css_elems(self):
@@ -43,14 +67,14 @@ class LxmlParser(ParserBase):
return elem.attrib
def elem_content(self, elem):
- return smart_unicode(elem.text)
+ return smart_text(elem.text)
def elem_name(self, elem):
return elem.tag
def elem_str(self, elem):
- elem_as_string = smart_unicode(
- self.tostring(elem, method='html', encoding=unicode))
+ elem_as_string = smart_text(
+ self.tostring(elem, method='html', encoding=six.text_type))
if elem.tag == 'link':
# This makes testcases happy
return elem_as_string.replace('>', ' />')