# Copyright (c) 2004 Ian Bicking. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. Neither the name of Ian Bicking nor the names of its contributors may
# be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""The ``lxml.html`` tool set for HTML handling.
"""
import threading
import re
try:
from urlparse import urljoin
except ImportError:
# Python 3
from urllib.parse import urljoin
import copy
from lxml import etree
from lxml.html import defs
from lxml.html._setmixin import SetMixin
try:
from collections import MutableMapping as DictMixin
except ImportError:
# Python < 2.6
from UserDict import DictMixin
# Compatibility shims: each ``try`` probes for a builtin name and, when the
# running interpreter lacks it, binds a module-level substitute so the rest
# of the module can use the name unconditionally.
try:
    set
except NameError:
    # Python 2.3: no builtin ``set``; use the ``sets`` module's Set instead.
    from sets import Set as set
try:
    bytes
except NameError:
    # Python < 2.6: no ``bytes`` name; alias it to ``str``.
    bytes = str
try:
    unicode
except NameError:
    # Python 3: no ``unicode`` name; alias it to ``str``.
    unicode = str
try:
    basestring
except NameError:
    # Python 3: no ``basestring``; a tuple works as the second argument of
    # ``isinstance`` and accepts both text and byte strings.
    basestring = (str, bytes)
def __fix_docstring(s):
    """Strip a string-literal prefix from the start of each line of ``s``.

    On Python 3 a ``u`` that directly precedes a single quote at the
    beginning of a line (after optional whitespace) is removed; on
    Python 2 the same is done for a ``b`` prefix.  Falsy input (``None``
    or an empty string) is returned unchanged.

    NOTE(review): presumably used so doctest output lines match the repr
    of the running interpreter -- confirm at the call sites.
    """
    if not s:
        return s
    import sys
    running_py3 = sys.version_info[0] >= 3
    if running_py3:
        prefix_re = re.compile(r"^(\s*)u'", re.M)
    else:
        prefix_re = re.compile(r"^(\s*)b'", re.M)
    # Keep the captured leading whitespace and the quote, drop the prefix.
    return prefix_re.sub(r"\1'", s)
# Names exported by a star-import of this module.
# NOTE(review): 'open_in_browser' is listed twice below; the duplicate looks
# unintentional -- confirm and drop one entry.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse']
# Namespace URI for XHTML; the XPath expressions below bind it to prefix ``x``
# so that both plain and XHTML-namespaced elements are matched.
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
# ``a`` elements that carry a ``rel`` attribute, at or below the context node.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
# ``option`` elements at or below the context node.
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
# ``form`` elements at or below the context node.
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
# Disabled EXSLT-regex variant of the class lookup, kept for reference:
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose ``class`` attribute contains $class_name as a
# space-separated token: the attribute value is whitespace-normalized and
# padded with spaces before the substring test.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements at or below the context node whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()`` value of the context node.
_collect_string_content = etree.XPath("string()")
# Case-insensitive match for CSS ``url(...)``; group 1 is the argument, which
# may be double-quoted, single-quoted, or an unquoted run of non-``)`` chars.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# Matches ``@import "..."``; group 1 is the text between the double quotes.
_css_import_re = re.compile(r'@import "(.*?)"')
# ``label`` elements anywhere in the document (absolute ``//`` path) whose
# ``for`` attribute equals $id.
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Matches one run of non-space characters.
# NOTE(review): presumably used to split a space-separated ``archive``
# attribute value -- confirm at the call site.
_archive_re = re.compile(r'[^ ]+')
def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s`` and adjust ``pos``.

    If ``s`` starts and ends with the same quote character (``"`` or
    ``'``), return ``(s without its first and last character, pos + 1)``.
    Otherwise return ``(s, pos)`` unchanged.
    """
    opening, closing = s[:1], s[-1:]
    # Tuple membership (not substring search) so an empty ``s`` never matches.
    if opening == closing and opening in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
def _transform_result(typ, result):
    """Convert ``result`` back into the kind of value the caller passed in.

    ``typ`` is the type of the original input.  For a ``bytes`` subclass
    the result is serialized with ``tostring(..., encoding='utf-8')``;
    for a ``unicode`` subclass it is serialized with
    ``tostring(..., encoding=unicode)``; for any other type ``result`` is
    returned untouched.
    """
    # The bytes check must come first: on interpreters where ``bytes`` and
    # ``unicode`` are aliases of the same type, that branch wins.
    if issubclass(typ, bytes):
        target_encoding = 'utf-8'
    elif issubclass(typ, unicode):
        target_encoding = unicode
    else:
        return result
    return tostring(result, encoding=target_encoding)
def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace, if it has one.

    String tags of the form ``{<XHTML namespace>...}name`` are reduced to
    the part after the last ``}``.  Any other value -- a string in a
    different namespace or with none, or a non-string tag -- is returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
class HtmlMixin(object):
def base_url(self):
    """
    The URL recorded in the ``docinfo`` of this element's root tree,
    i.e. the base URL given when the page was parsed.

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    root_tree = self.getroottree()
    return root_tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    Return a list of all the forms: every ``form`` element (plain or
    XHTML-namespaced) found at or below this element.
    """
    matching_forms = _forms_xpath(self)
    return matching_forms
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    The lookup uses an absolute ``//`` path and takes the first match;
    indexing an empty result raises ``IndexError``.
    """
    prefixes = {'x':XHTML_NAMESPACE}
    candidates = self.xpath('//body|//x:body', namespaces=prefixes)
    return candidates[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    Returns the <head> element.  Can be called from a child
    element to get the document's head.

    The lookup uses an absolute ``//`` path and takes the first match;
    indexing an empty result raises ``IndexError``.
    """
    prefixes = {'x':XHTML_NAMESPACE}
    candidates = self.xpath('//head|//x:head', namespaces=prefixes)
    return candidates[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
"""
Get or set any