summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Stefan Behnel <stefan_ml@behnel.de> 2014-08-23 09:28:32 +0200
committer Stefan Behnel <stefan_ml@behnel.de> 2014-08-23 09:28:32 +0200
commit 7acfab8124f2f78801d503165ea9305231482243 (patch)
tree 4c30d9b3a0a1043a3f43c26e4e8610a04fb303d9 /src
parent b1eea60da558ab7a2099e5ebe22ad8dada16a0bc (diff)
download python-lxml-7acfab8124f2f78801d503165ea9305231482243.tar.gz
refactor tag processing code in iterlinks()
Diffstat (limited to 'src')
-rw-r--r-- src/lxml/html/__init__.py 44
1 files changed, 22 insertions, 22 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index b5f2e07f..556ab3e1 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -381,32 +381,14 @@ class HtmlMixin(object):
for el in self.iter(etree.Element):
attribs = el.attrib
tag = _nons(el.tag)
- if tag == 'meta':
- http_equiv = attribs.get('http-equiv', '').lower()
- if http_equiv == 'refresh':
- content = attribs.get('content', '')
- i = content.find(';')
- url = content[i+1:] if i > -1 else content
- if 'url=' == url[:4].lower():
- url = url[4:]
- #else:
- # No "url=" means the redirect won't work, but we might
- # as well be permissive and return the entire string.
- if url:
- url, pos = _unquote_match(url, i + 5)
- yield (el, 'content', url, pos)
- if tag != 'object':
- for attrib in link_attrs:
- if attrib in attribs:
- yield (el, attrib, attribs[attrib], 0)
- elif tag == 'object':
+ if tag == 'object':
codebase = None
## <object> tags have attributes that are relative to
## codebase
if 'codebase' in attribs:
codebase = el.get('codebase')
yield (el, 'codebase', codebase, 0)
- for attrib in 'classid', 'data':
+ for attrib in ('classid', 'data'):
if attrib in attribs:
value = el.get(attrib)
if codebase is not None:
@@ -418,7 +400,25 @@ class HtmlMixin(object):
if codebase is not None:
value = urljoin(codebase, value)
yield (el, 'archive', value, match.start())
- if tag == 'param':
+ else:
+ for attrib in link_attrs:
+ if attrib in attribs:
+ yield (el, attrib, attribs[attrib], 0)
+ if tag == 'meta':
+ http_equiv = attribs.get('http-equiv', '').lower()
+ if http_equiv == 'refresh':
+ content = attribs.get('content', '')
+ i = content.find(';')
+ url = content[i+1:] if i >= 0 else content
+ if url[:4].lower() == 'url=':
+ url = url[4:]
+ #else:
+ # No "url=" means the redirect won't work, but we might
+ # as well be permissive and return the entire string.
+ if url:
+ url, pos = _unquote_match(url, i + 5)
+ yield (el, 'content', url, pos)
+ elif tag == 'param':
valuetype = el.get('valuetype') or ''
if valuetype.lower() == 'ref':
## FIXME: while it's fine we *find* this link,
@@ -428,7 +428,7 @@ class HtmlMixin(object):
## doesn't have a valuetype="ref" (which seems to be the norm)
## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
yield (el, 'value', el.get('value'), 0)
- if tag == 'style' and el.text:
+ elif tag == 'style' and el.text:
urls = [
# (start_pos, url)
_unquote_match(match.group(1), match.start(1))[::-1]