summaryrefslogtreecommitdiff
path: root/src/lxml/html/diff.py
diff options
context:
space:
mode:
author: orf <tom@tomforb.es> 2013-07-29 15:00:10 +0100
committer: orf <tom@tomforb.es> 2013-07-29 15:00:10 +0100
commit: 44e697fa9a8b580326bfaf6ffffeda3220c6c733 (patch)
tree: e944a56afaae31018093117c2610b4bdeb4ec9d2 /src/lxml/html/diff.py
parent: d441222ff775b4d838673ac58b667936543c4738 (diff)
download: python-lxml-44e697fa9a8b580326bfaf6ffffeda3220c6c733.tar.gz
All trailing whitespace is stored and ignored while diffing
Diffstat (limited to 'src/lxml/html/diff.py')
-rw-r--r-- src/lxml/html/diff.py | 52
1 file changed, 33 insertions, 19 deletions
diff --git a/src/lxml/html/diff.py b/src/lxml/html/diff.py
index edabffb8..ddd258d7 100644
--- a/src/lxml/html/diff.py
+++ b/src/lxml/html/diff.py
@@ -121,7 +121,7 @@ def compress_merge_back(tokens, tok):
else:
text = _unicode(last)
if last.trailing_whitespace:
- text += ' '
+ text += last.trailing_whitespace
text += tok
merged = token(text,
pre_tags=last.pre_tags,
@@ -141,7 +141,7 @@ def markup_serialize_tokens(tokens, markup_func):
html = token.html()
html = markup_func(html, token.annotation)
if token.trailing_whitespace:
- html += ' '
+ html += token.trailing_whitespace
yield html
for post in token.post_tags:
yield post
@@ -170,6 +170,7 @@ def htmldiff(old_html, new_html):
"""
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
+ print old_html_tokens, new_html_tokens
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()
return fixup_ins_del_tags(result)
@@ -221,7 +222,7 @@ def expand_tokens(tokens, equal=False):
yield pre
if not equal or not token.hide_when_equal:
if token.trailing_whitespace:
- yield token.html() + ' '
+ yield token.html() + token.trailing_whitespace
else:
yield token.html()
for post in token.post_tags:
@@ -451,7 +452,7 @@ class token(_unicode):
# displayed diff if no change has occurred:
hide_when_equal = False
- def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
+ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=None):
obj = _unicode.__new__(cls, text)
if pre_tags is not None:
@@ -469,7 +470,8 @@ class token(_unicode):
return obj
def __repr__(self):
- return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
+ return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
+ self.post_tags, self.trailing_whitespace)
def html(self):
return _unicode(self)
@@ -481,7 +483,7 @@ class tag_token(token):
is only represented in a document by a tag. """
def __new__(cls, tag, data, html_repr, pre_tags=None,
- post_tags=None, trailing_whitespace=False):
+ post_tags=None, trailing_whitespace=None):
obj = token.__new__(cls, "%s: %s" % (type, data),
pre_tags=pre_tags,
post_tags=post_tags,
@@ -492,7 +494,7 @@ class tag_token(token):
return obj
def __repr__(self):
- return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
+ return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
self.tag,
self.data,
self.html_repr,
@@ -569,6 +571,14 @@ def cleanup_html(html):
end_whitespace_re = re.compile(r'[ \t\n\r]$')
+def split_trailing_whitespace(word):
+ """
+ This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
+ """
+ stripped_length = len(word.rstrip())
+ return word[0:stripped_length], word[stripped_length:]
+
+
def fixup_chunks(chunks):
"""
This function takes a list of chunks and produces a list of tokens.
@@ -580,34 +590,37 @@ def fixup_chunks(chunks):
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
- tag = chunk[2]
- if tag.endswith(' '):
- tag = tag[:-1]
- trailing_whitespace = True
+ tag, whitespace = split_trailing_whitespace(chunk[2])
+ if whitespace:
+ trailing_whitespace = whitespace
else:
- trailing_whitespace = False
+ trailing_whitespace = None
cur_word = tag_token('img', src, html_repr=tag,
pre_tags=tag_accum,
trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
+
elif chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
tag_accum = []
result.append(cur_word)
continue
+
if is_word(chunk):
- if chunk.endswith(' '):
- chunk = chunk[:-1]
- trailing_whitespace = True
+ chunk, whitespace = split_trailing_whitespace(chunk)
+ if whitespace:
+ trailing_whitespace = whitespace
else:
- trailing_whitespace = False
+ trailing_whitespace = None
cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
+
elif is_start_tag(chunk):
tag_accum.append(chunk)
+
elif is_end_tag(chunk):
if tag_accum:
tag_accum.append(chunk)
@@ -701,14 +714,15 @@ def flatten_el(el, include_hrefs, skip_tag=False):
for word in end_words:
yield html_escape(word)
+split_words_re = re.compile(r'\S+(?:\s|$)')
+
def split_words(text):
""" Splits some text into words. Includes trailing whitespace (one
space) on each word when appropriate. """
if not text or not text.strip():
return []
- words = [w + ' ' for w in text.strip().split()]
- if not end_whitespace_re.search(text):
- words[-1] = words[-1][:-1]
+
+ words = split_words_re.findall(text)
return words
start_whitespace_re = re.compile(r'^[ \t\n\r]')