| author    | orf <tom@tomforb.es> | 2013-07-29 15:00:10 +0100 |
|-----------|----------------------|---------------------------|
| committer | orf <tom@tomforb.es> | 2013-07-29 15:00:10 +0100 |
| commit    | 44e697fa9a8b580326bfaf6ffffeda3220c6c733 | |
| tree      | e944a56afaae31018093117c2610b4bdeb4ec9d2 /src/lxml/html/diff.py | |
| parent    | d441222ff775b4d838673ac58b667936543c4738 | |
| download  | python-lxml-44e697fa9a8b580326bfaf6ffffeda3220c6c733.tar.gz | |
All trailing whitespace is stored and ignored while diffing
Diffstat (limited to 'src/lxml/html/diff.py')
| -rw-r--r-- | src/lxml/html/diff.py | 52 |
1 file changed, 33 insertions, 19 deletions
```diff
diff --git a/src/lxml/html/diff.py b/src/lxml/html/diff.py
index edabffb8..ddd258d7 100644
--- a/src/lxml/html/diff.py
+++ b/src/lxml/html/diff.py
@@ -121,7 +121,7 @@ def compress_merge_back(tokens, tok):
     else:
         text = _unicode(last)
         if last.trailing_whitespace:
-            text += ' '
+            text += last.trailing_whitespace
         text += tok
         merged = token(text,
                        pre_tags=last.pre_tags,
@@ -141,7 +141,7 @@ def markup_serialize_tokens(tokens, markup_func):
         html = token.html()
         html = markup_func(html, token.annotation)
         if token.trailing_whitespace:
-            html += ' '
+            html += token.trailing_whitespace
         yield html
         for post in token.post_tags:
             yield post
@@ -170,6 +170,7 @@ def htmldiff(old_html, new_html):
     """
     old_html_tokens = tokenize(old_html)
     new_html_tokens = tokenize(new_html)
+    print old_html_tokens, new_html_tokens
     result = htmldiff_tokens(old_html_tokens, new_html_tokens)
     result = ''.join(result).strip()
     return fixup_ins_del_tags(result)
@@ -221,7 +222,7 @@ def expand_tokens(tokens, equal=False):
             yield pre
         if not equal or not token.hide_when_equal:
             if token.trailing_whitespace:
-                yield token.html() + ' '
+                yield token.html() + token.trailing_whitespace
             else:
                 yield token.html()
         for post in token.post_tags:
@@ -451,7 +452,7 @@ class token(_unicode):
     # displayed diff if no change has occurred:
     hide_when_equal = False
 
-    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
+    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=None):
         obj = _unicode.__new__(cls, text)
 
         if pre_tags is not None:
@@ -469,7 +470,8 @@ class token(_unicode):
         return obj
 
     def __repr__(self):
-        return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
+        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
+                                          self.post_tags, self.trailing_whitespace)
 
     def html(self):
         return _unicode(self)
@@ -481,7 +483,7 @@ class tag_token(token):
     is only represented in a document by a tag.
     """
     def __new__(cls, tag, data, html_repr, pre_tags=None,
-                post_tags=None, trailing_whitespace=False):
+                post_tags=None, trailing_whitespace=None):
         obj = token.__new__(cls, "%s: %s" % (type, data),
                             pre_tags=pre_tags,
                             post_tags=post_tags,
@@ -492,7 +494,7 @@ class tag_token(token):
         return obj
 
     def __repr__(self):
-        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
+        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
             self.tag,
             self.data,
             self.html_repr,
@@ -569,6 +571,14 @@ def cleanup_html(html):
 
 end_whitespace_re = re.compile(r'[ \t\n\r]$')
 
+def split_trailing_whitespace(word):
+    """
+    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
+    """
+    stripped_length = len(word.rstrip())
+    return word[0:stripped_length], word[stripped_length:]
+
+
 def fixup_chunks(chunks):
     """
     This function takes a list of chunks and produces a list of tokens.
@@ -580,34 +590,37 @@ def fixup_chunks(chunks):
         if isinstance(chunk, tuple):
             if chunk[0] == 'img':
                 src = chunk[1]
-                tag = chunk[2]
-                if tag.endswith(' '):
-                    tag = tag[:-1]
-                    trailing_whitespace = True
+                tag, whitespace = split_trailing_whitespace(chunk[2])
+                if whitespace:
+                    trailing_whitespace = whitespace
                 else:
-                    trailing_whitespace = False
+                    trailing_whitespace = None
                 cur_word = tag_token('img', src, html_repr=tag,
                                      pre_tags=tag_accum,
                                      trailing_whitespace=trailing_whitespace)
                 tag_accum = []
                 result.append(cur_word)
+
             elif chunk[0] == 'href':
                 href = chunk[1]
                 cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
                 tag_accum = []
                 result.append(cur_word)
             continue
+
         if is_word(chunk):
-            if chunk.endswith(' '):
-                chunk = chunk[:-1]
-                trailing_whitespace = True
+            chunk, whitespace = split_trailing_whitespace(chunk)
+            if whitespace:
+                trailing_whitespace = whitespace
             else:
-                trailing_whitespace = False
+                trailing_whitespace = None
             cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
             tag_accum = []
             result.append(cur_word)
+
         elif is_start_tag(chunk):
             tag_accum.append(chunk)
+
         elif is_end_tag(chunk):
             if tag_accum:
                 tag_accum.append(chunk)
@@ -701,14 +714,15 @@ def flatten_el(el, include_hrefs, skip_tag=False):
         for word in end_words:
             yield html_escape(word)
 
+split_words_re = re.compile(r'\S+(?:\s|$)')
+
 def split_words(text):
     """ Splits some text into words. Includes trailing
     whitespace (one space) on each word when appropriate.  """
     if not text or not text.strip():
         return []
-    words = [w + ' ' for w in text.strip().split()]
-    if not end_whitespace_re.search(text):
-        words[-1] = words[-1][:-1]
+
+    words = split_words_re.findall(text)
     return words
 
 start_whitespace_re = re.compile(r'^[ \t\n\r]')
```
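To see what the new helpers do in isolation, here is a minimal standalone sketch; the two definitions are copied verbatim from the patch above, and the sample strings are illustrative only:

```python
import re

# Copied verbatim from the patch: a run of non-whitespace plus at most
# one trailing whitespace character (or end of string).
split_words_re = re.compile(r'\S+(?:\s|$)')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]

# split_words() now keeps the literal whitespace character that followed
# each word instead of normalising it to a single space:
print(split_words_re.findall('Hello,\nworld '))  # ['Hello,\n', 'world ']

# fixup_chunks() then splits that whitespace off again and stores it on
# the token, so trailing_whitespace is a literal string, or None:
print(split_trailing_whitespace('test\n\n'))     # ('test', '\n\n')
print(split_trailing_whitespace('test'))         # ('test', '')
```

Storing the whitespace itself, rather than the old True/False flag, is what lets the serialization paths changed above (compress_merge_back, markup_serialize_tokens, expand_tokens) reproduce the source document's exact spacing in the diff output.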
