summaryrefslogtreecommitdiff
path: root/src/lxml/html/diff.py
diff options
context:
space:
mode:
author: orf <tom@tomforb.es> 2013-07-29 15:00:10 +0100
committer: orf <tom@tomforb.es> 2013-07-29 15:00:10 +0100
commit: 44e697fa9a8b580326bfaf6ffffeda3220c6c733 (patch)
tree: e944a56afaae31018093117c2610b4bdeb4ec9d2 /src/lxml/html/diff.py
parent: d441222ff775b4d838673ac58b667936543c4738 (diff)
download: python-lxml-44e697fa9a8b580326bfaf6ffffeda3220c6c733.tar.gz
All trailing whitespace is stored and ignored while diffing
Diffstat (limited to 'src/lxml/html/diff.py')
-rw-r--r-- src/lxml/html/diff.py | 52
1 file changed, 33 insertions, 19 deletions
diff --git a/src/lxml/html/diff.py b/src/lxml/html/diff.py
index edabffb8..ddd258d7 100644
--- a/src/lxml/html/diff.py
+++ b/src/lxml/html/diff.py
@@ -121,7 +121,7 @@ def compress_merge_back(tokens, tok):
else:
text = _unicode(last)
if last.trailing_whitespace:
- text += ' '
+ text += last.trailing_whitespace
text += tok
merged = token(text,
pre_tags=last.pre_tags,
@@ -141,7 +141,7 @@ def markup_serialize_tokens(tokens, markup_func):
html = token.html()
html = markup_func(html, token.annotation)
if token.trailing_whitespace:
- html += ' '
+ html += token.trailing_whitespace
yield html
for post in token.post_tags:
yield post
@@ -170,6 +170,7 @@ def htmldiff(old_html, new_html):
"""
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
+ print old_html_tokens, new_html_tokens
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()
return fixup_ins_del_tags(result)
@@ -221,7 +222,7 @@ def expand_tokens(tokens, equal=False):
yield pre
if not equal or not token.hide_when_equal:
if token.trailing_whitespace:
- yield token.html() + ' '
+ yield token.html() + token.trailing_whitespace
else:
yield token.html()
for post in token.post_tags:
@@ -451,7 +452,7 @@ class token(_unicode):
# displayed diff if no change has occurred:
hide_when_equal = False
- def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
+ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=None):
obj = _unicode.__new__(cls, text)
if pre_tags is not None:
@@ -469,7 +470,8 @@ class token(_unicode):
return obj
def __repr__(self):
- return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
+ return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
+ self.post_tags, self.trailing_whitespace)
def html(self):
return _unicode(self)
@@ -481,7 +483,7 @@ class tag_token(token):
is only represented in a document by a tag. """
def __new__(cls, tag, data, html_repr, pre_tags=None,
- post_tags=None, trailing_whitespace=False):
+ post_tags=None, trailing_whitespace=None):
obj = token.__new__(cls, "%s: %s" % (type, data),
pre_tags=pre_tags,
post_tags=post_tags,
@@ -492,7 +494,7 @@ class tag_token(token):
return obj
def __repr__(self):
- return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
+ return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
self.tag,
self.data,
self.html_repr,
@@ -569,6 +571,14 @@ def cleanup_html(html):
end_whitespace_re = re.compile(r'[ \t\n\r]$')
+def split_trailing_whitespace(word):
+ """
+ This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
+ """
+ stripped_length = len(word.rstrip())
+ return word[0:stripped_length], word[stripped_length:]
+
+
def fixup_chunks(chunks):
"""
This function takes a list of chunks and produces a list of tokens.
@@ -580,34 +590,37 @@ def fixup_chunks(chunks):
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
- tag = chunk[2]
- if tag.endswith(' '):
- tag = tag[:-1]
- trailing_whitespace = True
+ tag, whitespace = split_trailing_whitespace(chunk[2])
+ if whitespace:
+ trailing_whitespace = whitespace
else:
- trailing_whitespace = False
+ trailing_whitespace = None
cur_word = tag_token('img', src, html_repr=tag,
pre_tags=tag_accum,
trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
+
elif chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
tag_accum = []
result.append(cur_word)
continue
+
if is_word(chunk):
- if chunk.endswith(' '):
- chunk = chunk[:-1]
- trailing_whitespace = True
+ chunk, whitespace = split_trailing_whitespace(chunk)
+ if whitespace:
+ trailing_whitespace = whitespace
else:
- trailing_whitespace = False
+ trailing_whitespace = None
cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
+
elif is_start_tag(chunk):
tag_accum.append(chunk)
+
elif is_end_tag(chunk):
if tag_accum:
tag_accum.append(chunk)
@@ -701,14 +714,15 @@ def flatten_el(el, include_hrefs, skip_tag=False):
for word in end_words:
yield html_escape(word)
+split_words_re = re.compile(r'\S+(?:\s|$)')
+
def split_words(text):
""" Splits some text into words. Includes trailing whitespace (one
space) on each word when appropriate. """
if not text or not text.strip():
return []
- words = [w + ' ' for w in text.strip().split()]
- if not end_whitespace_re.search(text):
- words[-1] = words[-1][:-1]
+
+ words = split_words_re.findall(text)
return words
start_whitespace_re = re.compile(r'^[ \t\n\r]')