Reverted rawhtmlblock preprocessor to the pre-attr state as it was too slow. (branch: slowhtml)

author: Waylan Limberg <waylan@gmail.com> 2011-12-14 21:59:21 -0500
committer: Waylan Limberg <waylan@gmail.com> 2011-12-14 21:59:21 -0500
commit: 790aeb56053940f299d600d8d364fc3fc8151085 (patch)
tree: f9c84fdafccf4ada2bdfdfb26cb178ac3b0bf072
parent: 2e3830b9bf3a0829db7a5d04f731b5019d28cc65 (diff)
download: python-markdown-slowhtml.tar.gz
1 files changed, 19 insertions, 114 deletions
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
index b59f057..f4a33b3 100644
--- a/markdown/preprocessors.py
+++ b/markdown/preprocessors.py
@@ -44,71 +44,20 @@ class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval."""
 
     right_tag_patterns = ["</%s>", "%s>"]
-    attrs_pattern = r"""
-        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
-        |                                                         # OR 
-        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
-        |                                                         # OR
-        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
-        """
-    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
-    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
-    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
-    markdown_in_raw = False
 
     def _get_left_tag(self, block):
-        m = self.left_tag_re.match(block)
-        if m:
-            tag = m.group('tag')
-            raw_attrs = m.group('attrs')
-            attrs = {}
-            if raw_attrs:
-                for ma in self.attrs_re.finditer(raw_attrs):
-                    if ma.group('attr'):
-                        if ma.group('value'):
-                            attrs[ma.group('attr').strip()] = ma.group('value')
-                        else:
-                            attrs[ma.group('attr').strip()] = ""
-                    elif ma.group('attr1'):
-                        if ma.group('value1'):
-                            attrs[ma.group('attr1').strip()] = ma.group('value1')
-                        else:
-                            attrs[ma.group('attr1').strip()] = ""
-                    elif ma.group('attr2'):
-                        attrs[ma.group('attr2').strip()] = ""
-            return tag, len(m.group(0)), attrs
-        else:
-            tag = block[1:].replace(">", " ", 1).split()[0].lower()
-            return tag, len(tag)+2, {}
-
-    def _recursive_tagfind(self, ltag, rtag, start_index, block):
-        while 1:
-            i = block.find(rtag, start_index)
-            if i == -1:
-                return -1
-            j = block.find(ltag, start_index) 
-            # if no ltag, or rtag found before another ltag, return index
-            if (j > i or j == -1):
-                return i + len(rtag)
-            # another ltag found before rtag, use end of ltag as starting
-            # point and search again
-            j = block.find('>', j)
-            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
-            if start_index == -1:
-                # HTML potentially malformed- ltag has no corresponding 
-                # rtag
-                return -1
-
-    def _get_right_tag(self, left_tag, left_index, block):
+        return block[1:].replace(">", " ", 1).split()[0].lower()
+
+    def _get_right_tag(self, left_tag, block):
         for p in self.right_tag_patterns:
             tag = p % left_tag
-            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
+            i = block.rfind(tag)
             if i > 2:
-                return tag.lstrip("<").rstrip(">"), i
-        return block.rstrip()[-left_index:-1].lower(), len(block)
-    
+                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
+        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
+
     def _equal_tags(self, left_tag, right_tag):
-        if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
+        if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
             return True
         if ("/" + left_tag) == right_tag:
             return True
@@ -142,18 +91,14 @@ class HtmlBlockPreprocessor(Preprocessor):
                 block = block[1:]
 
             if not in_tag:
-                if block.startswith("<") and len(block.strip()) > 1:
-                    left_tag, left_index, attrs = self._get_left_tag(block)
-                    right_tag, data_index = self._get_right_tag(left_tag, 
-                                                                left_index,
-                                                                block)
+                if block.startswith("<"):
+                    left_tag = self._get_left_tag(block)
+                    right_tag, data_index = self._get_right_tag(left_tag, block)
 
                     if block[1] == "!":
                         # is a comment block
                         left_tag = "--"
-                        right_tag, data_index = self._get_right_tag(left_tag, 
-                                                                    left_index,
-                                                                    block)
+                        right_tag, data_index = self._get_right_tag(left_tag, block)
                         # keep checking conditions below and maybe just append
                     
                     if data_index < len(block) \
@@ -172,21 +117,10 @@ class HtmlBlockPreprocessor(Preprocessor):
 
                     if block.rstrip().endswith(">") \
                         and self._equal_tags(left_tag, right_tag):
-                        if self.markdown_in_raw and 'markdown' in attrs.keys():
-                            start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
-                                           '', block[:left_index])
-                            end = block[-len(right_tag)-2:]
-                            block = block[left_index:-len(right_tag)-2]
-                            new_blocks.append(
-                                self.markdown.htmlStash.store(start))
-                            new_blocks.append(block)
-                            new_blocks.append(
-                                self.markdown.htmlStash.store(end))
-                        else:
-                            new_blocks.append(
-                                self.markdown.htmlStash.store(block.strip()))
+                        new_blocks.append(
+                            self.markdown.htmlStash.store(block.strip()))
                         continue
-                    else: 
+                    else: #if not block[1] == "!":
                         # if is block level tag and is not complete
 
                         if util.isBlockLevel(left_tag) or left_tag == "--" \
@@ -204,46 +138,17 @@ class HtmlBlockPreprocessor(Preprocessor):
             else:
                 items.append(block)
 
-                right_tag, data_index = self._get_right_tag(left_tag, 
-                                                            left_index, 
-                                                            block)
+                right_tag, data_index = self._get_right_tag(left_tag, block)
 
                 if self._equal_tags(left_tag, right_tag):
                     # if find closing tag
                     in_tag = False
-                    if self.markdown_in_raw and 'markdown' in attrs.keys():
-                        start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
-                                       '', items[0][:left_index])
-                        items[0] = items[0][left_index:]
-                        end = items[-1][-len(right_tag)-2:]
-                        items[-1] = items[-1][:-len(right_tag)-2]
-                        new_blocks.append(
-                            self.markdown.htmlStash.store(start))
-                        new_blocks.extend(items)
-                        new_blocks.append(
-                            self.markdown.htmlStash.store(end))
-                    else:
-                        new_blocks.append(
-                            self.markdown.htmlStash.store('\n\n'.join(items)))
+                    new_blocks.append(
+                        self.markdown.htmlStash.store('\n\n'.join(items)))
                     items = []
 
         if items:
-            if self.markdown_in_raw and 'markdown' in attrs.keys():
-                start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', 
-                               '', items[0][:left_index])
-                items[0] = items[0][left_index:]
-                end = items[-1][-len(right_tag)-2:]
-                items[-1] = items[-1][:-len(right_tag)-2]
-                new_blocks.append(
-                    self.markdown.htmlStash.store(start))
-                new_blocks.extend(items)
-                if end.strip():
-                    new_blocks.append(
-                        self.markdown.htmlStash.store(end))
-            else:
-                new_blocks.append(
-                    self.markdown.htmlStash.store('\n\n'.join(items)))
-            #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
+            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
             new_blocks.append('\n')
 
         new_text = "\n\n".join(new_blocks)
author	Waylan Limberg <waylan@gmail.com>	2011-12-14 21:59:21 -0500
committer	Waylan Limberg <waylan@gmail.com>	2011-12-14 21:59:21 -0500
commit	790aeb56053940f299d600d8d364fc3fc8151085 (patch)
tree	f9c84fdafccf4ada2bdfdfb26cb178ac3b0bf072
parent	2e3830b9bf3a0829db7a5d04f731b5019d28cc65 (diff)
download	python-markdown-slowhtml.tar.gz