diff options
author | Waylan Limberg <waylan@gmail.com> | 2011-12-14 21:59:21 -0500 |
---|---|---|
committer | Waylan Limberg <waylan@gmail.com> | 2011-12-14 21:59:21 -0500 |
commit | 790aeb56053940f299d600d8d364fc3fc8151085 (patch) | |
tree | f9c84fdafccf4ada2bdfdfb26cb178ac3b0bf072 | |
parent | 2e3830b9bf3a0829db7a5d04f731b5019d28cc65 (diff) | |
download | python-markdown-slowhtml.tar.gz |
Reverted rawhtmlblock preprocessor to the pre-attr state as it was too slow. (ref: slowhtml)
-rw-r--r-- | markdown/preprocessors.py | 133 |
1 files changed, 19 insertions, 114 deletions
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index b59f057..f4a33b3 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -44,71 +44,20 @@ class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" right_tag_patterns = ["</%s>", "%s>"] - attrs_pattern = r""" - \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value" - | # OR - \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value - | # OR - \s+(?P<attr2>[^>"'/= ]+) # attr - """ - left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern - attrs_re = re.compile(attrs_pattern, re.VERBOSE) - left_tag_re = re.compile(left_tag_pattern, re.VERBOSE) - markdown_in_raw = False def _get_left_tag(self, block): - m = self.left_tag_re.match(block) - if m: - tag = m.group('tag') - raw_attrs = m.group('attrs') - attrs = {} - if raw_attrs: - for ma in self.attrs_re.finditer(raw_attrs): - if ma.group('attr'): - if ma.group('value'): - attrs[ma.group('attr').strip()] = ma.group('value') - else: - attrs[ma.group('attr').strip()] = "" - elif ma.group('attr1'): - if ma.group('value1'): - attrs[ma.group('attr1').strip()] = ma.group('value1') - else: - attrs[ma.group('attr1').strip()] = "" - elif ma.group('attr2'): - attrs[ma.group('attr2').strip()] = "" - return tag, len(m.group(0)), attrs - else: - tag = block[1:].replace(">", " ", 1).split()[0].lower() - return tag, len(tag)+2, {} - - def _recursive_tagfind(self, ltag, rtag, start_index, block): - while 1: - i = block.find(rtag, start_index) - if i == -1: - return -1 - j = block.find(ltag, start_index) - # if no ltag, or rtag found before another ltag, return index - if (j > i or j == -1): - return i + len(rtag) - # another ltag found before rtag, use end of ltag as starting - # point and search again - j = block.find('>', j) - start_index = self._recursive_tagfind(ltag, rtag, j + 1, block) - if start_index == -1: - # HTML potentially malformed- 
ltag has no corresponding - # rtag - return -1 - - def _get_right_tag(self, left_tag, left_index, block): + return block[1:].replace(">", " ", 1).split()[0].lower() + + def _get_right_tag(self, left_tag, block): for p in self.right_tag_patterns: tag = p % left_tag - i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block) + i = block.rfind(tag) if i > 2: - return tag.lstrip("<").rstrip(">"), i - return block.rstrip()[-left_index:-1].lower(), len(block) - + return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) + return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) + def _equal_tags(self, left_tag, right_tag): - if left_tag[0] in ['?', '@', '%']: # handle PHP, etc. + if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. return True if ("/" + left_tag) == right_tag: return True @@ -142,18 +91,14 @@ class HtmlBlockPreprocessor(Preprocessor): block = block[1:] if not in_tag: - if block.startswith("<") and len(block.strip()) > 1: - left_tag, left_index, attrs = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, - left_index, - block) + if block.startswith("<"): + left_tag = self._get_left_tag(block) + right_tag, data_index = self._get_right_tag(left_tag, block) if block[1] == "!": # is a comment block left_tag = "--" - right_tag, data_index = self._get_right_tag(left_tag, - left_index, - block) + right_tag, data_index = self._get_right_tag(left_tag, block) # keep checking conditions below and maybe just append if data_index < len(block) \ @@ -172,21 +117,10 @@ class HtmlBlockPreprocessor(Preprocessor): if block.rstrip().endswith(">") \ and self._equal_tags(left_tag, right_tag): - if self.markdown_in_raw and 'markdown' in attrs.keys(): - start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', - '', block[:left_index]) - end = block[-len(right_tag)-2:] - block = block[left_index:-len(right_tag)-2] - new_blocks.append( - self.markdown.htmlStash.store(start)) - new_blocks.append(block) - 
new_blocks.append( - self.markdown.htmlStash.store(end)) - else: - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) + new_blocks.append( + self.markdown.htmlStash.store(block.strip())) continue - else: + else: #if not block[1] == "!": # if is block level tag and is not complete if util.isBlockLevel(left_tag) or left_tag == "--" \ @@ -204,46 +138,17 @@ class HtmlBlockPreprocessor(Preprocessor): else: items.append(block) - right_tag, data_index = self._get_right_tag(left_tag, - left_index, - block) + right_tag, data_index = self._get_right_tag(left_tag, block) if self._equal_tags(left_tag, right_tag): # if find closing tag in_tag = False - if self.markdown_in_raw and 'markdown' in attrs.keys(): - start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', - '', items[0][:left_index]) - items[0] = items[0][left_index:] - end = items[-1][-len(right_tag)-2:] - items[-1] = items[-1][:-len(right_tag)-2] - new_blocks.append( - self.markdown.htmlStash.store(start)) - new_blocks.extend(items) - new_blocks.append( - self.markdown.htmlStash.store(end)) - else: - new_blocks.append( - self.markdown.htmlStash.store('\n\n'.join(items))) + new_blocks.append( + self.markdown.htmlStash.store('\n\n'.join(items))) items = [] if items: - if self.markdown_in_raw and 'markdown' in attrs.keys(): - start = re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', - '', items[0][:left_index]) - items[0] = items[0][left_index:] - end = items[-1][-len(right_tag)-2:] - items[-1] = items[-1][:-len(right_tag)-2] - new_blocks.append( - self.markdown.htmlStash.store(start)) - new_blocks.extend(items) - if end.strip(): - new_blocks.append( - self.markdown.htmlStash.store(end)) - else: - new_blocks.append( - self.markdown.htmlStash.store('\n\n'.join(items))) - #new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) + new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) new_blocks.append('\n') new_text = "\n\n".join(new_blocks) |