summaryrefslogtreecommitdiff
path: root/creole/shared/html_parser.py
blob: 1e969e539141c60a8f810d82db8441d87bc33cab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# coding: utf-8

"""
    Patched version of HTMLParser.HTMLParser with patch from:
        http://bugs.python.org/issue670664
    See also:
        https://github.com/gregmuellegger/django/issues/1
"""

import HTMLParser

class HTMLParser2(HTMLParser.HTMLParser):
    """ Patched version of HTMLParser.HTMLParser """
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.cdata_tag = None

    def set_cdata_mode(self, tag):
        self.interesting = HTMLParser.interesting_cdata
        self.cdata_tag = tag.lower()

    def clear_cdata_mode(self):
        self.interesting = HTMLParser.interesting_normal
        self.cdata_tag = None

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = HTMLParser.tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i + 1:k].lower()

        while k < endpos:
            m = HTMLParser.attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag) # <--------------------------- Changed
        return endpos

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
        match = HTMLParser.endendtag.search(rawdata, i + 1) # >
        if not match:
            return -1
        j = match.end()
        match = HTMLParser.endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_tag is not None: # *** add ***
                self.handle_data(rawdata[i:j]) # *** add ***
                return j # *** add ***
            self.error("bad end tag: %r" % (rawdata[i:j],))
        # --- changed start ---------------------------------------------------
        tag = match.group(1).strip()
        if self.cdata_tag is not None:
            if tag.lower() != self.cdata_tag:
                self.handle_data(rawdata[i:j])
                return j
        # --- changed end -----------------------------------------------------
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j