1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
|
#!/usr/bin/env python
# coding: utf-8
"""
Base document tree emitter
~~~~~~~~~~~~~~~~~~~~~~~~~~
:copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
from creole.html_tools.deentity import Deentity
from creole.parser.html_parser_config import BLOCK_TAGS
from creole.shared.markup_table import MarkupTable
from creole.shared.unknown_tags import transparent_unknown_nodes
class BaseEmitter:
    """
    Build from a document_tree (html2creole.parser.HtmlParser instance) a
    creole markup text.

    Nodes are dispatched by kind: ``emit_node`` looks up a method named
    ``<node.kind>_emit`` on the emitter and falls back to the
    ``unknown_emit`` callable when no such method exists. Every emit method
    must return a ``str``.
    """

    def __init__(self, document_tree, unknown_emit=None, debug=False):
        """
        :param document_tree: root node of the tree to emit (stored as ``root``)
        :param unknown_emit: callable ``(emitter, node) -> str`` used for node
            kinds without a ``<kind>_emit`` method; defaults to
            ``transparent_unknown_nodes``
        :param debug: if true, ``debug_msg`` prints trace output
        """
        self.root = document_tree

        if unknown_emit is None:
            self._unknown_emit = transparent_unknown_nodes
        else:
            self._unknown_emit = unknown_emit

        # The most recently visited node (see emit_children_list/emit_node).
        self.last = None
        self.debugging = debug

        self.deentity = Deentity()  # for replacing html entities
        # Prefix of the list currently being emitted, one entry per nesting
        # level; the empty string means "not inside a list".
        self._inner_list = ""
        self._mask_linebreak = False

    # --------------------------------------------------------------------------

    def blockdata_pass_emit(self, node):
        """Return the node content followed by a blank line."""
        return f"{node.content}\n\n"

    # --------------------------------------------------------------------------

    def data_emit(self, node):
        """Return the node content unchanged."""
        # node.debug()
        return node.content

    def entityref_emit(self, node):
        """
        emit a named html entity

        Unknown entity names are passed through as ``&<name>``.

        :raises UnicodeError: if replacing the entity fails with a
            UnicodeDecodeError (the original error is chained as the cause)
        """
        entity = node.content

        try:
            return self.deentity.replace_named(entity)
        except KeyError:
            if self.debugging:
                print(f"unknown html entity found: {entity!r}")
            return f"&{entity}"  # FIXME
        except UnicodeDecodeError as err:
            raise UnicodeError(f"Error handling entity {entity!r}: {err}") from err

    def charref_emit(self, node):
        """
        emit a not named html entity

        A leading ``x`` marks a hexadecimal reference; anything else is
        handed on as a decimal character number.
        """
        entity = node.content

        if entity.startswith("x"):
            # entity in hex
            return self.deentity.replace_hex(entity[1:])

        # entity as a unicode number
        return self.deentity.replace_number(entity)

    # --------------------------------------------------------------------------

    def p_emit(self, node):
        """Emit the children of a paragraph followed by a blank line."""
        return f"{self.emit_children(node)}\n\n"

    def br_emit(self, node):
        """
        Emit a line break: two backslashes inside a list, otherwise a
        plain newline.
        """
        if self._inner_list != "":
            return "\\\\"
        return "\n"

    # --------------------------------------------------------------------------

    def _typeface(self, node, key):
        """Wrap the emitted children of ``node`` in the markup ``key``."""
        return key + self.emit_children(node) + key

    # --------------------------------------------------------------------------

    def li_emit(self, node):
        """Emit one list item on its own line, prefixed by the list markers."""
        content = self.emit_children(node)
        return f"\n{self._inner_list} {content}"

    def _list_emit(self, node, list_type):
        """
        Emit a (possibly nested) list.

        :param node: the list node whose children are emitted
        :param list_type: marker appended to the list prefix for this level
        :return: for a nested list the raw item markup; for the outermost
            list the markup terminated by a blank line
        """
        # A list that follows non-block content not ending in a newline has
        # to start on a fresh line.
        start_newline = False
        if self.last and self.last.kind not in BLOCK_TAGS:
            if not self.last.content or not self.last.content.endswith("\n"):
                start_newline = True

        outer_prefix = self._inner_list
        self._inner_list = outer_prefix + list_type  # enter this list level
        try:
            content = self.emit_children(node)
        finally:
            # Always leave this list level, even if emitting a child failed,
            # so the emitter is not left in "inside a list" state.
            self._inner_list = outer_prefix

        if self._inner_list != "":
            # Still inside an enclosing list
            return content

        # The outermost list has been closed
        if start_newline:
            return "\n" + content + "\n\n"
        return content.strip() + "\n\n"

    # --------------------------------------------------------------------------

    def table_emit(self, node):
        """
        Emit a table by collecting its rows/cells in a MarkupTable.

        NOTE(review): ``self.table_head_prefix`` and ``self.table_auto_width``
        are not set anywhere in this class - presumably subclasses provide
        them; confirm before calling this on a bare BaseEmitter.
        """
        self._table = MarkupTable(
            head_prefix=self.table_head_prefix, auto_width=self.table_auto_width, debug_msg=self.debug_msg
        )
        # The children add themselves to self._table; their return values
        # are discarded.
        self.emit_children(node)
        content = self._table.get_table_markup()
        return f"{content}\n"

    def tr_emit(self, node):
        """Start a new table row and let the cells add themselves to it."""
        self._table.add_tr()
        self.emit_children(node)
        return ""

    def _escape_linebreaks(self, text):
        """
        Collapse multi-line text into a single line.

        Every line is stripped, empty lines are dropped and the remaining
        lines are joined with two backslashes. Backslashes at either end of
        the result are removed.
        """
        stripped = (line.strip() for line in text.strip().split("\n"))
        content = "\\\\".join(line for line in stripped if line)
        return content.strip("\\")

    def th_emit(self, node):
        """Add the emitted children as a header cell to the current table."""
        content = self._escape_linebreaks(self.emit_children(node))
        self._table.add_th(content)
        return ""

    def td_emit(self, node):
        """Add the emitted children as a data cell to the current table."""
        content = self._escape_linebreaks(self.emit_children(node))
        self._table.add_td(content)
        return ""

    # --------------------------------------------------------------------------

    def _emit_content(self, node):
        """
        Emit the children of ``node`` as a single line; nodes whose kind is
        in BLOCK_TAGS are terminated by a blank line.
        """
        content = self.emit_children(node)
        content = self._escape_linebreaks(content)
        if node.kind in BLOCK_TAGS:
            content = f"{content}\n\n"
        return content

    def div_emit(self, node):
        """Emit a div via ``_emit_content``."""
        return self._emit_content(node)

    def span_emit(self, node):
        """Emit a span via ``_emit_content``."""
        return self._emit_content(node)

    # --------------------------------------------------------------------------

    def document_emit(self, node):
        """Emit the whole document node."""
        self.last = node
        return self.emit_children(node)

    def emit_children(self, node):
        """Emit all the children of a node and return them as one string."""
        return "".join(self.emit_children_list(node))

    def emit_children_list(self, node):
        """Emit all the children of a node and return a list of strings."""
        # The parent counts as the "last" node while its first child is
        # being emitted.
        self.last = node
        result = []
        for child in node.children:
            content = self.emit_node(child)
            # Kept from the original: guards emit_node overrides as well.
            assert isinstance(content, str)
            result.append(content)
        return result

    def emit_node(self, node):
        """
        Emit a single node.

        :raises AssertionError: if the responsible emit callable does not
            return a ``str``
        """
        if node.level:
            self.debug_msg("emit_node", f"{node.kind} (level: {node.level:d}): {node.content!r}")
        else:
            self.debug_msg("emit_node", f"{node.kind}: {node.content!r}")

        method_name = f"{node.kind}_emit"
        emit_method = getattr(self, method_name, None)

        if emit_method:
            method = emit_method
            content = emit_method(node)
        else:
            # No dedicated emit method: delegate to the unknown-node handler
            method = self._unknown_emit
            content = self._unknown_emit(self, node)

        if not isinstance(content, str):
            node.debug()
            raise AssertionError(
                f"Method '{method_name}' ({method}) returns no unicode - returns: {repr(content)} ({type(content)})"
            )

        self.last = node
        return content

    #    def emit(self):
    #        """Emit the document represented by self.root DOM tree."""
    #        result = self.emit_node(self.root)
    ##        return result.strip() # FIXME
    #        return result.rstrip() # FIXME

    # -------------------------------------------------------------------------

    def debug_msg(self, method, txt):
        """Print a trace line (only when debugging is switched on)."""
        if not self.debugging:
            return
        print(f"{method!s:>13}: {txt!s}")
|