"""
html -> Markdown Emitter
~~~~~~~~~~~~~~~~~~~~~~
https://ct.de/y5hr
:copyleft: 2021 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
from creole.parser.html_parser import HtmlParser
from creole.shared.base_emitter import BaseEmitter
from creole.shared.document_tree import DocNode
from creole.shared.markup_table import MarkupTable
class MarkdownEmitter(BaseEmitter):
    """
    Build Markdown markup text from a document tree
    (html2creole.parser.HtmlParser instance).

    Every ``*_emit`` method takes one DocNode and returns the Markdown
    fragment for it as a string. The traversal helpers used here
    (``emit_node``, ``emit_children``, ``_emit_content``, ``_typeface``,
    ``deentity``, ``debug_msg``) are not defined in this class; they are
    expected to come from BaseEmitter.
    """

    def __init__(self, document_tree, strict=False, *args, **kwargs):
        """
        Remember the ``strict`` flag and hand everything else to BaseEmitter.

        :param document_tree: root of the parsed document
        :param strict: stored as ``self.strict``; not read inside this class
        """
        self.strict = strict
        super().__init__(document_tree, *args, **kwargs)

    def emit(self):
        """Emit the document represented by self.root DOM tree."""
        # Surrounding whitespace produced by block-level emitters is removed.
        return self.emit_node(self.root).strip()  # FIXME

    # --------------------------------------------------------------------------

    def table_emit(self, node):
        """
        Render a table node.

        A fresh MarkupTable is stored on ``self._table``; the tr/th/td
        emitters fill it while the children are walked, and the finished
        table text is returned wrapped in newlines.
        """
        self._table = MarkupTable(head_prefix='', debug_msg=self.debug_msg)
        self.emit_children(node)
        content = self._table.get_markdown_table()
        return f'\n{content}\n'

    def tr_emit(self, node):
        """Start a new row in the current table; emits no text itself."""
        self._table.add_tr()
        self.emit_children(node)
        return ''

    def th_emit(self, node):
        """Add a header cell to the current table; emits no text itself."""
        self._table.add_th(self.emit_children(node))
        return ''

    def td_emit(self, node):
        """Add a data cell to the current table; emits no text itself."""
        self._table.add_td(self.emit_children(node))
        return ''

    # --------------------------------------------------------------------------

    def blockdata_pre_emit(self, node: DocNode):
        """pre block -> fenced code with a newline at the end"""
        return f'```{self.deentity.replace_all(node.content)}```\n'

    def inlinedata_pre_emit(self, node: DocNode):
        """
        inline pre block -> fenced code surrounded by newlines

        If the raw content ends with a closing ``</code>`` tag, it is parsed
        again to extract the code text and an optional ``language-*`` class,
        which is placed directly after the opening fence. Any other content
        is emitted as a plain fenced block.
        """
        pre_content = node.content

        # Only content that really wraps a <code> element may be re-parsed.
        # An always-true test here would send plain text down the
        # children[0].children[0] path below.
        if pre_content.endswith('</code>'):
            # TODO: The parser should parse this!
            parser = HtmlParser(debug=True)
            root_node: DocNode = parser.feed(pre_content, preprocess=False)
            code_node: DocNode = root_node.children[0]
            code = self.deentity.replace_all(code_node.children[0].content)

            class_value = code_node.attrs.get('class')
            if class_value and class_value.startswith('language-'):
                # 'language-python' -> 'python'
                language = class_value.partition('-')[2]
                return f'\n```{language}{code}```\n'

            return f'\n```{code}```\n'

        return f'\n```{self.deentity.replace_all(pre_content)}```\n'

    def blockdata_pass_emit(self, node: DocNode):
        """Pass the node content through unchanged, wrapped in newlines."""
        return f'\n{node.content}\n'

    # --------------------------------------------------------------------------

    def p_emit(self, node: DocNode):
        """paragraph -> children wrapped in newlines"""
        return f'\n{self.emit_children(node)}\n'

    def br_emit(self, node: DocNode):
        """line break -> single newline"""
        return '\n'

    def headline_emit(self, node: DocNode):
        """headline -> ``node.level`` times '#', a space and the children"""
        prefix = '#' * node.level
        # NOTE(review): node.parent is compared with kind *strings* here,
        # while li_emit reads node.parent.kind. Unless DocNode compares equal
        # to strings, this condition is always true and the leading newline
        # is always added -- confirm the intent before changing it.
        if node.parent not in ('document', 'headline', 'p'):
            prefix = f'\n{prefix}'
        return f'{prefix} {self.emit_children(node)}\n'

    # --------------------------------------------------------------------------
    # Inline typefaces: each one delegates to self._typeface() with its marker.

    def strong_emit(self, node: DocNode):
        """strong/b/big -> '**' marker"""
        return self._typeface(node, key='**')

    b_emit = strong_emit
    big_emit = strong_emit

    def i_emit(self, node: DocNode):
        """i/em -> '_' marker"""
        return self._typeface(node, key='_')

    em_emit = i_emit

    def tt_emit(self, node: DocNode):
        """tt -> '##' marker"""
        return self._typeface(node, key='##')

    def sup_emit(self, node: DocNode):
        """sup -> '^^' marker"""
        return self._typeface(node, key='^^')

    def sub_emit(self, node: DocNode):
        """sub -> ',,' marker"""
        return self._typeface(node, key=',,')

    def u_emit(self, node: DocNode):
        """u -> '__' marker"""
        return self._typeface(node, key='__')

    def small_emit(self, node: DocNode):
        """small -> '--' marker"""
        return self._typeface(node, key='--')

    def del_emit(self, node: DocNode):
        """del/strike -> '~~' marker"""
        return self._typeface(node, key='~~')

    strike_emit = del_emit

    # --------------------------------------------------------------------------

    def hr_emit(self, node: DocNode):
        """horizontal rule -> '----' on its own line"""
        return '\n----\n'

    def a_emit(self, node: DocNode):
        """
        link -> ``[text](url)`` or ``[text](url "title")``

        Raises KeyError if the node has no ``href`` attribute.
        """
        link_text = self.emit_children(node)
        url = node.attrs['href']
        title = node.attrs.get('title')
        if title:
            return f'[{link_text}]({url} "{title}")'
        else:
            return f'[{link_text}]({url})'

    def img_emit(self, node: DocNode):
        """
        image -> ``![alt](src)`` or ``![alt](src "title")``

        The title is only written when both ``title`` and ``alt`` are
        non-empty. Raises KeyError if the node has no ``src`` attribute.
        """
        src = node.attrs['src']
        title = node.attrs.get('title')
        alt = node.attrs.get('alt', '')
        if title and alt:
            return f'![{alt}]({src} "{title}")'
        return f'![{alt}]({src})'

    # --------------------------------------------------------------------------

    def list_emit(self, node: DocNode):
        """
        ul/ol -> the emitted list items

        Only a level 1 list is wrapped in newlines; deeper lists are
        returned as-is.
        """
        content = self.emit_children(node)
        if node.level == 1:
            return f'\n{content}\n'
        return content

    ul_emit = list_emit
    ol_emit = list_emit

    def li_emit(self, node: DocNode):
        """
        list item -> newline, indentation, bullet and the children

        The bullet is '*' inside a ``ul`` and '1.' inside an ``ol``.
        Raises NotImplementedError for any other parent kind.
        """
        list_level = node.level
        list_node = node.parent
        list_kind = list_node.kind

        if list_kind == 'ul':
            prefix = '*'
        elif list_kind == 'ol':
            prefix = '1.'
        else:
            raise NotImplementedError(f'List type: {list_kind}')

        # NOTE(review): one space per nesting level may be too little for
        # Markdown renderers to treat the item as nested -- confirm against
        # the expected output.
        indent = ' ' * (list_level - 1)

        content = self.emit_children(node)
        return f"\n{indent}{prefix} {content}"

    # --------------------------------------------------------------------------

    def data_emit(self, node: DocNode):
        """plain text -> the node content, except a lone space is dropped"""
        content = node.content
        if content == ' ':
            # FIXME: Because of bug in creole.html_tools.strip_html.strip_html()
            return ''
        return content

    def code_emit(self, node: DocNode):
        """
        inline code -> backtick span

        Double backticks are used when the code itself contains a backtick.
        """
        code_block = self._emit_content(node)
        # NOTE(review): this check disappears under ``python -O``.
        assert '\n' not in code_block
        if '`' in code_block:
            return f'``{code_block}``'
        else:
            return f'`{code_block}`'

    # --------------------------------------------------------------------------

    def div_emit(self, node: DocNode):
        """div -> whatever self._emit_content() returns for the node"""
        return self._emit_content(node)

    def span_emit(self, node: DocNode):
        """span -> whatever self._emit_content() returns for the node"""
        return self._emit_content(node)