summary refs log tree commit diff
path: root/creole
diff options
context:
space:
mode:
author JensDiemer <git@jensdiemer.de> 2021-12-05 17:20:31 +0100
committer JensDiemer <git@jensdiemer.de> 2021-12-05 19:20:59 +0100
commit 49be3d0008885019fef8e96d3178810c966daa7f (patch)
tree d162912cab0e05a4220e3fecd9fabc94dab71986 /creole
parent b2939d23c2da001cf2322a72e730025e52ffcc55 (diff)
download creole-html2markdown.tar.gz
WIP: html2markdown (branch: html2markdown)
Diffstat (limited to 'creole')
-rw-r--r-- creole/__init__.py 17
-rw-r--r-- creole/emitter/html2markdown_emitter.py 155
-rw-r--r-- creole/setup_utils.py 101
-rw-r--r-- creole/shared/base_emitter.py 1
-rw-r--r-- creole/tests/test_cross_compare_markdown.py 59
-rw-r--r-- creole/tests/test_project_setup.py 13
-rw-r--r-- creole/tests/utils/base_unittest.py 53
7 files changed, 393 insertions, 6 deletions
diff --git a/creole/__init__.py b/creole/__init__.py
index 4c22c25..a620b84 100644
--- a/creole/__init__.py
+++ b/creole/__init__.py
@@ -17,6 +17,7 @@ import warnings
from creole.emitter.creol2html_emitter import HtmlEmitter
from creole.emitter.html2creole_emitter import CreoleEmitter
+from creole.emitter.html2markdown_emitter import MarkdownEmitter
from creole.emitter.html2rest_emitter import ReStructuredTextEmitter
from creole.emitter.html2textile_emitter import TextileEmitter
from creole.parser.creol2html_parser import CreoleParser
@@ -111,6 +112,22 @@ def html2textile(html_string, debug=False,
return emitter.emit()
+def html2markdown(html_string, debug=False,
+ unknown_emit=None
+ ):
+ """
+ convert html code into markdown markup
+
+ >>> html2markdown('<p>This is <strong>markdown <i>markup</i></strong>!</p>')
+ 'This is **markdown _markup_**!'
+ """
+ document_tree = parse_html(html_string, debug=debug)
+
+ # create markdown markup from the document tree
+ emitter = MarkdownEmitter(document_tree, debug=debug, unknown_emit=unknown_emit)
+ return emitter.emit()
+
+
def html2rest(html_string, debug=False,
unknown_emit=None
):
diff --git a/creole/emitter/html2markdown_emitter.py b/creole/emitter/html2markdown_emitter.py
new file mode 100644
index 0000000..b44b5c1
--- /dev/null
+++ b/creole/emitter/html2markdown_emitter.py
@@ -0,0 +1,155 @@
+"""
+ html -> Markdown Emitter
+ ~~~~~~~~~~~~~~~~~~~~~~
+
+
+ :copyleft: 2021 by python-creole team, see AUTHORS for more details.
+ :license: GNU GPL v3 or above, see LICENSE for more details.
+"""
+
+
+import posixpath
+
+from creole.shared.base_emitter import BaseEmitter
+
+
+class MarkdownEmitter(BaseEmitter):
+ """
+ Build from a document_tree (html2creole.parser.HtmlParser instance) a
+ Markdown markup text.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.table_head_prefix = '_. '
+ self.table_auto_width = False
+
+ def emit(self):
+ """Emit the document represented by self.root DOM tree."""
+ return self.emit_node(self.root).strip() # FIXME
+
+ # --------------------------------------------------------------------------
+
+ def indent_text(self, text, spaces):
+ return '\n'.join(f'{spaces}{line}' for line in text.splitlines())
+
+ # --------------------------------------------------------------------------
+
+ def blockdata_pre_emit(self, node):
+ """ pre block -> with newline at the end """
+ text = self.deentity.replace_all(node.content)
+ text = self.indent_text(text, spaces=' ')
+ return text
+
+ def inlinedata_pre_emit(self, node):
+ """ a pre inline block -> no newline at the end """
+ raise NotImplementedError
+ return f'<pre>{self.deentity.replace_all(node.content)}</pre>'
+
+ def blockdata_pass_emit(self, node):
+ raise NotImplementedError
+ return f'{node.content}\n\n'
+
+ # --------------------------------------------------------------------------
+
+ def p_emit(self, node):
+ return f'{self.emit_children(node)}\n\n'
+
+ def headline_emit(self, node):
+ prefix = '#' * node.level
+ return f'{prefix} {self.emit_children(node)}\n\n'
+
+ # --------------------------------------------------------------------------
+
+ def _typeface(self, node, key):
+ return key + self.emit_children(node) + key
+
+ def strong_emit(self, node):
+ return self._typeface(node, key='**')
+
+ def b_emit(self, node):
+ return self._typeface(node, key='**')
+ big_emit = strong_emit
+
+ def i_emit(self, node):
+ return self._typeface(node, key='_')
+
+ def em_emit(self, node):
+ return self._typeface(node, key='_')
+
+ def sup_emit(self, node):
+ return self._typeface(node, key='^')
+
+ def sub_emit(self, node):
+ return self._typeface(node, key='~')
+
+ def del_emit(self, node):
+ return self._typeface(node, key='-')
+
+ def cite_emit(self, node):
+ return self._typeface(node, key='??')
+
+ def ins_emit(self, node):
+ return self._typeface(node, key='+')
+
+ def span_emit(self, node):
+ return self._typeface(node, key='%')
+
+ def code_emit(self, node):
+ return self._typeface(node, key='`')
+
+ def tt_emit(self, node):
+ return self._typeface(node, key='`')
+
+ # --------------------------------------------------------------------------
+
+ def hr_emit(self, node):
+ return '----\n\n'
+
+ def a_emit(self, node):
+ link_text = self.emit_children(node)
+ url = node.attrs['href']
+ return f'[{link_text}]({url})'
+
+ def img_emit(self, node):
+ src = node.attrs['src']
+
+ if src.split(':')[0] == 'data':
+ return ''
+
+ title = node.attrs.get('title', '')
+ alt = node.attrs.get('alt', '')
+ if len(alt) > len(title): # Use the longest one
+ text = alt
+ else:
+ text = title
+
+ if text == '': # Use filename as picture text
+ text = posixpath.basename(src)
+
+ return f'![{text}]({src})'
+
+ # --------------------------------------------------------------------------
+
+ # def li_emit(self, node):
+ # content = self.emit_children(node).strip("\n")
+ # prefix = ' ' * (node.level - 1)
+ # result = f"\n{prefix}{content}\n"
+ # return result
+
+ def _list_emit(self, node, list_type):
+ content=self.emit_children(node)
+ last_kind = self.last.kind if self.last else None
+ print(11111, repr(content), last_kind)
+ content=content.strip()
+ content = f"{list_type} {content}"
+ content = self.indent_text(content, spaces=' '*node.level)
+ content = f"{content}\n"
+ return content
+
+ def ul_emit(self, node): # Bullet list
+ return self._list_emit(node, list_type='*')
+
+ def ol_emit(self, node): # Numbered list
+ return self._list_emit(node, list_type='1.')
diff --git a/creole/setup_utils.py b/creole/setup_utils.py
index b2e4dbb..1e6058e 100644
--- a/creole/setup_utils.py
+++ b/creole/setup_utils.py
@@ -21,7 +21,7 @@ from pathlib import Path
from readme_renderer.rst import render
-from creole import creole2html, html2rest
+from creole import creole2html, html2markdown, html2rest
from creole.shared.diff_utils import unified_diff
from creole.shared.unknown_tags import raise_unknown_node, transparent_unknown_nodes
@@ -125,6 +125,30 @@ def _generate_rst_readme(*, creole_readme_path):
return rest_readme
+def _generate_markdown_readme(*, creole_readme_path):
+ with creole_readme_path.open('r') as f:
+ creole_readme = f.read().strip()
+
+ # convert creole into html
+ html_readme = creole2html(creole_readme)
+ html_readme_path = creole_readme_path.with_suffix('.html')
+ html_readme_path.write_text(html_readme, encoding='utf-8')
+
+ # convert html to Markdown
+ rest_readme = html2markdown(
+ html_readme,
+ unknown_emit=raise_unknown_node # raise an error if an unknown node is found
+ )
+
+ # Check if the generated markup is valid (NOTE: Markdown is passed to the ReSt render() here), see also:
+ # https://pypi.org/help/#description-content-type
+ rendered = render(rest_readme, stream=sys.stderr)
+ if rendered is None:
+ sys.exit(1)
+
+ return rest_readme
+
+
def update_rst_readme(package_root, filename='README.creole'):
"""
Generate README.rst from README.creole
@@ -172,6 +196,53 @@ def update_rst_readme(package_root, filename='README.creole'):
return rest_readme_path
+def update_markdown_readme(package_root, filename='README.creole'):
+ """
+ Generate README.md from README.creole
+ """
+ assert isinstance(package_root, Path)
+ assert package_root.is_dir(), f'Directory not found: {package_root}'
+ creole_readme_path = Path(package_root, filename)
+ assert creole_readme_path.is_file(), f'File not found: {creole_readme_path}'
+
+ rest_readme_path = creole_readme_path.with_suffix('.md')
+ print(
+ f'Generate {rest_readme_path.name} from {creole_readme_path.name}',
+ end='...', flush=True
+ )
+
+ markdown_readme = _generate_markdown_readme(creole_readme_path=creole_readme_path)
+
+ # Check if content was changed
+ changed = False
+ with rest_readme_path.open('r') as f:
+ for new_line, old_line in zip(markdown_readme.splitlines(), f):
+ if new_line.rstrip() != old_line.rstrip():
+ changed = True
+ break
+
+ if not changed:
+ # The existing README.md is up-to-date: Don't change the timestamp
+ print('nothing changed, ok.')
+ return rest_readme_path
+
+ with rest_readme_path.open('w') as f:
+ f.write(markdown_readme)
+
+ # Add a note about generation with modification time from source:
+
+ f.write('\n\n------------\n\n')
+
+ modification_time = creole_readme_path.stat().st_mtime
+ dt = datetime.datetime.fromtimestamp(modification_time)
+ dt = dt.replace(microsecond=0)
+ dt = dt.isoformat(sep=' ')
+ f.write(f'``Note: this file is generated from {filename} {dt} with "python-creole"``')
+
+ print('done.')
+ return rest_readme_path
+
+
def assert_rst_readme(package_root, filename='README.creole'):
"""
raise AssertionError if README.rst is not up-to-date.
@@ -179,8 +250,7 @@ def assert_rst_readme(package_root, filename='README.creole'):
creole_readme_path = Path(package_root, filename)
rest_readme = _generate_rst_readme(creole_readme_path=creole_readme_path)
rest_readme_path = creole_readme_path.with_suffix('.rst')
- with rest_readme_path.open('r') as f:
- content = f.read()
+ content = rest_readme_path.read_text(encoding='UTF-8')
assert len(content) > 0, f'Empty content in {rest_readme_path}'
content = content.rsplit('\n', 4)[0] # remove note about generation with modification time
@@ -190,6 +260,23 @@ def assert_rst_readme(package_root, filename='README.creole'):
raise AssertionError(f'{rest_readme_path.name} is not up-to-date:\n{diff}')
+def assert_markdown_readme(package_root, filename='README.creole'):
+ """
+ raise AssertionError if README.md is not up-to-date.
+ """
+ creole_readme_path = Path(package_root, filename)
+ markdown_readme = _generate_markdown_readme(creole_readme_path=creole_readme_path)
+ markdown_readme_path = creole_readme_path.with_suffix('.md')
+ content = markdown_readme_path.read_text(encoding='UTF-8')
+
+ assert len(content) > 0, f'Empty content in {markdown_readme_path}'
+ content = content.rsplit('\n', 4)[0] # remove note about generation with modification time
+
+ if markdown_readme != content:
+ diff = unified_diff(content, markdown_readme, filename=markdown_readme_path.name)
+ raise AssertionError(f'{markdown_readme_path.name} is not up-to-date:\n{diff}')
+
+
def update_creole_rst_readme():
return update_rst_readme(
package_root=Path(__file__).parent.parent,
@@ -197,5 +284,13 @@ def update_creole_rst_readme():
)
+def update_creole_markdown_readme():
+ return update_markdown_readme(
+ package_root=Path(__file__).parent.parent,
+ filename='README.creole'
+ )
+
+
if __name__ == '__main__':
update_creole_rst_readme()
+ update_creole_markdown_readme()
diff --git a/creole/shared/base_emitter.py b/creole/shared/base_emitter.py
index 4f7c5f4..35d7f88 100644
--- a/creole/shared/base_emitter.py
+++ b/creole/shared/base_emitter.py
@@ -39,7 +39,6 @@ class BaseEmitter:
def blockdata_pass_emit(self, node):
return f"{node.content}\n\n"
- return node.content
# --------------------------------------------------------------------------
diff --git a/creole/tests/test_cross_compare_markdown.py b/creole/tests/test_cross_compare_markdown.py
new file mode 100644
index 0000000..5b26ddd
--- /dev/null
+++ b/creole/tests/test_cross_compare_markdown.py
@@ -0,0 +1,59 @@
+"""
+ cross compare markdown unittest
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Compare all similarities between:
+ * markdown2html (uses the Python markdown module)
+ * html2markdown
+
+ Note: This only works fine if there is no problematic whitespace handling.
+ In this case, we must test in test_creole2html.py or test_html2creole.py
+
+ :copyleft: 2021 by python-creole team, see AUTHORS for more details.
+ :license: GNU GPL v3 or above, see LICENSE for more details.
+"""
+from inspect import cleandoc
+
+from creole.tests.utils.base_unittest import BaseCreoleTest
+
+
+class CrossCompareTextileTests(BaseCreoleTest):
+ def test_typeface_basic(self):
+ self.cross_compare_markdown(
+ markdown_string="Text attributes _italic_, **bold**, `monospace`.",
+ html_string=(
+ '<p>Text attributes <em>italic</em>,'
+ ' <strong>bold</strong>,'
+ ' <code>monospace</code>.</p>'
+ )
+ )
+
+ def test_lists(self):
+ self.cross_compare_markdown(
+ markdown_string=cleandoc('''
+ Bullet lists nested within numbered list:
+
+ 1. fruits
+ * apple
+ * banana
+ 1. vegetables
+ - carrot
+ - broccoli
+ '''),
+ html_string=cleandoc('''
+ <p>Bullet lists nested within numbered list:</p>
+
+ <ol>
+ <li>fruits <ul>
+ <li>apple</li>
+ <li>banana</li>
+ </ul></li>
+ <li>vegetables <ul>
+ <li>carrot</li>
+ <li>broccoli</li>
+ </ul></li>
+ </ol>
+ '''),
+ debug=True
+ )
+
diff --git a/creole/tests/test_project_setup.py b/creole/tests/test_project_setup.py
index cb9aa1e..d05ffed 100644
--- a/creole/tests/test_project_setup.py
+++ b/creole/tests/test_project_setup.py
@@ -8,7 +8,7 @@ from poetry_publish.tests.test_project_setup import test_poetry_check as assert_
from poetry_publish.tests.test_project_setup import test_version as assert_version
from creole import __version__
-from creole.setup_utils import update_rst_readme
+from creole.setup_utils import update_markdown_readme, update_rst_readme
from creole.tests.constants import CREOLE_PACKAGE_ROOT
@@ -31,6 +31,17 @@ def test_update_rst_readme(capsys):
assert str(rest_readme_path).endswith('/README.rst')
+def test_update_md_readme(capsys):
+ rest_readme_path = update_markdown_readme(
+ package_root=CREOLE_PACKAGE_ROOT, filename='README.creole'
+ )
+ captured = capsys.readouterr()
+ assert captured.out == 'Generate README.md from README.creole...nothing changed, ok.\n'
+ assert captured.err == ''
+ assert isinstance(rest_readme_path, Path)
+ assert str(rest_readme_path).endswith('/README.md')
+
+
def test_poetry_check():
"""
Test 'poetry check' output.
diff --git a/creole/tests/utils/base_unittest.py b/creole/tests/utils/base_unittest.py
index e97fac5..ae7ce5e 100644
--- a/creole/tests/utils/base_unittest.py
+++ b/creole/tests/utils/base_unittest.py
@@ -11,9 +11,10 @@
import re
+import markdown
import textile
-from creole import creole2html, html2creole, html2rest, html2textile
+from creole import creole2html, html2creole, html2markdown, html2rest, html2textile
from creole.rest_tools.clean_writer import rest2html
from creole.tests.utils.utils import MarkupTest
@@ -215,6 +216,30 @@ class BaseCreoleTest(MarkupTest):
return textile_string, html_string
+ def assert_html2markdown(self, markdown_string, html_string,
+ strip_lines=False, debug=False, **kwargs):
+ """
+ Check html2markdown
+ """
+ self.assertNotEqual(markdown_string, html_string)
+
+ markdown_string = self._prepare_text(markdown_string)
+ html_string = self._prepare_text(html_string)
+
+ if strip_lines:
+ html_string = strip_html_lines(html_string, strip_lines)
+
+ # compare html -> markdown
+ markdown_string2 = html2markdown(html_string, debug, **kwargs)
+ if debug:
+ print("-" * 79)
+ print(markdown_string2)
+ print("-" * 79)
+
+ self.assertEqual(markdown_string2, markdown_string, msg="html2markdown")
+
+ return markdown_string, html_string
+
def cross_compare_textile(self, textile_string, html_string,
strip_lines=False, debug=False, **kwargs):
"""
@@ -241,6 +266,32 @@ class BaseCreoleTest(MarkupTest):
self.assertEqual(html_string, html, msg="textile2html")
+ def cross_compare_markdown(self, markdown_string, html_string,
+ strip_lines=False, debug=False, **kwargs):
+ """
+ Checks:
+ * html2markdown
+ * markdown2html
+ """
+# assert isinstance(markdown_string, str)
+# assert isinstance(html_string, str)
+ self.assertNotEqual(markdown_string, html_string)
+
+ # compare html -> markdown
+ markdown_string, html_string = self.assert_html2markdown(
+ markdown_string, html_string,
+ strip_lines, debug, **kwargs
+ )
+
+ # compare markdown -> html
+ html = markdown.markdown(markdown_string)
+ html = html.replace("<br />", "<br />\n")
+ html = tabs2spaces(html)
+ if strip_lines:
+ html = strip_html_lines(html, strip_lines)
+
+ self.assertEqual(html_string, html, msg="markdown2html")
+
def assert_html2rest(self, rest_string, html_string,
strip_lines=False, debug=False, **kwargs):
"""