From 49be3d0008885019fef8e96d3178810c966daa7f Mon Sep 17 00:00:00 2001 From: JensDiemer Date: Sun, 5 Dec 2021 17:20:31 +0100 Subject: WIP: html2markdown --- Makefile | 3 +- README.creole | 28 +++-- README.rst | 36 +++++-- creole/__init__.py | 17 +++ creole/emitter/html2markdown_emitter.py | 155 ++++++++++++++++++++++++++++ creole/setup_utils.py | 101 +++++++++++++++++- creole/shared/base_emitter.py | 1 - creole/tests/test_cross_compare_markdown.py | 59 +++++++++++ creole/tests/test_project_setup.py | 13 ++- creole/tests/utils/base_unittest.py | 53 +++++++++- pyproject.toml | 2 + 11 files changed, 444 insertions(+), 24 deletions(-) create mode 100644 creole/emitter/html2markdown_emitter.py create mode 100644 creole/tests/test_cross_compare_markdown.py diff --git a/Makefile b/Makefile index 1e23439..65a523c 100644 --- a/Makefile +++ b/Makefile @@ -63,8 +63,9 @@ tox-py39: check-poetry ## Run pytest via tox with *python v3.9* pytest: check-poetry ## Run pytest poetry run pytest -update-rst-readme: ## update README.rst from README.creole +update-readmes: ## update README.rst from README.creole poetry run update_rst_readme + poetry run update_markdown_readme publish: ## Release new version to PyPi poetry run publish diff --git a/README.creole b/README.creole index c40a487..285a723 100644 --- a/README.creole +++ b/README.creole @@ -51,7 +51,7 @@ Convert creole markup to html code: {{{ >>> from creole import creole2html >>> creole2html("This is **creole //markup//**") -u'

This is creole markup

\n' +'

This is creole markup

\n' }}} @@ -59,8 +59,8 @@ u'

This is creole markup

\n' Convert html code back into creole markup: {{{ >>> from creole import html2creole ->>> html2creole(u'

This is creole markup

\n') -u'This is **creole //markup//**' +>>> html2creole('

This is creole markup

\n') +'This is **creole //markup//**' }}} @@ -69,7 +69,7 @@ Convert ReStructuredText into clean html code (needs [[http://pypi.python.org/py {{{ >>> from creole.rest2html.clean_writer import rest2html >>> rest2html(u"A ReSt link to `PyLucid CMS `_ :)") -u'

A ReSt link to PyLucid CMS :)

\\n' +'

A ReSt link to PyLucid CMS :)

\\n' }}} (more information: [[https://github.com/jedie/python-creole/wiki/rest2html|rest2html wiki page]]) @@ -78,8 +78,8 @@ u'

A ReSt link to PyLucid CMS :)

\\n' Convert html code into ReStructuredText markup: {{{ >>> from creole import html2rest ->>> html2rest(u'

This is ReStructuredText markup!

') -u'This is **ReStructuredText** *markup*!' +>>> html2rest('

This is ReStructuredText markup!

') +'This is **ReStructuredText** *markup*!' }}} @@ -87,8 +87,18 @@ u'This is **ReStructuredText** *markup*!' Convert html code into textile markup {{{ >>> from creole import html2textile ->>> html2textile(u'

This is textile markup!

') -u'This is *textile __markup__*!' +>>> html2textile('

This is textile markup!

') +'This is *textile __markup__*!' +}}} + +See also: [[http://github.com/jedie/python-creole/blob/master/demo.py]] + +== html2markdown == +Convert html code into markdown markup +{{{ +>>> from creole import html2markdown +>>> html2markdown('

This is markdown markup!

') +'This is **markdown _markup_**!' }}} See also: [[http://github.com/jedie/python-creole/blob/master/demo.py]] @@ -129,6 +139,7 @@ If you have python-creole installed, you will get these simple CLI scripts: * html2creole * html2rest * html2textile +* html2markdown Here the {{{--help}}} output from {{{html2creole}}}: {{{ @@ -228,6 +239,7 @@ Note: In this case you must install **docutils**! See above. = history = * *dev* - [[https://github.com/jedie/python-creole/compare/v1.4.10...master|compare v1.4.10...master]] +** NEW: html2markdown ** Remove deprecated "parser_kwargs" and "emitter_kwargs" ** TBC * v1.4.10 - 2021-05-11 - [[https://github.com/jedie/python-creole/compare/v1.4.9...v1.4.10|compare v1.4.9...v1.4.10]] diff --git a/README.rst b/README.rst index 23086ef..cf9dd38 100644 --- a/README.rst +++ b/README.rst @@ -81,7 +81,7 @@ Convert creole markup to html code: >>> from creole import creole2html >>> creole2html("This is **creole //markup//**") - u'

This is creole markup

\n' + '

This is creole markup

\n' ----------- html2creole @@ -92,8 +92,8 @@ Convert html code back into creole markup: :: >>> from creole import html2creole - >>> html2creole(u'

This is creole markup

\n') - u'This is **creole //markup//**' + >>> html2creole('

This is creole markup

\n') + 'This is **creole //markup//**' --------- rest2html @@ -105,7 +105,7 @@ Convert ReStructuredText into clean html code (needs `docutils`_): >>> from creole.rest2html.clean_writer import rest2html >>> rest2html(u"A ReSt link to `PyLucid CMS `_ :)") - u'

A ReSt link to PyLucid CMS :)

\\n' + '

A ReSt link to PyLucid CMS :)

\\n' (more information: `rest2html wiki page `_) @@ -118,8 +118,8 @@ Convert html code into ReStructuredText markup: :: >>> from creole import html2rest - >>> html2rest(u'

This is ReStructuredText markup!

') - u'This is **ReStructuredText** *markup*!' + >>> html2rest('

This is ReStructuredText markup!

') + 'This is **ReStructuredText** *markup*!' ------------ html2textile @@ -130,11 +130,25 @@ Convert html code into textile markup :: >>> from creole import html2textile - >>> html2textile(u'

This is textile markup!

') - u'This is *textile __markup__*!' + >>> html2textile('

This is textile markup!

') + 'This is *textile __markup__*!' See also: `http://github.com/jedie/python-creole/blob/master/demo.py `_ +------------- +html2markdown +------------- + +Convert html code into markdown markup + +:: + + >>> from creole import html2markdown + >>> html2markdown('

This is markdown markup!

') + 'This is **markdown _markup_**!' + +See also: `http://github.com/jedie/python-creole/blob/master/demo.py`_ + ===================== Image size additional ===================== @@ -184,6 +198,8 @@ If you have python-creole installed, you will get these simple CLI scripts: * html2textile +* html2markdown + Here the ``--help`` output from ``html2creole``: :: @@ -297,6 +313,8 @@ history * *dev* - `compare v1.4.10...master `_ + * NEW: html2markdown + * Remove deprecated "parser_kwargs" and "emitter_kwargs" * TBC @@ -702,4 +720,4 @@ donation ------------ -``Note: this file is generated from README.creole 2021-12-05 16:58:04 with "python-creole"`` \ No newline at end of file +``Note: this file is generated from README.creole 2021-12-05 17:19:56 with "python-creole"`` \ No newline at end of file diff --git a/creole/__init__.py b/creole/__init__.py index 4c22c25..a620b84 100644 --- a/creole/__init__.py +++ b/creole/__init__.py @@ -17,6 +17,7 @@ import warnings from creole.emitter.creol2html_emitter import HtmlEmitter from creole.emitter.html2creole_emitter import CreoleEmitter +from creole.emitter.html2markdown_emitter import MarkdownEmitter from creole.emitter.html2rest_emitter import ReStructuredTextEmitter from creole.emitter.html2textile_emitter import TextileEmitter from creole.parser.creol2html_parser import CreoleParser @@ -111,6 +112,22 @@ def html2textile(html_string, debug=False, return emitter.emit() +def html2markdown(html_string, debug=False, + unknown_emit=None + ): + """ + convert html code into markdown markup + + >>> html2markdown('

This is markdown markup!

') + 'This is **markdown _markup_**!' + """ + document_tree = parse_html(html_string, debug=debug) + + # create markdown markup from the document tree + emitter = MarkdownEmitter(document_tree, debug=debug, unknown_emit=unknown_emit) + return emitter.emit() + + def html2rest(html_string, debug=False, unknown_emit=None ): diff --git a/creole/emitter/html2markdown_emitter.py b/creole/emitter/html2markdown_emitter.py new file mode 100644 index 0000000..b44b5c1 --- /dev/null +++ b/creole/emitter/html2markdown_emitter.py @@ -0,0 +1,155 @@ +""" + html -> Markdown Emitter + ~~~~~~~~~~~~~~~~~~~~~~ + + + :copyleft: 2021 by python-creole team, see AUTHORS for more details. + :license: GNU GPL v3 or above, see LICENSE for more details. +""" + + +import posixpath + +from creole.shared.base_emitter import BaseEmitter + + +class MarkdownEmitter(BaseEmitter): + """ + Build from a document_tree (html2creole.parser.HtmlParser instance) a + Markdown markup text. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.table_head_prefix = '_. ' + self.table_auto_width = False + + def emit(self): + """Emit the document represented by self.root DOM tree.""" + return self.emit_node(self.root).strip() # FIXME + + # -------------------------------------------------------------------------- + + def indent_text(self, text, spaces): + return '\n'.join(f'{spaces}{line}' for line in text.splitlines()) + + # -------------------------------------------------------------------------- + + def blockdata_pre_emit(self, node): + """ pre block -> with newline at the end """ + text = self.deentity.replace_all(node.content) + text = self.indent_text(text, spaces=' ') + return text + + def inlinedata_pre_emit(self, node): + """ a pre inline block -> no newline at the end """ + raise NotImplementedError + return f'
{self.deentity.replace_all(node.content)}
' + + def blockdata_pass_emit(self, node): + raise NotImplementedError + return f'{node.content}\n\n' + + # -------------------------------------------------------------------------- + + def p_emit(self, node): + return f'{self.emit_children(node)}\n\n' + + def headline_emit(self, node): + prefix = '#' * node.level + return f'{prefix} {self.emit_children(node)}\n\n' + + # -------------------------------------------------------------------------- + + def _typeface(self, node, key): + return key + self.emit_children(node) + key + + def strong_emit(self, node): + return self._typeface(node, key='**') + + def b_emit(self, node): + return self._typeface(node, key='**') + big_emit = strong_emit + + def i_emit(self, node): + return self._typeface(node, key='_') + + def em_emit(self, node): + return self._typeface(node, key='_') + + def sup_emit(self, node): + return self._typeface(node, key='^') + + def sub_emit(self, node): + return self._typeface(node, key='~') + + def del_emit(self, node): + return self._typeface(node, key='-') + + def cite_emit(self, node): + return self._typeface(node, key='??') + + def ins_emit(self, node): + return self._typeface(node, key='+') + + def span_emit(self, node): + return self._typeface(node, key='%') + + def code_emit(self, node): + return self._typeface(node, key='`') + + def tt_emit(self, node): + return self._typeface(node, key='`') + + # -------------------------------------------------------------------------- + + def hr_emit(self, node): + return '----\n\n' + + def a_emit(self, node): + link_text = self.emit_children(node) + url = node.attrs['href'] + return f'[{link_text}]({url})' + + def img_emit(self, node): + src = node.attrs['src'] + + if src.split(':')[0] == 'data': + return '' + + title = node.attrs.get('title', '') + alt = node.attrs.get('alt', '') + if len(alt) > len(title): # Use the longest one + text = alt + else: + text = title + + if text == '': # Use filename as picture text + text = posixpath.basename(src) + + 
return f'![{text}]({src})' + + # -------------------------------------------------------------------------- + + # def li_emit(self, node): + # content = self.emit_children(node).strip("\n") + # prefix = ' ' * (node.level - 1) + # result = f"\n{prefix}{content}\n" + # return result + + def _list_emit(self, node, list_type): + content=self.emit_children(node) + last_kind = self.last.kind if self.last else None + print(11111, repr(content), last_kind) + content=content.strip() + content = f"{list_type} {content}" + content = self.indent_text(content, spaces=' '*node.level) + content = f"{content}\n" + return content + + def ul_emit(self, node): # Bullet list + return self._list_emit(node, list_type='*') + + def ol_emit(self, node): # Numbered list + return self._list_emit(node, list_type='1.') diff --git a/creole/setup_utils.py b/creole/setup_utils.py index b2e4dbb..1e6058e 100644 --- a/creole/setup_utils.py +++ b/creole/setup_utils.py @@ -21,7 +21,7 @@ from pathlib import Path from readme_renderer.rst import render -from creole import creole2html, html2rest +from creole import creole2html, html2markdown, html2rest from creole.shared.diff_utils import unified_diff from creole.shared.unknown_tags import raise_unknown_node, transparent_unknown_nodes @@ -125,6 +125,30 @@ def _generate_rst_readme(*, creole_readme_path): return rest_readme +def _generate_markdown_readme(*, creole_readme_path): + with creole_readme_path.open('r') as f: + creole_readme = f.read().strip() + + # convert creole into html + html_readme = creole2html(creole_readme) + html_readme_path = creole_readme_path.with_suffix('.html') + html_readme_path.write_text(html_readme, encoding='utf-8') + + # convert html to ReSt + rest_readme = html2markdown( + html_readme, + unknown_emit=raise_unknown_node # raise a error if a unknown node found + ) + + # Check if generated ReSt is valid, see also: + # https://pypi.org/help/#description-content-type + rendered = render(rest_readme, stream=sys.stderr) + if 
rendered is None: + sys.exit(1) + + return rest_readme + + def update_rst_readme(package_root, filename='README.creole'): """ Generate README.rst from README.creole @@ -172,6 +196,53 @@ def update_rst_readme(package_root, filename='README.creole'): return rest_readme_path +def update_markdown_readme(package_root, filename='README.creole'): + """ + Generate README.md from README.creole + """ + assert isinstance(package_root, Path) + assert package_root.is_dir(), f'Directory not found: {package_root}' + creole_readme_path = Path(package_root, filename) + assert creole_readme_path.is_file(), f'File not found: {creole_readme_path}' + + rest_readme_path = creole_readme_path.with_suffix('.md') + print( + f'Generate {rest_readme_path.name} from {creole_readme_path.name}', + end='...', flush=True + ) + + markdown_readme = _generate_markdown_readme(creole_readme_path=creole_readme_path) + + # Check if content was changed + changed = False + with rest_readme_path.open('r') as f: + for new_line, old_line in zip(markdown_readme.splitlines(), f): + if new_line.rstrip() != old_line.rstrip(): + changed = True + break + + if not changed: + # The existing README.rst is up-to-date: Don't change the timestamp + print('nothing changed, ok.') + return rest_readme_path + + with rest_readme_path.open('w') as f: + f.write(markdown_readme) + + # Add a note about generation with modification time from source: + + f.write('\n\n------------\n\n') + + modification_time = creole_readme_path.stat().st_mtime + dt = datetime.datetime.fromtimestamp(modification_time) + dt = dt.replace(microsecond=0) + dt = dt.isoformat(sep=' ') + f.write(f'``Note: this file is generated from {filename} {dt} with "python-creole"``') + + print('done.') + return rest_readme_path + + def assert_rst_readme(package_root, filename='README.creole'): """ raise AssertionError if README.rst is not up-to-date. 
@@ -179,8 +250,7 @@ def assert_rst_readme(package_root, filename='README.creole'): creole_readme_path = Path(package_root, filename) rest_readme = _generate_rst_readme(creole_readme_path=creole_readme_path) rest_readme_path = creole_readme_path.with_suffix('.rst') - with rest_readme_path.open('r') as f: - content = f.read() + content = rest_readme_path.read_text(encoding='UTF-8') assert len(content) > 0, f'Empty content in {rest_readme_path}' content = content.rsplit('\n', 4)[0] # remove note about generation with modification time @@ -190,6 +260,23 @@ def assert_rst_readme(package_root, filename='README.creole'): raise AssertionError(f'{rest_readme_path.name} is not up-to-date:\n{diff}') +def assert_markdown_readme(package_root, filename='README.creole'): + """ + raise AssertionError if README.md is not up-to-date. + """ + creole_readme_path = Path(package_root, filename) + markdown_readme = _generate_markdown_readme(creole_readme_path=creole_readme_path) + markdown_readme_path = creole_readme_path.with_suffix('.md') + content = markdown_readme_path.read_text(encoding='UTF-8') + + assert len(content) > 0, f'Empty content in {markdown_readme_path}' + content = content.rsplit('\n', 4)[0] # remove note about generation with modification time + + if markdown_readme != content: + diff = unified_diff(content, markdown_readme, filename=markdown_readme_path.name) + raise AssertionError(f'{markdown_readme_path.name} is not up-to-date:\n{diff}') + + def update_creole_rst_readme(): return update_rst_readme( package_root=Path(__file__).parent.parent, @@ -197,5 +284,13 @@ def update_creole_rst_readme(): ) +def update_creole_markdown_readme(): + return update_markdown_readme( + package_root=Path(__file__).parent.parent, + filename='README.creole' + ) + + if __name__ == '__main__': update_creole_rst_readme() + update_creole_markdown_readme() diff --git a/creole/shared/base_emitter.py b/creole/shared/base_emitter.py index 4f7c5f4..35d7f88 100644 --- 
a/creole/shared/base_emitter.py +++ b/creole/shared/base_emitter.py @@ -39,7 +39,6 @@ class BaseEmitter: def blockdata_pass_emit(self, node): return f"{node.content}\n\n" - return node.content # -------------------------------------------------------------------------- diff --git a/creole/tests/test_cross_compare_markdown.py b/creole/tests/test_cross_compare_markdown.py new file mode 100644 index 0000000..5b26ddd --- /dev/null +++ b/creole/tests/test_cross_compare_markdown.py @@ -0,0 +1,59 @@ +""" + cross compare markdown unittest + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Compare all similarities between: + * markdown2html (used the python markdown module) + * html2markdown + + Note: This only works fine if there is no problematic whitespace handling. + In this case, we must test in test_creole2html.py or test_html2creole.py + + :copyleft: 2021 by python-creole team, see AUTHORS for more details. + :license: GNU GPL v3 or above, see LICENSE for more details. +""" +from inspect import cleandoc + +from creole.tests.utils.base_unittest import BaseCreoleTest + + +class CrossCompareTextileTests(BaseCreoleTest): + def test_typeface_basic(self): + self.cross_compare_markdown( + markdown_string="Text attributes _italic_, **bold**, `monospace`.", + html_string=( + '

Text attributes italic,' + ' bold,' + ' monospace.

' + ) + ) + + def test_lists(self): + self.cross_compare_markdown( + markdown_string=cleandoc(''' + Bullet lists nested within numbered list: + + 1. fruits + * apple + * banana + 1. vegetables + - carrot + - broccoli + '''), + html_string=cleandoc(''' +

Bullet lists nested within numbered list:

+ +
    +
  1. fruits
      +
    • apple
    • +
    • banana
    • +
  2. +
  3. vegetables
      +
    • carrot
    • +
    • broccoli
    • +
  4. +
+ '''), + debug=True + ) + diff --git a/creole/tests/test_project_setup.py b/creole/tests/test_project_setup.py index cb9aa1e..d05ffed 100644 --- a/creole/tests/test_project_setup.py +++ b/creole/tests/test_project_setup.py @@ -8,7 +8,7 @@ from poetry_publish.tests.test_project_setup import test_poetry_check as assert_ from poetry_publish.tests.test_project_setup import test_version as assert_version from creole import __version__ -from creole.setup_utils import update_rst_readme +from creole.setup_utils import update_markdown_readme, update_rst_readme from creole.tests.constants import CREOLE_PACKAGE_ROOT @@ -31,6 +31,17 @@ def test_update_rst_readme(capsys): assert str(rest_readme_path).endswith('/README.rst') +def test_update_md_readme(capsys): + rest_readme_path = update_markdown_readme( + package_root=CREOLE_PACKAGE_ROOT, filename='README.creole' + ) + captured = capsys.readouterr() + assert captured.out == 'Generate README.md from README.creole...nothing changed, ok.\n' + assert captured.err == '' + assert isinstance(rest_readme_path, Path) + assert str(rest_readme_path).endswith('/README.md') + + def test_poetry_check(): """ Test 'poetry check' output. 
diff --git a/creole/tests/utils/base_unittest.py b/creole/tests/utils/base_unittest.py index e97fac5..ae7ce5e 100644 --- a/creole/tests/utils/base_unittest.py +++ b/creole/tests/utils/base_unittest.py @@ -11,9 +11,10 @@ import re +import markdown import textile -from creole import creole2html, html2creole, html2rest, html2textile +from creole import creole2html, html2creole, html2markdown, html2rest, html2textile from creole.rest_tools.clean_writer import rest2html from creole.tests.utils.utils import MarkupTest @@ -215,6 +216,30 @@ class BaseCreoleTest(MarkupTest): return textile_string, html_string + def assert_html2markdown(self, markdown_string, html_string, + strip_lines=False, debug=False, **kwargs): + """ + Check html2markdown + """ + self.assertNotEqual(markdown_string, html_string) + + markdown_string = self._prepare_text(markdown_string) + html_string = self._prepare_text(html_string) + + if strip_lines: + html_string = strip_html_lines(html_string, strip_lines) + + # compare html -> markdown + markdown_string2 = html2markdown(html_string, debug, **kwargs) + if debug: + print("-" * 79) + print(markdown_string2) + print("-" * 79) + + self.assertEqual(markdown_string2, markdown_string, msg="html2markdown") + + return markdown_string, html_string + def cross_compare_textile(self, textile_string, html_string, strip_lines=False, debug=False, **kwargs): """ @@ -241,6 +266,32 @@ class BaseCreoleTest(MarkupTest): self.assertEqual(html_string, html, msg="textile2html") + def cross_compare_markdown(self, markdown_string, html_string, + strip_lines=False, debug=False, **kwargs): + """ + Checks: + * html2markdown + * markdown2html + """ +# assert isinstance(markdown_string, str) +# assert isinstance(html_string, str) + self.assertNotEqual(markdown_string, html_string) + + # compare html -> markdown + markdown_string, html_string = self.assert_html2markdown( + markdown_string, html_string, + strip_lines, debug, **kwargs + ) + + # compare markdown -> html + html = 
markdown.markdown(markdown_string) + html = html.replace("
", "
\n") + html = tabs2spaces(html) + if strip_lines: + html = strip_html_lines(html, strip_lines) + + self.assertEqual(html_string, html, msg="markdown2html") + def assert_html2rest(self, rest_string, html_string, strip_lines=False, debug=False, **kwargs): """ diff --git a/pyproject.toml b/pyproject.toml index 3c3135b..a12eda9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ flake8 = "*" flynt = "*" autopep8 = "*" pyupgrade = "*" +markdown = "*" [tool.poetry.scripts] creole2html = "creole.cmdline:cli_creole2html" @@ -59,6 +60,7 @@ html2creole = "creole.cmdline:cli_html2creole" html2rest = "creole.cmdline:cli_html2rest" html2textile = "creole.cmdline:cli_html2textile" update_rst_readme = "creole.setup_utils:update_creole_rst_readme" +update_markdown_readme = "creole.setup_utils:update_creole_markdown_readme" publish = "creole.publish:publish" [build-system] -- cgit v1.2.1