diff options
author | JensDiemer <git@jensdiemer.de> | 2020-01-19 10:42:33 +0100 |
---|---|---|
committer | JensDiemer <git@jensdiemer.de> | 2020-01-19 10:42:33 +0100 |
commit | 842c244de2a81b39cde9994e67a000e7bef75f15 (patch) | |
tree | cf042e589075bbfb03424ee40a8993f847badc14 | |
parent | aa38ea16d9e02b2484232ad816865b6a93637cdc (diff) | |
download | creole-842c244de2a81b39cde9994e67a000e7bef75f15.tar.gz |
remove python 2 compatibility code
-rw-r--r-- | creole/__init__.py | 5 | ||||
-rw-r--r-- | creole/emitter/creol2html_emitter.py | 11 | ||||
-rw-r--r-- | creole/html_tools/deentity.py | 27 | ||||
-rw-r--r-- | creole/html_tools/strip_html.py | 2 | ||||
-rw-r--r-- | creole/parser/creol2html_parser.py | 10 | ||||
-rw-r--r-- | creole/parser/html_parser.py | 28 | ||||
-rw-r--r-- | creole/py3compat.py | 43 | ||||
-rw-r--r-- | creole/rest_tools/clean_writer.py | 17 | ||||
-rw-r--r-- | creole/setup_utils.py | 13 | ||||
-rw-r--r-- | creole/shared/HTMLParsercompat.py | 589 | ||||
-rw-r--r-- | creole/shared/base_emitter.py | 12 | ||||
-rw-r--r-- | creole/shared/document_tree.py | 7 | ||||
-rw-r--r-- | creole/shared/html_parser.py | 32 | ||||
-rw-r--r-- | creole/shared/utils.py | 10 | ||||
-rw-r--r-- | creole/tests/test_creole2html.py | 12 | ||||
-rw-r--r-- | creole/tests/test_setup_utils.py | 17 | ||||
-rw-r--r-- | creole/tests/utils/base_unittest.py | 27 |
17 files changed, 55 insertions, 807 deletions
diff --git a/creole/__init__.py b/creole/__init__.py index 3c6a113..e9dc0da 100644 --- a/creole/__init__.py +++ b/creole/__init__.py @@ -28,7 +28,6 @@ from creole.emitter.html2creole_emitter import CreoleEmitter from creole.emitter.html2rest_emitter import ReStructuredTextEmitter from creole.emitter.html2textile_emitter import TextileEmitter from creole.parser.html_parser import HtmlParser -from creole.py3compat import TEXT_TYPE __version__ = "1.4.0" @@ -52,7 +51,7 @@ def creole2html(markup_string, debug=False, Info: parser_kwargs and emitter_kwargs are deprecated """ - assert isinstance(markup_string, TEXT_TYPE), "given markup_string must be unicode!" + assert isinstance(markup_string, str), "given markup_string must be unicode!" parser_kwargs2 = { "block_rules": block_rules, @@ -84,7 +83,7 @@ def creole2html(markup_string, debug=False, def parse_html(html_string, debug=False): """ create the document tree from html code """ - assert isinstance(html_string, TEXT_TYPE), "given html_string must be unicode!" + assert isinstance(html_string, str), "given html_string must be unicode!" h2c = HtmlParser(debug=debug) document_tree = h2c.feed(html_string) diff --git a/creole/emitter/creol2html_emitter.py b/creole/emitter/creol2html_emitter.py index 0fa0def..76f6997 100644 --- a/creole/emitter/creol2html_emitter.py +++ b/creole/emitter/creol2html_emitter.py @@ -1,23 +1,18 @@ -# coding: utf-8 - """ WikiCreole to HTML converter - :copyleft: 2008-2015 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - - import json from xml.sax.saxutils import escape import sys import traceback from creole.parser.creol2html_parser import CreoleParser -from creole.py3compat import TEXT_TYPE from creole.shared.utils import string2dict @@ -70,7 +65,7 @@ class TableOfContent(object): def nested_headlines2html(self, nested_headlines, level=0): """Convert a python nested list like the one representing the toc to an html equivalent.""" indent = "\t"*level - if isinstance(nested_headlines, TEXT_TYPE): + if isinstance(nested_headlines, str): return '%s<li><a href="#%s">%s</a></li>\n' % (indent, nested_headlines, nested_headlines) elif isinstance(nested_headlines, list): html = '%s<ul>\n' % indent @@ -359,7 +354,7 @@ class HtmlEmitter(object): exc_info=sys.exc_info() ) - if not isinstance(result, TEXT_TYPE): + if not isinstance(result, str): msg = "Macro '%s' doesn't return a unicode string!" % macro_name if self.verbose > 1: msg += " - returns: %r, type %r" % (result, type(result)) diff --git a/creole/html_tools/deentity.py b/creole/html_tools/deentity.py index 2f6104a..fd23782 100644 --- a/creole/html_tools/deentity.py +++ b/creole/html_tools/deentity.py @@ -1,24 +1,16 @@ -#!/usr/bin/env python -# coding: utf-8 """ python-creole utils ~~~~~~~~~~~~~~~~~~~ - :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - import re -try: - import htmlentitydefs as entities -except ImportError: - from html import entities # python 3 - -from creole.py3compat import PY3 +from html import entities entities_rules = '|'.join([ @@ -53,18 +45,12 @@ class Deentity(object): def replace_number(self, text): """ unicode number entity """ unicode_no = int(text) - if PY3: - return chr(unicode_no) - else: - return unichr(unicode_no) + return chr(unicode_no) def replace_hex(self, text): """ hex entity """ unicode_no = int(text, 16) - if PY3: - return chr(unicode_no) - else: - return unichr(unicode_no) + return chr(unicode_no) def replace_named(self, text): """ named entity """ @@ -73,10 +59,7 @@ class Deentity(object): return " " else: codepoint = entities.name2codepoint[text] - if PY3: - return chr(codepoint) - else: - return unichr(codepoint) + return chr(codepoint) def replace_all(self, content): """ replace all html entities form the given text. """ diff --git a/creole/html_tools/strip_html.py b/creole/html_tools/strip_html.py index 10534ad..c5e79a2 100644 --- a/creole/html_tools/strip_html.py +++ b/creole/html_tools/strip_html.py @@ -58,8 +58,6 @@ def strip_html(html_code): >>> strip_html('<p>a <img src="/image.jpg" /> image.</p>') '<p>a <img src="/image.jpg" /> image.</p>' - - """ def strip_tag(match): diff --git a/creole/parser/creol2html_parser.py b/creole/parser/creol2html_parser.py index 63f3b14..7c519d8 100644 --- a/creole/parser/creol2html_parser.py +++ b/creole/parser/creol2html_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 - - """ Creole wiki markup parser @@ -18,18 +15,15 @@ unrecognized schemes (like wtf://server/path) triggering italic rendering for the rest of the paragraph. - :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - - import re from pprint import pformat from creole.parser.creol2html_rules import BlockRules, INLINE_FLAGS, INLINE_RULES, \ SpecialRules, InlineRules -from creole.py3compat import TEXT_TYPE from creole.shared.document_tree import DocNode @@ -61,7 +55,7 @@ class CreoleParser(object): def __init__(self, raw, block_rules=None, blog_line_breaks=True, debug=False): - assert isinstance(raw, TEXT_TYPE) + assert isinstance(raw, str) self.raw = raw if block_rules is None: diff --git a/creole/parser/html_parser.py b/creole/parser/html_parser.py index 4380431..6cd1c88 100644 --- a/creole/parser/html_parser.py +++ b/creole/parser/html_parser.py @@ -1,28 +1,21 @@ -#!/usr/bin/env python -# coding: utf-8 """ python-creole ~~~~~~~~~~~~~ - - :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - - import re import warnings +from html.parser import HTMLParser -from creole.parser.html_parser_config import BLOCK_TAGS, IGNORE_TAGS from creole.html_tools.strip_html import strip_html -from creole.py3compat import TEXT_TYPE, BINARY_TYPE -from creole.shared.document_tree import DocNode, DebugList -from creole.shared.html_parser import HTMLParser - +from creole.parser.html_parser_config import BLOCK_TAGS, IGNORE_TAGS +from creole.shared.document_tree import DebugList, DocNode -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ block_re = re.compile(r''' ^<pre> \s* $ @@ -44,7 +37,7 @@ inline_re = re.compile(r''' headline_tag_re = re.compile(r"h(\d)", re.UNICODE) -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ class HtmlParser(HTMLParser): @@ -83,7 +76,7 @@ class HtmlParser(HTMLParser): _inline_placeholder = "inlinedata" def __init__(self, debug=False): - HTMLParser.__init__(self) + super(HtmlParser, self).__init__(convert_charrefs=False) self.debugging = debug if self.debugging: @@ -104,7 +97,7 @@ class HtmlParser(HTMLParser): def _pre_cut(self, data, type, placeholder): if self.debugging: print("append blockdata: %r" % data) - assert isinstance(data, TEXT_TYPE), "blockdata is not unicode" + assert isinstance(data, str), "blockdata is not unicode" self.blockdata.append(data) id = len(self.blockdata) - 1 return '<%s type="%s" id="%s" />' % (placeholder, type, id) @@ -133,7 +126,7 @@ class HtmlParser(HTMLParser): # data = match.group("data") def feed(self, raw_data): - assert isinstance(raw_data, TEXT_TYPE), "feed data must be unicode!" + assert isinstance(raw_data, str), "feed data must be unicode!" data = raw_data.strip() # cut out <pre> and <tt> areas block tag areas @@ -208,8 +201,7 @@ class HtmlParser(HTMLParser): def handle_data(self, data): self.debug_msg("data", "%r" % data) - if isinstance(data, BINARY_TYPE): - data = unicode(data) + assert isinstance(data, str) DocNode("data", self.cur, content=data) def handle_charref(self, name): diff --git a/creole/py3compat.py b/creole/py3compat.py deleted file mode 100644 index 76c55b4..0000000 --- a/creole/py3compat.py +++ /dev/null @@ -1,43 +0,0 @@ -# coding: utf-8 - -""" - Helper to support Python v2 and v3 - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - Some ideas borrowed from six - - See also: - http://python3porting.com - https://bitbucket.org/gutworth/six/src/tip/six.py - http://packages.python.org/six/ -""" - - - -import sys -import doctest -import re - -# True if we are running on Python 3. -PY3 = sys.version_info[0] == 3 - - -if PY3: - TEXT_TYPE = str - BINARY_TYPE = bytes -else: - TEXT_TYPE = unicode - BINARY_TYPE = str - - # Simple remove 'u' from python 2 unicode repr string - # See also: - # http://bugs.python.org/issue3955 - # http://www.python-forum.de/viewtopic.php?f=1&t=27509 (de) - origin_OutputChecker = doctest.OutputChecker - class OutputChecker2(origin_OutputChecker): - def check_output(self, want, got, optionflags): - got = got.replace("u'", "'").replace('u"', '"') - return origin_OutputChecker.check_output(self, want, got, optionflags) - doctest.OutputChecker = OutputChecker2 - - diff --git a/creole/rest_tools/clean_writer.py b/creole/rest_tools/clean_writer.py index 2b6ae66..b6272ab 100644 --- a/creole/rest_tools/clean_writer.py +++ b/creole/rest_tools/clean_writer.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python -# coding: utf-8 - """ A clean reStructuredText html writer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -12,17 +9,14 @@ http://www.arnebrodowski.de/blog/write-your-own-restructuredtext-writer.html https://github.com/alex-morega/docutils-plainhtml/blob/master/plain_html_writer.py - :copyleft: 2011-2013 by python-creole team, see AUTHORS for more details. + :copyleft: 2011-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - -#import warnings import sys from creole.exceptions import DocutilsImportError -from creole.py3compat import TEXT_TYPE, PY3 try: import docutils @@ -95,9 +89,9 @@ class CleanHTMLTranslator(html4css1.HTMLTranslator, object): continue if isinstance(value, list): - value = ' '.join([TEXT_TYPE(x) for x in value]) + value = ' '.join([str(x) for x in value]) - part = '%s="%s"' % (name.lower(), self.attval(TEXT_TYPE(value))) + part = '%s="%s"' % (name.lower(), self.attval(str(value))) parts.append(part) if DEBUG: @@ -205,10 +199,7 @@ def rest2html(content, enable_exit_status=None, **kwargs): ... SystemExit: 13 """ - if not PY3: - content = unicode(content) - - assert isinstance(content, TEXT_TYPE), "rest2html content must be %s, but it's %s" % (TEXT_TYPE, type(content)) + assert isinstance(content, str), "rest2html content must be %s, but it's %s" % (str, type(content)) settings_overrides = { "input_encoding": "unicode", diff --git a/creole/setup_utils.py b/creole/setup_utils.py index 6f6b651..36e30e1 100644 --- a/creole/setup_utils.py +++ b/creole/setup_utils.py @@ -38,12 +38,10 @@ ) --------------------------------------------------------------------------- - :copyleft: 2011-2014 by the python-creole team, see AUTHORS for more details. + :copyleft: 2011-2020 by the python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - - import codecs import os import sys @@ -51,7 +49,6 @@ import warnings from creole import creole2html, html2rest from creole.shared.unknown_tags import raise_unknown_node, transparent_unknown_nodes -from creole.py3compat import PY3 RAISE_ERRORS_ARGS = ( @@ -99,14 +96,10 @@ def get_long_description(package_root, filename="README.creole", raise_errors=No long_description_html = creole2html(long_description_origin) # convert html to ReSt - long_description_rest_unicode = html2rest( + long_description_rest = html2rest( long_description_html, emitter_kwargs={"unknown_emit":unknown_emit} ) - if PY3: - long_description_rest = long_description_rest_unicode - else: - long_description_rest = long_description_rest_unicode.encode("utf-8") except Exception: if raise_errors: raise @@ -120,7 +113,7 @@ def get_long_description(package_root, filename="README.creole", raise_errors=No # Test created ReSt code like PyPi does it. from creole.rest_tools.pypi_rest2html import pypi_rest2html try: - pypi_rest2html(long_description_rest_unicode) + pypi_rest2html(long_description_rest) except SystemExit as e: msg = "Error creole2rest self test failed: rest2html() exist with status code: %s\n" % e.args[0] sys.stderr.write(msg) diff --git a/creole/shared/HTMLParsercompat.py b/creole/shared/HTMLParsercompat.py deleted file mode 100644 index 6f61cc5..0000000 --- a/creole/shared/HTMLParsercompat.py +++ /dev/null @@ -1,589 +0,0 @@ -""" -Patched version of the original from: - http://hg.python.org/cpython/file/tip/Lib/html/parser.py - -compare: - http://hg.python.org/cpython/file/2.7/Lib/HTMLParser.py - http://hg.python.org/cpython/file/3.2/Lib/html/parser.py - -e.g.: - cd /tmp/ - wget http://hg.python.org/cpython/raw-file/2.7/Lib/HTMLParser.py - wget http://hg.python.org/cpython/raw-file/3.2/Lib/html/parser.py - meld HTMLParser.py parser.py - -Make it compatible with Python 2.x and 3.x - -More info see html_parser.py ! -""" - -# ------------------------------------------------------------------- add start - -from creole.py3compat import PY3 -# --------------------------------------------------------------------- add end - -"""A parser for HTML and XHTML.""" - -# This file is based on sgmllib.py, but the API is slightly different. - -# XXX There should be a way to distinguish between PCDATA (parsed -# character data -- the normal case), RCDATA (replaceable character -# data -- only char and entity references and end tags are special) -# and CDATA (character data -- only end tags are special). - - -# --------------------------------------------------------------- changes start -try: - import _markupbase # python 3 -except ImportError: - import markupbase as _markupbase # python 2 -# --------------------------------------------------------------- changes end -import re - -# Regular expressions used for parsing - -interesting_normal = re.compile('[&<]') -incomplete = re.compile('&[a-zA-Z#]') - -entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') -charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') - -starttagopen = re.compile('<[a-zA-Z]') -piclose = re.compile('>') -commentclose = re.compile(r'--\s*>') -tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') -# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state -# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state -tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') -# Note: -# 1) the strict attrfind isn't really strict, but we can't make it -# correctly strict without breaking backward compatibility; -# 2) if you change attrfind remember to update locatestarttagend too; -# 3) if you change attrfind and/or locatestarttagend the parser will -# explode, so don't do it. -attrfind = re.compile( - r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') -attrfind_tolerant = re.compile( - r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') -locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) -locatestarttagend_tolerant = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:[\s/]* # optional whitespace before attribute name - (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name - (?:\s*=+\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |"[^"]*" # LIT-enclosed value - |(?!['"])[^>\s]* # bare value - ) - (?:\s*,)* # possibly followed by a comma - )?(?:\s|/(?!>))* - )* - )? - \s* # trailing whitespace -""", re.VERBOSE) -endendtag = re.compile('>') -# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between -# </ and the tag name, so maybe this should be fixed -endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') - - -class HTMLParseError(Exception): - """Exception raised for all parse errors.""" - - def __init__(self, msg, position=(None, None)): - assert msg - self.msg = msg - self.lineno = position[0] - self.offset = position[1] - - def __str__(self): - result = self.msg - if self.lineno is not None: - result = result + ", at line %d" % self.lineno - if self.offset is not None: - result = result + ", column %d" % (self.offset + 1) - return result - - -class HTMLParser(_markupbase.ParserBase): - """Find tags and other markup and call handler functions. - - Usage: - p = HTMLParser() - p.feed(data) - ... - p.close() - - Start tags are handled by calling self.handle_starttag() or - self.handle_startendtag(); end tags by self.handle_endtag(). The - data between tags is passed from the parser to the derived class - by calling self.handle_data() with the data as argument (the data - may be split up in arbitrary chunks). Entity references are - passed by calling self.handle_entityref() with the entity - reference as the argument. Numeric character references are - passed to self.handle_charref() with the string containing the - reference as the argument. - """ - - CDATA_CONTENT_ELEMENTS = ("script", "style") - - def __init__(self, strict=True): - """Initialize and reset this instance. - - If strict is set to True (the default), errors are raised when invalid - HTML is encountered. If set to False, an attempt is instead made to - continue parsing, making "best guesses" about the intended meaning, in - a fashion similar to what browsers typically do. - """ - self.strict = strict - self.reset() - - def reset(self): - """Reset this instance. Loses all unprocessed data.""" - self.rawdata = '' - self.lasttag = '???' - self.interesting = interesting_normal - self.cdata_elem = None - _markupbase.ParserBase.reset(self) - - def feed(self, data): - r"""Feed data to the parser. - - Call this as often as you want, with as little or as much text - as you want (may include '\n'). - """ - self.rawdata = self.rawdata + data - self.goahead(0) - - def close(self): - """Handle any buffered data.""" - self.goahead(1) - - def error(self, message): - raise HTMLParseError(message, self.getpos()) - - __starttag_text = None - - def get_starttag_text(self): - """Return full source of start tag: '<...>'.""" - return self.__starttag_text - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) - - def clear_cdata_mode(self): - self.interesting = interesting_normal - self.cdata_elem = None - - # Internal -- handle data as far as reasonable. May leave state - # and data to be processed by a subsequent call. If 'end' is - # true, force handling all data as if followed by EOF marker. - def goahead(self, end): - rawdata = self.rawdata - i = 0 - n = len(rawdata) - while i < n: - match = self.interesting.search(rawdata, i) # < or & - if match: - j = match.start() - else: - if self.cdata_elem: - break - j = n - if i < j: self.handle_data(rawdata[i:j]) - i = self.updatepos(i, j) - if i == n: break - startswith = rawdata.startswith - if startswith('<', i): - if starttagopen.match(rawdata, i): # < + letter - k = self.parse_starttag(i) - elif startswith("</", i): - k = self.parse_endtag(i) - elif startswith("<!--", i): - k = self.parse_comment(i) - elif startswith("<?", i): - k = self.parse_pi(i) - elif startswith("<!", i): - if self.strict: - k = self.parse_declaration(i) - else: - k = self.parse_html_declaration(i) - elif (i + 1) < n: - self.handle_data("<") - k = i + 1 - else: - break - if k < 0: - if not end: - break - if self.strict: - self.error("EOF in middle of construct") - k = rawdata.find('>', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 - else: - k += 1 - self.handle_data(rawdata[i:k]) - i = self.updatepos(i, k) - elif startswith("&#", i): - match = charref.match(rawdata, i) - if match: - name = match.group()[2:-1] - self.handle_charref(name) - k = match.end() - if not startswith(';', k-1): - k = k - 1 - i = self.updatepos(i, k) - continue - else: - if ";" in rawdata[i:]: #bail by consuming &# - self.handle_data(rawdata[0:2]) - i = self.updatepos(i, 2) - break - elif startswith('&', i): - match = entityref.match(rawdata, i) - if match: - name = match.group(1) - self.handle_entityref(name) - k = match.end() - if not startswith(';', k-1): - k = k - 1 - i = self.updatepos(i, k) - continue - match = incomplete.match(rawdata, i) - if match: - # match.group() will contain at least 2 chars - if end and match.group() == rawdata[i:]: - if self.strict: - self.error("EOF in middle of entity or char ref") - else: - if k <= i: - k = n - i = self.updatepos(i, i + 1) - # incomplete - break - elif (i + 1) < n: - # not the end of the buffer, and can't be confused - # with some other construct - self.handle_data("&") - i = self.updatepos(i, i + 1) - else: - break - else: - assert 0, "interesting.search() lied" - # end while - if end and i < n and not self.cdata_elem: - self.handle_data(rawdata[i:n]) - i = self.updatepos(i, n) - self.rawdata = rawdata[i:] - - # Internal -- parse html declarations, return length or -1 if not terminated - # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state - # See also parse_declaration in _markupbase - def parse_html_declaration(self, i): - rawdata = self.rawdata - if rawdata[i:i+2] != '<!': - self.error('unexpected call to parse_html_declaration()') - if rawdata[i:i+4] == '<!--': - # this case is actually already handled in goahead() - return self.parse_comment(i) - elif rawdata[i:i+3] == '<![': - return self.parse_marked_section(i) - elif rawdata[i:i+9].lower() == '<!doctype': - # find the closing > - gtpos = rawdata.find('>', i+9) - if gtpos == -1: - return -1 - self.handle_decl(rawdata[i+2:gtpos]) - return gtpos+1 - else: - return self.parse_bogus_comment(i) - - # Internal -- parse bogus comment, return length or -1 if not terminated - # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state - def parse_bogus_comment(self, i, report=1): - rawdata = self.rawdata - if rawdata[i:i+2] not in ('<!', '</'): - self.error('unexpected call to parse_comment()') - pos = rawdata.find('>', i+2) - if pos == -1: - return -1 - if report: - self.handle_comment(rawdata[i+2:pos]) - return pos + 1 - - # Internal -- parse processing instr, return end or -1 if not terminated - def parse_pi(self, i): - rawdata = self.rawdata - assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' - match = piclose.search(rawdata, i+2) # > - if not match: - return -1 - j = match.start() - self.handle_pi(rawdata[i+2: j]) - j = match.end() - return j - - # Internal -- handle starttag, return end or -1 if not terminated - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = match.group(1).lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: <span attr="value" /> - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - # Internal -- check to see if we have a complete starttag; return end - # or -1 if incomplete. - def check_for_whole_start_tag(self, i): - rawdata = self.rawdata - if self.strict: - m = locatestarttagend.match(rawdata, i) - else: - m = locatestarttagend_tolerant.match(rawdata, i) - if m: - j = m.end() - next = rawdata[j:j+1] - if next == ">": - return j + 1 - if next == "/": - if rawdata.startswith("/>", j): - return j + 2 - if rawdata.startswith("/", j): - # buffer boundary - return -1 - # else bogus input - if self.strict: - self.updatepos(i, j + 1) - self.error("malformed empty start tag") - if j > i: - return j - else: - return i + 1 - if next == "": - # end of input - return -1 - if next in ("abcdefghijklmnopqrstuvwxyz=/" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): - # end of input in or before attribute value, or we have the - # '/' from a '/>' ending - return -1 - if self.strict: - self.updatepos(i, j) - self.error("malformed start tag") - if j > i: - return j - else: - return i + 1 - raise AssertionError("we should not get here!") - - # Internal -- parse endtag, return end or -1 if incomplete - def parse_endtag(self, i): - rawdata = self.rawdata - assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" - match = endendtag.search(rawdata, i+1) # > - if not match: - return -1 - gtpos = match.end() - match = endtagfind.match(rawdata, i) # </ + tag + > - if not match: - if self.cdata_elem is not None: - self.handle_data(rawdata[i:gtpos]) - return gtpos - if self.strict: - self.error("bad end tag: %r" % (rawdata[i:gtpos],)) - # find the name: w3.org/TR/html5/tokenization.html#tag-name-state - namematch = tagfind_tolerant.match(rawdata, i+2) - if not namematch: - # w3.org/TR/html5/tokenization.html#end-tag-open-state - if rawdata[i:i+3] == '</>': - return i+3 - else: - return self.parse_bogus_comment(i) - tagname = namematch.group().lower() - # consume and ignore other stuff between the name and the > - # Note: this is not 100% correct, since we might have things like - # </tag attr=">">, but looking for > after tha name should cover - # most of the cases and is much simpler - gtpos = rawdata.find('>', namematch.end()) - self.handle_endtag(tagname) - return gtpos+1 - - elem = match.group(1).lower() # script or style - if self.cdata_elem is not None: - if elem != self.cdata_elem: - self.handle_data(rawdata[i:gtpos]) - return gtpos - - self.handle_endtag(elem.lower()) - self.clear_cdata_mode() - return gtpos - - # Overridable -- finish processing of start+end tag: <tag.../> - def handle_startendtag(self, tag, attrs): - self.handle_starttag(tag, attrs) - self.handle_endtag(tag) - - # Overridable -- handle start tag - def handle_starttag(self, tag, attrs): - pass - - # Overridable -- handle end tag - def handle_endtag(self, tag): - pass - - # Overridable -- handle character reference - def handle_charref(self, name): - pass - - # Overridable -- handle entity reference - def handle_entityref(self, name): - pass - - # Overridable -- handle data - def handle_data(self, data): - pass - - # Overridable -- handle comment - def handle_comment(self, data): - pass - - # Overridable -- handle declaration - def handle_decl(self, decl): - pass - - # Overridable -- handle processing instruction - def handle_pi(self, data): - pass - - def unknown_decl(self, data): - if self.strict: - self.error("unknown declaration: %r" % (data,)) - - # Internal -- helper to remove special character quoting - entitydefs = None - def unescape(self, s): - if '&' not in s: - return s - # -------------------------------------------------------- change start - if PY3: - def replaceEntities(s): - s = s.groups()[0] - try: - if s[0] == "#": - s = s[1:] - if s[0] in ['x','X']: - c = int(s[1:], 16) - else: - c = int(s) - return chr(c) - except ValueError: - return '&#'+ s +';' - else: - # Cannot use name2codepoint directly, because HTMLParser - # supports apos, which is not part of HTML 4 - import html.entities - if HTMLParser.entitydefs is None: - entitydefs = HTMLParser.entitydefs = {'apos':"'"} - for k, v in html.entities.name2codepoint.items(): - entitydefs[k] = chr(v) - try: - return self.entitydefs[s] - except KeyError: - return '&'+s+';' - - return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", - replaceEntities, s, flags=re.ASCII) - else: - def replaceEntities(s): - s = s.groups()[0] - try: - if s[0] == "#": - s = s[1:] - if s[0] in ['x','X']: - c = int(s[1:], 16) - else: - c = int(s) - return unichr(c) - except ValueError: - return '&#'+s+';' - else: - # Cannot use name2codepoint directly, because HTMLParser supports apos, - # which is not part of HTML 4 - import htmlentitydefs - if HTMLParser.entitydefs is None: - entitydefs = HTMLParser.entitydefs = {'apos':"'"} - for k, v in htmlentitydefs.name2codepoint.iteritems(): - entitydefs[k] = unichr(v) - try: - return self.entitydefs[s] - except KeyError: - return '&'+s+';' - - return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) - # -------------------------------------------------------- change end diff --git a/creole/shared/base_emitter.py b/creole/shared/base_emitter.py index de6fd2f..a29c224 100644 --- a/creole/shared/base_emitter.py +++ b/creole/shared/base_emitter.py @@ -1,20 +1,16 @@ -#!/usr/bin/env python -# coding: utf-8 """ Base document tree emitter ~~~~~~~~~~~~~~~~~~~~~~~~~~ - :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - from creole.parser.html_parser_config import BLOCK_TAGS from creole.html_tools.deentity import Deentity -from creole.py3compat import TEXT_TYPE from creole.shared.markup_table import MarkupTable from creole.shared.unknown_tags import transparent_unknown_nodes @@ -196,7 +192,7 @@ class BaseEmitter(object): result = [] for child in node.children: content = self.emit_node(child) - assert isinstance(content, TEXT_TYPE) + assert isinstance(content, str) result.append(content) return result @@ -220,11 +216,11 @@ class BaseEmitter(object): if emit_method: content = emit_method(node) - if not isinstance(content, TEXT_TYPE): + if not isinstance(content, str): unicode_error(method_name, emit_method, node, content) else: content = self._unknown_emit(self, node) - if not isinstance(content, TEXT_TYPE): + if not isinstance(content, str): unicode_error(method_name, self._unknown_emit, node, content) self.last = node diff --git a/creole/shared/document_tree.py b/creole/shared/document_tree.py index 4971953..e5b7bcf 100644 --- a/creole/shared/document_tree.py +++ b/creole/shared/document_tree.py @@ -6,16 +6,13 @@ ~~~~~~~~~~~~~ - :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - - import warnings import inspect -from creole.py3compat import TEXT_TYPE from creole.shared.utils import dict2string @@ -35,7 +32,7 @@ class DocNode: self.attrs = dict(attrs) if content: - assert isinstance(content, TEXT_TYPE), "Given content %r is not unicode, it's type: %s" % ( + assert isinstance(content, str), "Given content %r is not unicode, it's type: %s" % ( content, type(content) ) diff --git a/creole/shared/html_parser.py b/creole/shared/html_parser.py deleted file mode 100644 index 0bdb7c4..0000000 --- a/creole/shared/html_parser.py +++ /dev/null @@ -1,32 +0,0 @@ -# coding: utf-8 - -""" - HTMLParser for Python 2.x and 3.x - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - The HTMLParser has problems with the correct handling of <script>...</script> - and <style>...</style> areas. - - It was fixed with v2.7.3 and 3.2.3, see: - http://www.python.org/download/releases/2.7.3/ - http://www.python.org/download/releases/3.2.3/ - see also: - http://bugs.python.org/issue670664#msg146770 - - :copyleft: 2011-2012 by python-creole team, see AUTHORS for more details. - :license: GNU GPL v3 or above, see LICENSE for more details. -""" - - -try: - import HTMLParser as OriginHTMLParser -except ImportError: - from html import parser as OriginHTMLParser # python 3 - - -if hasattr(OriginHTMLParser, "cdata_elem"): - # Current python version is patched -> use the original - HTMLParser = OriginHTMLParser -else: - # Current python version is not patched -> use own patched version - from creole.shared.HTMLParsercompat import HTMLParser diff --git a/creole/shared/utils.py b/creole/shared/utils.py index f1b981d..cb55299 100644 --- a/creole/shared/utils.py +++ b/creole/shared/utils.py @@ -5,7 +5,7 @@ python creole utilities ~~~~~~~~~~~~~~~~~~~~~~~ - :copyleft: 2011-2014 by python-creole team, see AUTHORS for more details. + :copyleft: 2011-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ @@ -14,7 +14,7 @@ import shlex import json -from creole.py3compat import TEXT_TYPE, PY3 + try: from pygments import lexers @@ -31,7 +31,7 @@ KEYWORD_MAP = { "None": None, } -def string2dict(raw_content, encoding="utf-8"): +def string2dict(raw_content): """ convert a string into a dictionary. e.g.: @@ -43,10 +43,6 @@ def string2dict(raw_content, encoding="utf-8"): See test_creole2html.TestString2Dict() """ - if not PY3 and isinstance(raw_content, TEXT_TYPE): - # shlex.split doesn't work with unicode?!? - raw_content = raw_content.encode(encoding) - parts = shlex.split(raw_content) result = {} diff --git a/creole/tests/test_creole2html.py b/creole/tests/test_creole2html.py index b75435d..b6f9ed0 100644 --- a/creole/tests/test_creole2html.py +++ b/creole/tests/test_creole2html.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python -# coding: utf-8 """ creole2html unittest @@ -12,7 +10,7 @@ Test the creole markup. - :copyleft: 2008-2014 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ @@ -20,12 +18,9 @@ import sys import unittest -import warnings -try: - from StringIO import StringIO -except ImportError: - from io import StringIO # python 3 + +from io import StringIO try: from pygments import highlight @@ -35,7 +30,6 @@ except ImportError: from creole.tests.utils.base_unittest import BaseCreoleTest from creole.tests import test_macros -from creole.py3compat import PY3 from creole import creole2html from creole.shared import example_macros diff --git a/creole/tests/test_setup_utils.py b/creole/tests/test_setup_utils.py index 462351e..30a532e 100644 --- a/creole/tests/test_setup_utils.py +++ b/creole/tests/test_setup_utils.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python -# coding: utf-8 """ unittest for setup_utils @@ -7,12 +5,11 @@ https://code.google.com/p/python-creole/wiki/UseInSetup - :copyleft: 2011-2014 by python-creole team, see AUTHORS for more details. + :copyleft: 2011-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ - import unittest import os import warnings @@ -26,7 +23,6 @@ except ImportError: import creole from creole.setup_utils import get_long_description from creole.tests.utils.base_unittest import BaseCreoleTest -from creole.py3compat import BINARY_TYPE, PY3, TEXT_TYPE import tempfile @@ -124,16 +120,7 @@ class SetupUtilsTests(BaseCreoleTest): def test_readme_encoding(self): long_description = get_long_description(TEST_README_DIR, filename=TEST_README_FILENAME, raise_errors=True) - - if PY3: - self.assertTrue(isinstance(long_description, TEXT_TYPE)) - else: - self.assertTrue(isinstance(long_description, BINARY_TYPE)) + self.assertTrue(isinstance(long_description, str)) txt = "German Umlaute: ä ö ü ß Ä Ö Ü" - if not PY3: - txt = txt.encode("utf-8") self.assertIn(txt, long_description) - -if __name__ == '__main__': - unittest.main() diff --git a/creole/tests/utils/base_unittest.py b/creole/tests/utils/base_unittest.py index e375168..26b67fa 100644 --- a/creole/tests/utils/base_unittest.py +++ b/creole/tests/utils/base_unittest.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ unitest base class @@ -7,7 +5,7 @@ Basic unittest class for all python-creole tests. - :copyleft: 2008-2014 by python-creole team, see AUTHORS for more details. + :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ @@ -17,7 +15,6 @@ import re import warnings from creole.tests.utils.utils import MarkupTest -from creole.py3compat import TEXT_TYPE try: @@ -109,10 +106,10 @@ class BaseCreoleTest(MarkupTest): # prepare whitespace on test strings markup_string = self._prepare_text(raw_creole) - assert isinstance(markup_string, TEXT_TYPE) + assert isinstance(markup_string, str) html_string = self._prepare_text(raw_html) - assert isinstance(html_string, TEXT_TYPE) + assert isinstance(html_string, str) if strip_lines: html_string = strip_html_lines(html_string, strip_lines) self._debug_text("assert_creole2html() html_string reference", html_string) @@ -162,7 +159,7 @@ class BaseCreoleTest(MarkupTest): """ self.assertEqual(parser_kwargs, {}, "parser_kwargs is deprecated!") self.assertEqual(emitter_kwargs, {}, "parser_kwargs is deprecated!") -# assert isinstance(raw_html, TEXT_TYPE) +# assert isinstance(raw_html, str) # creole_string = unicode(creole_string, encoding="utf8") # raw_html = unicode(raw_html, "utf8") @@ -170,12 +167,12 @@ class BaseCreoleTest(MarkupTest): # prepare whitespace on test strings creole = self._prepare_text(raw_creole) - assert isinstance(creole, TEXT_TYPE) + assert isinstance(creole, str) if debug: self._debug_text("assert_creole2html() markup", creole) html = self._prepare_text(raw_html) - assert isinstance(html, TEXT_TYPE) + assert isinstance(html, str) self.assert_html2creole2(creole, html, debug, unknown_emit, strict) @@ -201,8 +198,8 @@ class BaseCreoleTest(MarkupTest): self.assertEqual(html_parser_kwargs, {}, "html_parser_kwargs is deprecated!") self.assertEqual(creole_emitter_kwargs, {}, "creole_emitter_kwargs is deprecated!") - assert isinstance(creole_string, TEXT_TYPE) - assert isinstance(html_string, TEXT_TYPE) + assert isinstance(creole_string, str) + assert isinstance(html_string, str) self.assertNotEqual(creole_string, html_string) self.assert_creole2html( @@ -248,8 +245,8 @@ class BaseCreoleTest(MarkupTest): * html2textile * textile2html """ -# assert isinstance(textile_string, TEXT_TYPE) -# assert isinstance(html_string, TEXT_TYPE) +# assert isinstance(textile_string, str) +# assert isinstance(html_string, str) self.assertNotEqual(textile_string, html_string) # compare html -> textile @@ -327,8 +324,8 @@ class BaseCreoleTest(MarkupTest): def cross_compare_rest(self, rest_string, html_string, \ strip_lines=False, debug=False, parser_kwargs={}, emitter_kwargs={}): -# assert isinstance(textile_string, TEXT_TYPE) -# assert isinstance(html_string, TEXT_TYPE) +# assert isinstance(textile_string, str) +# assert isinstance(html_string, str) self.assertNotEqual(rest_string, html_string) rest_string, html_string = self.assert_html2rest( |