author    JensDiemer <git@jensdiemer.de>  2020-01-19 10:42:33 +0100
committer JensDiemer <git@jensdiemer.de>  2020-01-19 10:42:33 +0100
commit    842c244de2a81b39cde9994e67a000e7bef75f15 (patch)
tree      cf042e589075bbfb03424ee40a8993f847badc14
parent    aa38ea16d9e02b2484232ad816865b6a93637cdc (diff)
download  creole-842c244de2a81b39cde9994e67a000e7bef75f15.tar.gz
remove python 2 compatibility code
-rw-r--r--  creole/__init__.py                    |   5
-rw-r--r--  creole/emitter/creol2html_emitter.py  |  11
-rw-r--r--  creole/html_tools/deentity.py         |  27
-rw-r--r--  creole/html_tools/strip_html.py       |   2
-rw-r--r--  creole/parser/creol2html_parser.py    |  10
-rw-r--r--  creole/parser/html_parser.py          |  28
-rw-r--r--  creole/py3compat.py                   |  43
-rw-r--r--  creole/rest_tools/clean_writer.py     |  17
-rw-r--r--  creole/setup_utils.py                 |  13
-rw-r--r--  creole/shared/HTMLParsercompat.py     | 589
-rw-r--r--  creole/shared/base_emitter.py         |  12
-rw-r--r--  creole/shared/document_tree.py        |   7
-rw-r--r--  creole/shared/html_parser.py          |  32
-rw-r--r--  creole/shared/utils.py                |  10
-rw-r--r--  creole/tests/test_creole2html.py      |  12
-rw-r--r--  creole/tests/test_setup_utils.py      |  17
-rw-r--r--  creole/tests/utils/base_unittest.py   |  27
17 files changed, 55 insertions(+), 807 deletions(-)
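The change is mechanical across all modules. The sketch below illustrates the before/after pattern the commit applies; it is illustrative only, not an excerpt from the repository:

# Illustration of the before/after pattern this commit applies everywhere
# (not an excerpt from the repository).
#
# Before: aliases and conditional imports via creole/py3compat.py
#   from creole.py3compat import TEXT_TYPE, BINARY_TYPE, PY3
#   try:
#       import htmlentitydefs as entities   # Python 2
#   except ImportError:
#       from html import entities           # Python 3
#   char = chr(codepoint) if PY3 else unichr(codepoint)
#
# After: Python 3 built-ins and stdlib modules are used directly.
from html import entities

NBSP = chr(entities.name2codepoint["nbsp"])  # named entities via html.entities


def codepoint_to_char(codepoint):
    """unichr() is gone; chr() already returns a (unicode) str on Python 3."""
    return chr(codepoint)


def check_markup(markup_string):
    # TEXT_TYPE was an alias for unicode/str; a plain str check is enough now.
    assert isinstance(markup_string, str), "given markup_string must be unicode!"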
diff --git a/creole/__init__.py b/creole/__init__.py
index 3c6a113..e9dc0da 100644
--- a/creole/__init__.py
+++ b/creole/__init__.py
@@ -28,7 +28,6 @@ from creole.emitter.html2creole_emitter import CreoleEmitter
from creole.emitter.html2rest_emitter import ReStructuredTextEmitter
from creole.emitter.html2textile_emitter import TextileEmitter
from creole.parser.html_parser import HtmlParser
-from creole.py3compat import TEXT_TYPE
__version__ = "1.4.0"
@@ -52,7 +51,7 @@ def creole2html(markup_string, debug=False,
Info: parser_kwargs and emitter_kwargs are deprecated
"""
- assert isinstance(markup_string, TEXT_TYPE), "given markup_string must be unicode!"
+ assert isinstance(markup_string, str), "given markup_string must be unicode!"
parser_kwargs2 = {
"block_rules": block_rules,
@@ -84,7 +83,7 @@ def creole2html(markup_string, debug=False,
def parse_html(html_string, debug=False):
""" create the document tree from html code """
- assert isinstance(html_string, TEXT_TYPE), "given html_string must be unicode!"
+ assert isinstance(html_string, str), "given html_string must be unicode!"
h2c = HtmlParser(debug=debug)
document_tree = h2c.feed(html_string)
diff --git a/creole/emitter/creol2html_emitter.py b/creole/emitter/creol2html_emitter.py
index 0fa0def..76f6997 100644
--- a/creole/emitter/creol2html_emitter.py
+++ b/creole/emitter/creol2html_emitter.py
@@ -1,23 +1,18 @@
-# coding: utf-8
-
"""
WikiCreole to HTML converter
- :copyleft: 2008-2015 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
-
import json
from xml.sax.saxutils import escape
import sys
import traceback
from creole.parser.creol2html_parser import CreoleParser
-from creole.py3compat import TEXT_TYPE
from creole.shared.utils import string2dict
@@ -70,7 +65,7 @@ class TableOfContent(object):
def nested_headlines2html(self, nested_headlines, level=0):
"""Convert a python nested list like the one representing the toc to an html equivalent."""
indent = "\t"*level
- if isinstance(nested_headlines, TEXT_TYPE):
+ if isinstance(nested_headlines, str):
return '%s<li><a href="#%s">%s</a></li>\n' % (indent, nested_headlines, nested_headlines)
elif isinstance(nested_headlines, list):
html = '%s<ul>\n' % indent
@@ -359,7 +354,7 @@ class HtmlEmitter(object):
exc_info=sys.exc_info()
)
- if not isinstance(result, TEXT_TYPE):
+ if not isinstance(result, str):
msg = "Macro '%s' doesn't return a unicode string!" % macro_name
if self.verbose > 1:
msg += " - returns: %r, type %r" % (result, type(result))
diff --git a/creole/html_tools/deentity.py b/creole/html_tools/deentity.py
index 2f6104a..fd23782 100644
--- a/creole/html_tools/deentity.py
+++ b/creole/html_tools/deentity.py
@@ -1,24 +1,16 @@
-#!/usr/bin/env python
-# coding: utf-8
"""
python-creole utils
~~~~~~~~~~~~~~~~~~~
- :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
import re
-try:
- import htmlentitydefs as entities
-except ImportError:
- from html import entities # python 3
-
-from creole.py3compat import PY3
+from html import entities
entities_rules = '|'.join([
@@ -53,18 +45,12 @@ class Deentity(object):
def replace_number(self, text):
""" unicode number entity """
unicode_no = int(text)
- if PY3:
- return chr(unicode_no)
- else:
- return unichr(unicode_no)
+ return chr(unicode_no)
def replace_hex(self, text):
""" hex entity """
unicode_no = int(text, 16)
- if PY3:
- return chr(unicode_no)
- else:
- return unichr(unicode_no)
+ return chr(unicode_no)
def replace_named(self, text):
""" named entity """
@@ -73,10 +59,7 @@ class Deentity(object):
return " "
else:
codepoint = entities.name2codepoint[text]
- if PY3:
- return chr(codepoint)
- else:
- return unichr(codepoint)
+ return chr(codepoint)
def replace_all(self, content):
""" replace all html entities form the given text. """
diff --git a/creole/html_tools/strip_html.py b/creole/html_tools/strip_html.py
index 10534ad..c5e79a2 100644
--- a/creole/html_tools/strip_html.py
+++ b/creole/html_tools/strip_html.py
@@ -58,8 +58,6 @@ def strip_html(html_code):
>>> strip_html('<p>a <img src="/image.jpg" /> image.</p>')
'<p>a <img src="/image.jpg" /> image.</p>'
-
-
"""
def strip_tag(match):
diff --git a/creole/parser/creol2html_parser.py b/creole/parser/creol2html_parser.py
index 63f3b14..7c519d8 100644
--- a/creole/parser/creol2html_parser.py
+++ b/creole/parser/creol2html_parser.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-
-
"""
Creole wiki markup parser
@@ -18,18 +15,15 @@
unrecognized schemes (like wtf://server/path) triggering italic rendering
for the rest of the paragraph.
- :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
-
import re
from pprint import pformat
from creole.parser.creol2html_rules import BlockRules, INLINE_FLAGS, INLINE_RULES, \
SpecialRules, InlineRules
-from creole.py3compat import TEXT_TYPE
from creole.shared.document_tree import DocNode
@@ -61,7 +55,7 @@ class CreoleParser(object):
def __init__(self, raw, block_rules=None, blog_line_breaks=True, debug=False):
- assert isinstance(raw, TEXT_TYPE)
+ assert isinstance(raw, str)
self.raw = raw
if block_rules is None:
diff --git a/creole/parser/html_parser.py b/creole/parser/html_parser.py
index 4380431..6cd1c88 100644
--- a/creole/parser/html_parser.py
+++ b/creole/parser/html_parser.py
@@ -1,28 +1,21 @@
-#!/usr/bin/env python
-# coding: utf-8
"""
python-creole
~~~~~~~~~~~~~
-
- :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
-
import re
import warnings
+from html.parser import HTMLParser
-from creole.parser.html_parser_config import BLOCK_TAGS, IGNORE_TAGS
from creole.html_tools.strip_html import strip_html
-from creole.py3compat import TEXT_TYPE, BINARY_TYPE
-from creole.shared.document_tree import DocNode, DebugList
-from creole.shared.html_parser import HTMLParser
-
+from creole.parser.html_parser_config import BLOCK_TAGS, IGNORE_TAGS
+from creole.shared.document_tree import DebugList, DocNode
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
block_re = re.compile(r'''
^<pre> \s* $
@@ -44,7 +37,7 @@ inline_re = re.compile(r'''
headline_tag_re = re.compile(r"h(\d)", re.UNICODE)
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
class HtmlParser(HTMLParser):
@@ -83,7 +76,7 @@ class HtmlParser(HTMLParser):
_inline_placeholder = "inlinedata"
def __init__(self, debug=False):
- HTMLParser.__init__(self)
+ super(HtmlParser, self).__init__(convert_charrefs=False)
self.debugging = debug
if self.debugging:
@@ -104,7 +97,7 @@ class HtmlParser(HTMLParser):
def _pre_cut(self, data, type, placeholder):
if self.debugging:
print("append blockdata: %r" % data)
- assert isinstance(data, TEXT_TYPE), "blockdata is not unicode"
+ assert isinstance(data, str), "blockdata is not unicode"
self.blockdata.append(data)
id = len(self.blockdata) - 1
return '<%s type="%s" id="%s" />' % (placeholder, type, id)
@@ -133,7 +126,7 @@ class HtmlParser(HTMLParser):
# data = match.group("data")
def feed(self, raw_data):
- assert isinstance(raw_data, TEXT_TYPE), "feed data must be unicode!"
+ assert isinstance(raw_data, str), "feed data must be unicode!"
data = raw_data.strip()
# cut out <pre> and <tt> areas block tag areas
@@ -208,8 +201,7 @@ class HtmlParser(HTMLParser):
def handle_data(self, data):
self.debug_msg("data", "%r" % data)
- if isinstance(data, BINARY_TYPE):
- data = unicode(data)
+ assert isinstance(data, str)
DocNode("data", self.cur, content=data)
def handle_charref(self, name):
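The parser now derives from html.parser.HTMLParser and passes convert_charrefs=False, so entity and character references still arrive via handle_entityref()/handle_charref() instead of being decoded into handle_data(). A minimal sketch of that behaviour (the class name is illustrative, not project code):

from html.parser import HTMLParser


class RefLogger(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=False)
        self.events = []

    def handle_data(self, data):
        self.events.append(("data", data))

    def handle_entityref(self, name):
        self.events.append(("entityref", name))

    def handle_charref(self, name):
        self.events.append(("charref", name))


parser = RefLogger()
parser.feed("a &amp; b &#228;")
parser.close()
# events: ("data", "a "), ("entityref", "amp"), ("data", " b "), ("charref", "228")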
diff --git a/creole/py3compat.py b/creole/py3compat.py
deleted file mode 100644
index 76c55b4..0000000
--- a/creole/py3compat.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding: utf-8
-
-"""
- Helper to support Python v2 and v3
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- Some ideas borrowed from six
-
- See also:
- http://python3porting.com
- https://bitbucket.org/gutworth/six/src/tip/six.py
- http://packages.python.org/six/
-"""
-
-
-
-import sys
-import doctest
-import re
-
-# True if we are running on Python 3.
-PY3 = sys.version_info[0] == 3
-
-
-if PY3:
- TEXT_TYPE = str
- BINARY_TYPE = bytes
-else:
- TEXT_TYPE = unicode
- BINARY_TYPE = str
-
- # Simple remove 'u' from python 2 unicode repr string
- # See also:
- # http://bugs.python.org/issue3955
- # http://www.python-forum.de/viewtopic.php?f=1&t=27509 (de)
- origin_OutputChecker = doctest.OutputChecker
- class OutputChecker2(origin_OutputChecker):
- def check_output(self, want, got, optionflags):
- got = got.replace("u'", "'").replace('u"', '"')
- return origin_OutputChecker.check_output(self, want, got, optionflags)
- doctest.OutputChecker = OutputChecker2
-
-
diff --git a/creole/rest_tools/clean_writer.py b/creole/rest_tools/clean_writer.py
index 2b6ae66..b6272ab 100644
--- a/creole/rest_tools/clean_writer.py
+++ b/creole/rest_tools/clean_writer.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python
-# coding: utf-8
-
"""
A clean reStructuredText html writer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -12,17 +9,14 @@
http://www.arnebrodowski.de/blog/write-your-own-restructuredtext-writer.html
https://github.com/alex-morega/docutils-plainhtml/blob/master/plain_html_writer.py
- :copyleft: 2011-2013 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2011-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
-#import warnings
import sys
from creole.exceptions import DocutilsImportError
-from creole.py3compat import TEXT_TYPE, PY3
try:
import docutils
@@ -95,9 +89,9 @@ class CleanHTMLTranslator(html4css1.HTMLTranslator, object):
continue
if isinstance(value, list):
- value = ' '.join([TEXT_TYPE(x) for x in value])
+ value = ' '.join([str(x) for x in value])
- part = '%s="%s"' % (name.lower(), self.attval(TEXT_TYPE(value)))
+ part = '%s="%s"' % (name.lower(), self.attval(str(value)))
parts.append(part)
if DEBUG:
@@ -205,10 +199,7 @@ def rest2html(content, enable_exit_status=None, **kwargs):
...
SystemExit: 13
"""
- if not PY3:
- content = unicode(content)
-
- assert isinstance(content, TEXT_TYPE), "rest2html content must be %s, but it's %s" % (TEXT_TYPE, type(content))
+ assert isinstance(content, str), "rest2html content must be %s, but it's %s" % (str, type(content))
settings_overrides = {
"input_encoding": "unicode",
diff --git a/creole/setup_utils.py b/creole/setup_utils.py
index 6f6b651..36e30e1 100644
--- a/creole/setup_utils.py
+++ b/creole/setup_utils.py
@@ -38,12 +38,10 @@
)
---------------------------------------------------------------------------
- :copyleft: 2011-2014 by the python-creole team, see AUTHORS for more details.
+ :copyleft: 2011-2020 by the python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
-
import codecs
import os
import sys
@@ -51,7 +49,6 @@ import warnings
from creole import creole2html, html2rest
from creole.shared.unknown_tags import raise_unknown_node, transparent_unknown_nodes
-from creole.py3compat import PY3
RAISE_ERRORS_ARGS = (
@@ -99,14 +96,10 @@ def get_long_description(package_root, filename="README.creole", raise_errors=No
long_description_html = creole2html(long_description_origin)
# convert html to ReSt
- long_description_rest_unicode = html2rest(
+ long_description_rest = html2rest(
long_description_html,
emitter_kwargs={"unknown_emit":unknown_emit}
)
- if PY3:
- long_description_rest = long_description_rest_unicode
- else:
- long_description_rest = long_description_rest_unicode.encode("utf-8")
except Exception:
if raise_errors:
raise
@@ -120,7 +113,7 @@ def get_long_description(package_root, filename="README.creole", raise_errors=No
# Test created ReSt code like PyPi does it.
from creole.rest_tools.pypi_rest2html import pypi_rest2html
try:
- pypi_rest2html(long_description_rest_unicode)
+ pypi_rest2html(long_description_rest)
except SystemExit as e:
msg = "Error creole2rest self test failed: rest2html() exist with status code: %s\n" % e.args[0]
sys.stderr.write(msg)
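For context, a hedged sketch of the conversion chain that get_long_description() drives after this change: html2rest() now always returns str, so the utf-8 encode branch is gone. The function names come from the diff; the README snippet is invented.

from creole import creole2html, html2rest

creole_src = "= Headline =\n\nSome **bold** creole text."
html = creole2html(creole_src)  # creole markup -> HTML
rest = html2rest(html)          # HTML -> reStructuredText, always a str now
print(rest)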
diff --git a/creole/shared/HTMLParsercompat.py b/creole/shared/HTMLParsercompat.py
deleted file mode 100644
index 6f61cc5..0000000
--- a/creole/shared/HTMLParsercompat.py
+++ /dev/null
@@ -1,589 +0,0 @@
-"""
-Patched version of the original from:
- http://hg.python.org/cpython/file/tip/Lib/html/parser.py
-
-compare:
- http://hg.python.org/cpython/file/2.7/Lib/HTMLParser.py
- http://hg.python.org/cpython/file/3.2/Lib/html/parser.py
-
-e.g.:
- cd /tmp/
- wget http://hg.python.org/cpython/raw-file/2.7/Lib/HTMLParser.py
- wget http://hg.python.org/cpython/raw-file/3.2/Lib/html/parser.py
- meld HTMLParser.py parser.py
-
-Make it compatible with Python 2.x and 3.x
-
-More info see html_parser.py !
-"""
-
-# ------------------------------------------------------------------- add start
-
-from creole.py3compat import PY3
-# --------------------------------------------------------------------- add end
-
-"""A parser for HTML and XHTML."""
-
-# This file is based on sgmllib.py, but the API is slightly different.
-
-# XXX There should be a way to distinguish between PCDATA (parsed
-# character data -- the normal case), RCDATA (replaceable character
-# data -- only char and entity references and end tags are special)
-# and CDATA (character data -- only end tags are special).
-
-
-# --------------------------------------------------------------- changes start
-try:
- import _markupbase # python 3
-except ImportError:
- import markupbase as _markupbase # python 2
-# --------------------------------------------------------------- changes end
-import re
-
-# Regular expressions used for parsing
-
-interesting_normal = re.compile('[&<]')
-incomplete = re.compile('&[a-zA-Z#]')
-
-entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
-charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
-
-starttagopen = re.compile('<[a-zA-Z]')
-piclose = re.compile('>')
-commentclose = re.compile(r'--\s*>')
-tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
-# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
-# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
-tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
-# Note:
-# 1) the strict attrfind isn't really strict, but we can't make it
-# correctly strict without breaking backward compatibility;
-# 2) if you change attrfind remember to update locatestarttagend too;
-# 3) if you change attrfind and/or locatestarttagend the parser will
-# explode, so don't do it.
-attrfind = re.compile(
- r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
-attrfind_tolerant = re.compile(
- r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
-locatestarttagend = re.compile(r"""
- <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s+ # whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
- (?:\s*=\s* # value indicator
- (?:'[^']*' # LITA-enclosed value
- |\"[^\"]*\" # LIT-enclosed value
- |[^'\">\s]+ # bare value
- )
- )?
- )
- )*
- \s* # trailing whitespace
-""", re.VERBOSE)
-locatestarttagend_tolerant = re.compile(r"""
- <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:[\s/]* # optional whitespace before attribute name
- (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
- (?:\s*=+\s* # value indicator
- (?:'[^']*' # LITA-enclosed value
- |"[^"]*" # LIT-enclosed value
- |(?!['"])[^>\s]* # bare value
- )
- (?:\s*,)* # possibly followed by a comma
- )?(?:\s|/(?!>))*
- )*
- )?
- \s* # trailing whitespace
-""", re.VERBOSE)
-endendtag = re.compile('>')
-# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
-# </ and the tag name, so maybe this should be fixed
-endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
-
-
-class HTMLParseError(Exception):
- """Exception raised for all parse errors."""
-
- def __init__(self, msg, position=(None, None)):
- assert msg
- self.msg = msg
- self.lineno = position[0]
- self.offset = position[1]
-
- def __str__(self):
- result = self.msg
- if self.lineno is not None:
- result = result + ", at line %d" % self.lineno
- if self.offset is not None:
- result = result + ", column %d" % (self.offset + 1)
- return result
-
-
-class HTMLParser(_markupbase.ParserBase):
- """Find tags and other markup and call handler functions.
-
- Usage:
- p = HTMLParser()
- p.feed(data)
- ...
- p.close()
-
- Start tags are handled by calling self.handle_starttag() or
- self.handle_startendtag(); end tags by self.handle_endtag(). The
- data between tags is passed from the parser to the derived class
- by calling self.handle_data() with the data as argument (the data
- may be split up in arbitrary chunks). Entity references are
- passed by calling self.handle_entityref() with the entity
- reference as the argument. Numeric character references are
- passed to self.handle_charref() with the string containing the
- reference as the argument.
- """
-
- CDATA_CONTENT_ELEMENTS = ("script", "style")
-
- def __init__(self, strict=True):
- """Initialize and reset this instance.
-
- If strict is set to True (the default), errors are raised when invalid
- HTML is encountered. If set to False, an attempt is instead made to
- continue parsing, making "best guesses" about the intended meaning, in
- a fashion similar to what browsers typically do.
- """
- self.strict = strict
- self.reset()
-
- def reset(self):
- """Reset this instance. Loses all unprocessed data."""
- self.rawdata = ''
- self.lasttag = '???'
- self.interesting = interesting_normal
- self.cdata_elem = None
- _markupbase.ParserBase.reset(self)
-
- def feed(self, data):
- r"""Feed data to the parser.
-
- Call this as often as you want, with as little or as much text
- as you want (may include '\n').
- """
- self.rawdata = self.rawdata + data
- self.goahead(0)
-
- def close(self):
- """Handle any buffered data."""
- self.goahead(1)
-
- def error(self, message):
- raise HTMLParseError(message, self.getpos())
-
- __starttag_text = None
-
- def get_starttag_text(self):
- """Return full source of start tag: '<...>'."""
- return self.__starttag_text
-
- def set_cdata_mode(self, elem):
- self.cdata_elem = elem.lower()
- self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
-
- def clear_cdata_mode(self):
- self.interesting = interesting_normal
- self.cdata_elem = None
-
- # Internal -- handle data as far as reasonable. May leave state
- # and data to be processed by a subsequent call. If 'end' is
- # true, force handling all data as if followed by EOF marker.
- def goahead(self, end):
- rawdata = self.rawdata
- i = 0
- n = len(rawdata)
- while i < n:
- match = self.interesting.search(rawdata, i) # < or &
- if match:
- j = match.start()
- else:
- if self.cdata_elem:
- break
- j = n
- if i < j: self.handle_data(rawdata[i:j])
- i = self.updatepos(i, j)
- if i == n: break
- startswith = rawdata.startswith
- if startswith('<', i):
- if starttagopen.match(rawdata, i): # < + letter
- k = self.parse_starttag(i)
- elif startswith("</", i):
- k = self.parse_endtag(i)
- elif startswith("<!--", i):
- k = self.parse_comment(i)
- elif startswith("<?", i):
- k = self.parse_pi(i)
- elif startswith("<!", i):
- if self.strict:
- k = self.parse_declaration(i)
- else:
- k = self.parse_html_declaration(i)
- elif (i + 1) < n:
- self.handle_data("<")
- k = i + 1
- else:
- break
- if k < 0:
- if not end:
- break
- if self.strict:
- self.error("EOF in middle of construct")
- k = rawdata.find('>', i + 1)
- if k < 0:
- k = rawdata.find('<', i + 1)
- if k < 0:
- k = i + 1
- else:
- k += 1
- self.handle_data(rawdata[i:k])
- i = self.updatepos(i, k)
- elif startswith("&#", i):
- match = charref.match(rawdata, i)
- if match:
- name = match.group()[2:-1]
- self.handle_charref(name)
- k = match.end()
- if not startswith(';', k-1):
- k = k - 1
- i = self.updatepos(i, k)
- continue
- else:
- if ";" in rawdata[i:]: #bail by consuming &#
- self.handle_data(rawdata[0:2])
- i = self.updatepos(i, 2)
- break
- elif startswith('&', i):
- match = entityref.match(rawdata, i)
- if match:
- name = match.group(1)
- self.handle_entityref(name)
- k = match.end()
- if not startswith(';', k-1):
- k = k - 1
- i = self.updatepos(i, k)
- continue
- match = incomplete.match(rawdata, i)
- if match:
- # match.group() will contain at least 2 chars
- if end and match.group() == rawdata[i:]:
- if self.strict:
- self.error("EOF in middle of entity or char ref")
- else:
- if k <= i:
- k = n
- i = self.updatepos(i, i + 1)
- # incomplete
- break
- elif (i + 1) < n:
- # not the end of the buffer, and can't be confused
- # with some other construct
- self.handle_data("&")
- i = self.updatepos(i, i + 1)
- else:
- break
- else:
- assert 0, "interesting.search() lied"
- # end while
- if end and i < n and not self.cdata_elem:
- self.handle_data(rawdata[i:n])
- i = self.updatepos(i, n)
- self.rawdata = rawdata[i:]
-
- # Internal -- parse html declarations, return length or -1 if not terminated
- # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
- # See also parse_declaration in _markupbase
- def parse_html_declaration(self, i):
- rawdata = self.rawdata
- if rawdata[i:i+2] != '<!':
- self.error('unexpected call to parse_html_declaration()')
- if rawdata[i:i+4] == '<!--':
- # this case is actually already handled in goahead()
- return self.parse_comment(i)
- elif rawdata[i:i+3] == '<![':
- return self.parse_marked_section(i)
- elif rawdata[i:i+9].lower() == '<!doctype':
- # find the closing >
- gtpos = rawdata.find('>', i+9)
- if gtpos == -1:
- return -1
- self.handle_decl(rawdata[i+2:gtpos])
- return gtpos+1
- else:
- return self.parse_bogus_comment(i)
-
- # Internal -- parse bogus comment, return length or -1 if not terminated
- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
- def parse_bogus_comment(self, i, report=1):
- rawdata = self.rawdata
- if rawdata[i:i+2] not in ('<!', '</'):
- self.error('unexpected call to parse_comment()')
- pos = rawdata.find('>', i+2)
- if pos == -1:
- return -1
- if report:
- self.handle_comment(rawdata[i+2:pos])
- return pos + 1
-
- # Internal -- parse processing instr, return end or -1 if not terminated
- def parse_pi(self, i):
- rawdata = self.rawdata
- assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
- match = piclose.search(rawdata, i+2) # >
- if not match:
- return -1
- j = match.start()
- self.handle_pi(rawdata[i+2: j])
- j = match.end()
- return j
-
- # Internal -- handle starttag, return end or -1 if not terminated
- def parse_starttag(self, i):
- self.__starttag_text = None
- endpos = self.check_for_whole_start_tag(i)
- if endpos < 0:
- return endpos
- rawdata = self.rawdata
- self.__starttag_text = rawdata[i:endpos]
-
- # Now parse the data between i+1 and j into a tag and attrs
- attrs = []
- match = tagfind.match(rawdata, i+1)
- assert match, 'unexpected call to parse_starttag()'
- k = match.end()
- self.lasttag = tag = match.group(1).lower()
- while k < endpos:
- if self.strict:
- m = attrfind.match(rawdata, k)
- else:
- m = attrfind_tolerant.match(rawdata, k)
- if not m:
- break
- attrname, rest, attrvalue = m.group(1, 2, 3)
- if not rest:
- attrvalue = None
- elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
- attrvalue[:1] == '"' == attrvalue[-1:]:
- attrvalue = attrvalue[1:-1]
- if attrvalue:
- attrvalue = self.unescape(attrvalue)
- attrs.append((attrname.lower(), attrvalue))
- k = m.end()
-
- end = rawdata[k:endpos].strip()
- if end not in (">", "/>"):
- lineno, offset = self.getpos()
- if "\n" in self.__starttag_text:
- lineno = lineno + self.__starttag_text.count("\n")
- offset = len(self.__starttag_text) \
- - self.__starttag_text.rfind("\n")
- else:
- offset = offset + len(self.__starttag_text)
- if self.strict:
- self.error("junk characters in start tag: %r"
- % (rawdata[k:endpos][:20],))
- self.handle_data(rawdata[i:endpos])
- return endpos
- if end.endswith('/>'):
- # XHTML-style empty tag: <span attr="value" />
- self.handle_startendtag(tag, attrs)
- else:
- self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
- return endpos
-
- # Internal -- check to see if we have a complete starttag; return end
- # or -1 if incomplete.
- def check_for_whole_start_tag(self, i):
- rawdata = self.rawdata
- if self.strict:
- m = locatestarttagend.match(rawdata, i)
- else:
- m = locatestarttagend_tolerant.match(rawdata, i)
- if m:
- j = m.end()
- next = rawdata[j:j+1]
- if next == ">":
- return j + 1
- if next == "/":
- if rawdata.startswith("/>", j):
- return j + 2
- if rawdata.startswith("/", j):
- # buffer boundary
- return -1
- # else bogus input
- if self.strict:
- self.updatepos(i, j + 1)
- self.error("malformed empty start tag")
- if j > i:
- return j
- else:
- return i + 1
- if next == "":
- # end of input
- return -1
- if next in ("abcdefghijklmnopqrstuvwxyz=/"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
- # end of input in or before attribute value, or we have the
- # '/' from a '/>' ending
- return -1
- if self.strict:
- self.updatepos(i, j)
- self.error("malformed start tag")
- if j > i:
- return j
- else:
- return i + 1
- raise AssertionError("we should not get here!")
-
- # Internal -- parse endtag, return end or -1 if incomplete
- def parse_endtag(self, i):
- rawdata = self.rawdata
- assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
- match = endendtag.search(rawdata, i+1) # >
- if not match:
- return -1
- gtpos = match.end()
- match = endtagfind.match(rawdata, i) # </ + tag + >
- if not match:
- if self.cdata_elem is not None:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
- if self.strict:
- self.error("bad end tag: %r" % (rawdata[i:gtpos],))
- # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
- namematch = tagfind_tolerant.match(rawdata, i+2)
- if not namematch:
- # w3.org/TR/html5/tokenization.html#end-tag-open-state
- if rawdata[i:i+3] == '</>':
- return i+3
- else:
- return self.parse_bogus_comment(i)
- tagname = namematch.group().lower()
- # consume and ignore other stuff between the name and the >
- # Note: this is not 100% correct, since we might have things like
- # </tag attr=">">, but looking for > after tha name should cover
- # most of the cases and is much simpler
- gtpos = rawdata.find('>', namematch.end())
- self.handle_endtag(tagname)
- return gtpos+1
-
- elem = match.group(1).lower() # script or style
- if self.cdata_elem is not None:
- if elem != self.cdata_elem:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
-
- self.handle_endtag(elem.lower())
- self.clear_cdata_mode()
- return gtpos
-
- # Overridable -- finish processing of start+end tag: <tag.../>
- def handle_startendtag(self, tag, attrs):
- self.handle_starttag(tag, attrs)
- self.handle_endtag(tag)
-
- # Overridable -- handle start tag
- def handle_starttag(self, tag, attrs):
- pass
-
- # Overridable -- handle end tag
- def handle_endtag(self, tag):
- pass
-
- # Overridable -- handle character reference
- def handle_charref(self, name):
- pass
-
- # Overridable -- handle entity reference
- def handle_entityref(self, name):
- pass
-
- # Overridable -- handle data
- def handle_data(self, data):
- pass
-
- # Overridable -- handle comment
- def handle_comment(self, data):
- pass
-
- # Overridable -- handle declaration
- def handle_decl(self, decl):
- pass
-
- # Overridable -- handle processing instruction
- def handle_pi(self, data):
- pass
-
- def unknown_decl(self, data):
- if self.strict:
- self.error("unknown declaration: %r" % (data,))
-
- # Internal -- helper to remove special character quoting
- entitydefs = None
- def unescape(self, s):
- if '&' not in s:
- return s
- # -------------------------------------------------------- change start
- if PY3:
- def replaceEntities(s):
- s = s.groups()[0]
- try:
- if s[0] == "#":
- s = s[1:]
- if s[0] in ['x','X']:
- c = int(s[1:], 16)
- else:
- c = int(s)
- return chr(c)
- except ValueError:
- return '&#'+ s +';'
- else:
- # Cannot use name2codepoint directly, because HTMLParser
- # supports apos, which is not part of HTML 4
- import html.entities
- if HTMLParser.entitydefs is None:
- entitydefs = HTMLParser.entitydefs = {'apos':"'"}
- for k, v in html.entities.name2codepoint.items():
- entitydefs[k] = chr(v)
- try:
- return self.entitydefs[s]
- except KeyError:
- return '&'+s+';'
-
- return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
- replaceEntities, s, flags=re.ASCII)
- else:
- def replaceEntities(s):
- s = s.groups()[0]
- try:
- if s[0] == "#":
- s = s[1:]
- if s[0] in ['x','X']:
- c = int(s[1:], 16)
- else:
- c = int(s)
- return unichr(c)
- except ValueError:
- return '&#'+s+';'
- else:
- # Cannot use name2codepoint directly, because HTMLParser supports apos,
- # which is not part of HTML 4
- import htmlentitydefs
- if HTMLParser.entitydefs is None:
- entitydefs = HTMLParser.entitydefs = {'apos':"'"}
- for k, v in htmlentitydefs.name2codepoint.iteritems():
- entitydefs[k] = unichr(v)
- try:
- return self.entitydefs[s]
- except KeyError:
- return '&'+s+';'
-
- return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
- # -------------------------------------------------------- change end
diff --git a/creole/shared/base_emitter.py b/creole/shared/base_emitter.py
index de6fd2f..a29c224 100644
--- a/creole/shared/base_emitter.py
+++ b/creole/shared/base_emitter.py
@@ -1,20 +1,16 @@
-#!/usr/bin/env python
-# coding: utf-8
"""
Base document tree emitter
~~~~~~~~~~~~~~~~~~~~~~~~~~
- :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
from creole.parser.html_parser_config import BLOCK_TAGS
from creole.html_tools.deentity import Deentity
-from creole.py3compat import TEXT_TYPE
from creole.shared.markup_table import MarkupTable
from creole.shared.unknown_tags import transparent_unknown_nodes
@@ -196,7 +192,7 @@ class BaseEmitter(object):
result = []
for child in node.children:
content = self.emit_node(child)
- assert isinstance(content, TEXT_TYPE)
+ assert isinstance(content, str)
result.append(content)
return result
@@ -220,11 +216,11 @@ class BaseEmitter(object):
if emit_method:
content = emit_method(node)
- if not isinstance(content, TEXT_TYPE):
+ if not isinstance(content, str):
unicode_error(method_name, emit_method, node, content)
else:
content = self._unknown_emit(self, node)
- if not isinstance(content, TEXT_TYPE):
+ if not isinstance(content, str):
unicode_error(method_name, self._unknown_emit, node, content)
self.last = node
diff --git a/creole/shared/document_tree.py b/creole/shared/document_tree.py
index 4971953..e5b7bcf 100644
--- a/creole/shared/document_tree.py
+++ b/creole/shared/document_tree.py
@@ -6,16 +6,13 @@
~~~~~~~~~~~~~
- :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
-
import warnings
import inspect
-from creole.py3compat import TEXT_TYPE
from creole.shared.utils import dict2string
@@ -35,7 +32,7 @@ class DocNode:
self.attrs = dict(attrs)
if content:
- assert isinstance(content, TEXT_TYPE), "Given content %r is not unicode, it's type: %s" % (
+ assert isinstance(content, str), "Given content %r is not unicode, it's type: %s" % (
content, type(content)
)
diff --git a/creole/shared/html_parser.py b/creole/shared/html_parser.py
deleted file mode 100644
index 0bdb7c4..0000000
--- a/creole/shared/html_parser.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# coding: utf-8
-
-"""
- HTMLParser for Python 2.x and 3.x
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- The HTMLParser has problems with the correct handling of <script>...</script>
- and <style>...</style> areas.
-
- It was fixed with v2.7.3 and 3.2.3, see:
- http://www.python.org/download/releases/2.7.3/
- http://www.python.org/download/releases/3.2.3/
- see also:
- http://bugs.python.org/issue670664#msg146770
-
- :copyleft: 2011-2012 by python-creole team, see AUTHORS for more details.
- :license: GNU GPL v3 or above, see LICENSE for more details.
-"""
-
-
-try:
- import HTMLParser as OriginHTMLParser
-except ImportError:
- from html import parser as OriginHTMLParser # python 3
-
-
-if hasattr(OriginHTMLParser, "cdata_elem"):
- # Current python version is patched -> use the original
- HTMLParser = OriginHTMLParser
-else:
- # Current python version is not patched -> use own patched version
- from creole.shared.HTMLParsercompat import HTMLParser
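The deleted shared/html_parser.py existed only to work around old interpreters whose HTMLParser mishandled <script>/<style>; on the Python versions this commit targets, the stdlib parser already treats those elements as CDATA. A minimal sketch, not project code:

from html.parser import HTMLParser


class ScriptCollector(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=False)
        self.chunks = []

    def handle_data(self, data):
        # the "<" inside the script body is not tokenized as a tag
        self.chunks.append(data)


parser = ScriptCollector()
parser.feed("<script>if (a < b) { go(); }</script>")
parser.close()
assert "".join(parser.chunks) == "if (a < b) { go(); }"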
diff --git a/creole/shared/utils.py b/creole/shared/utils.py
index f1b981d..cb55299 100644
--- a/creole/shared/utils.py
+++ b/creole/shared/utils.py
@@ -5,7 +5,7 @@
python creole utilities
~~~~~~~~~~~~~~~~~~~~~~~
- :copyleft: 2011-2014 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2011-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
@@ -14,7 +14,7 @@
import shlex
import json
-from creole.py3compat import TEXT_TYPE, PY3
+
try:
from pygments import lexers
@@ -31,7 +31,7 @@ KEYWORD_MAP = {
"None": None,
}
-def string2dict(raw_content, encoding="utf-8"):
+def string2dict(raw_content):
"""
convert a string into a dictionary. e.g.:
@@ -43,10 +43,6 @@ def string2dict(raw_content, encoding="utf-8"):
See test_creole2html.TestString2Dict()
"""
- if not PY3 and isinstance(raw_content, TEXT_TYPE):
- # shlex.split doesn't work with unicode?!?
- raw_content = raw_content.encode(encoding)
-
parts = shlex.split(raw_content)
result = {}
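string2dict() can now hand the incoming str straight to shlex.split(); the old encode("utf-8") workaround was only needed because Python 2's shlex choked on unicode. A small illustrative example (the attribute string is made up):

import shlex

parts = shlex.split('width="70%" height=10 class="no"')
print(parts)  # -> ['width=70%', 'height=10', 'class=no']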
diff --git a/creole/tests/test_creole2html.py b/creole/tests/test_creole2html.py
index b75435d..b6f9ed0 100644
--- a/creole/tests/test_creole2html.py
+++ b/creole/tests/test_creole2html.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-# coding: utf-8
"""
creole2html unittest
@@ -12,7 +10,7 @@
Test the creole markup.
- :copyleft: 2008-2014 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
@@ -20,12 +18,9 @@
import sys
import unittest
-import warnings
-try:
- from StringIO import StringIO
-except ImportError:
- from io import StringIO # python 3
+
+from io import StringIO
try:
from pygments import highlight
@@ -35,7 +30,6 @@ except ImportError:
from creole.tests.utils.base_unittest import BaseCreoleTest
from creole.tests import test_macros
-from creole.py3compat import PY3
from creole import creole2html
from creole.shared import example_macros
diff --git a/creole/tests/test_setup_utils.py b/creole/tests/test_setup_utils.py
index 462351e..30a532e 100644
--- a/creole/tests/test_setup_utils.py
+++ b/creole/tests/test_setup_utils.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-# coding: utf-8
"""
unittest for setup_utils
@@ -7,12 +5,11 @@
https://code.google.com/p/python-creole/wiki/UseInSetup
- :copyleft: 2011-2014 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2011-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
-
import unittest
import os
import warnings
@@ -26,7 +23,6 @@ except ImportError:
import creole
from creole.setup_utils import get_long_description
from creole.tests.utils.base_unittest import BaseCreoleTest
-from creole.py3compat import BINARY_TYPE, PY3, TEXT_TYPE
import tempfile
@@ -124,16 +120,7 @@ class SetupUtilsTests(BaseCreoleTest):
def test_readme_encoding(self):
long_description = get_long_description(TEST_README_DIR, filename=TEST_README_FILENAME, raise_errors=True)
-
- if PY3:
- self.assertTrue(isinstance(long_description, TEXT_TYPE))
- else:
- self.assertTrue(isinstance(long_description, BINARY_TYPE))
+ self.assertTrue(isinstance(long_description, str))
txt = "German Umlaute: ä ö ü ß Ä Ö Ü"
- if not PY3:
- txt = txt.encode("utf-8")
self.assertIn(txt, long_description)
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/creole/tests/utils/base_unittest.py b/creole/tests/utils/base_unittest.py
index e375168..26b67fa 100644
--- a/creole/tests/utils/base_unittest.py
+++ b/creole/tests/utils/base_unittest.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
"""
unitest base class
@@ -7,7 +5,7 @@
Basic unittest class for all python-creole tests.
- :copyleft: 2008-2014 by python-creole team, see AUTHORS for more details.
+ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
@@ -17,7 +15,6 @@ import re
import warnings
from creole.tests.utils.utils import MarkupTest
-from creole.py3compat import TEXT_TYPE
try:
@@ -109,10 +106,10 @@ class BaseCreoleTest(MarkupTest):
# prepare whitespace on test strings
markup_string = self._prepare_text(raw_creole)
- assert isinstance(markup_string, TEXT_TYPE)
+ assert isinstance(markup_string, str)
html_string = self._prepare_text(raw_html)
- assert isinstance(html_string, TEXT_TYPE)
+ assert isinstance(html_string, str)
if strip_lines:
html_string = strip_html_lines(html_string, strip_lines)
self._debug_text("assert_creole2html() html_string reference", html_string)
@@ -162,7 +159,7 @@ class BaseCreoleTest(MarkupTest):
"""
self.assertEqual(parser_kwargs, {}, "parser_kwargs is deprecated!")
self.assertEqual(emitter_kwargs, {}, "parser_kwargs is deprecated!")
-# assert isinstance(raw_html, TEXT_TYPE)
+# assert isinstance(raw_html, str)
# creole_string = unicode(creole_string, encoding="utf8")
# raw_html = unicode(raw_html, "utf8")
@@ -170,12 +167,12 @@ class BaseCreoleTest(MarkupTest):
# prepare whitespace on test strings
creole = self._prepare_text(raw_creole)
- assert isinstance(creole, TEXT_TYPE)
+ assert isinstance(creole, str)
if debug:
self._debug_text("assert_creole2html() markup", creole)
html = self._prepare_text(raw_html)
- assert isinstance(html, TEXT_TYPE)
+ assert isinstance(html, str)
self.assert_html2creole2(creole, html, debug, unknown_emit, strict)
@@ -201,8 +198,8 @@ class BaseCreoleTest(MarkupTest):
self.assertEqual(html_parser_kwargs, {}, "html_parser_kwargs is deprecated!")
self.assertEqual(creole_emitter_kwargs, {}, "creole_emitter_kwargs is deprecated!")
- assert isinstance(creole_string, TEXT_TYPE)
- assert isinstance(html_string, TEXT_TYPE)
+ assert isinstance(creole_string, str)
+ assert isinstance(html_string, str)
self.assertNotEqual(creole_string, html_string)
self.assert_creole2html(
@@ -248,8 +245,8 @@ class BaseCreoleTest(MarkupTest):
* html2textile
* textile2html
"""
-# assert isinstance(textile_string, TEXT_TYPE)
-# assert isinstance(html_string, TEXT_TYPE)
+# assert isinstance(textile_string, str)
+# assert isinstance(html_string, str)
self.assertNotEqual(textile_string, html_string)
# compare html -> textile
@@ -327,8 +324,8 @@ class BaseCreoleTest(MarkupTest):
def cross_compare_rest(self, rest_string, html_string, \
strip_lines=False, debug=False, parser_kwargs={}, emitter_kwargs={}):
-# assert isinstance(textile_string, TEXT_TYPE)
-# assert isinstance(html_string, TEXT_TYPE)
+# assert isinstance(textile_string, str)
+# assert isinstance(html_string, str)
self.assertNotEqual(rest_string, html_string)
rest_string, html_string = self.assert_html2rest(