author     Dmitry Shachnev <mitya57@gmail.com>  2021-03-24 20:59:28 +0300
committer  Waylan Limberg <waylan.limberg@icloud.com>  2021-03-24 14:35:59 -0400
commit     a11431539d08e14b0bd821ceb101fa59d6a74c8a (patch)
tree       35b9fcd6f20e1b45ae50c84f182714f66cb8c1b7
parent     14c2fa92302084e4850e5323ed56721e43fbbdb9 (diff)
download   python-markdown-a11431539d08e14b0bd821ceb101fa59d6a74c8a.tar.gz
toc: Do not remove diacritical marks when slugify_unicode is used
Update the existing test and add a new one to make sure that the behavior of the default slugify function has not changed. Fixes #1118.
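A minimal sketch of the intended behavior after this patch, calling the two functions changed in markdown/extensions/toc.py below; the expected outputs are inferred from the patched code, not captured from a run:

    from markdown.extensions.toc import slugify, slugify_unicode

    # The default slugify still transliterates Extended Latin to ASCII.
    slugify('Théâtre', '-')            # expected: 'theatre'

    # slugify_unicode now keeps diacritical marks instead of dropping them
    # after NFKD decomposition.
    slugify_unicode('Théâtre', '-')    # expected: 'théâtre'
    slugify_unicode('ヘッダー', '-')    # expected: 'ヘッダー' (previously 'ヘッター': the dakuten was stripped)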
-rw-r--r--  docs/change_log/index.md                   4
-rw-r--r--  markdown/extensions/toc.py                11
-rw-r--r--  tests/test_syntax/extensions/test_toc.py  18
3 files changed, 25 insertions, 8 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index d7487a3..aed19e3 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
Python-Markdown Change Log
=========================
+Under development: version 3.3.5 (a bug-fix release).
+
+* Make the `slugify_unicode` function not remove diacritical marks (#1118).
+
Feb 24, 2021: version 3.3.4 (a bug-fix release).
* Properly parse unclosed tags in code spans (#1066).
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index d64ec16..965ba4a 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -23,16 +23,19 @@ import unicodedata
import xml.etree.ElementTree as etree
-def slugify(value, separator, encoding='ascii'):
+def slugify(value, separator, unicode=False):
""" Slugify a string, to make it URL friendly. """
- value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
- value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
+ if not unicode:
+ # Replace Extended Latin characters with ASCII, i.e. žlutý → zluty
+ value = unicodedata.normalize('NFKD', value)
+ value = value.encode('ascii', 'ignore').decode('ascii')
+ value = re.sub(r'[^\w\s-]', '', value).strip().lower()
return re.sub(r'[{}\s]+'.format(separator), separator, value)
def slugify_unicode(value, separator):
""" Slugify a string, to make it URL friendly while preserving Unicode characters. """
- return slugify(value, separator, 'utf-8')
+ return slugify(value, separator, unicode=True)
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
index 04893e3..83c990f 100644
--- a/tests/test_syntax/extensions/test_toc.py
+++ b/tests/test_syntax/extensions/test_toc.py
@@ -534,9 +534,9 @@ class TestTOC(TestCase):
        from markdown.extensions.toc import slugify_unicode
        self.assertMarkdownRenders(
            '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'  # noqa
+            '<h1 id="unicode-ヘッダー">'  # noqa
                'Unicode ヘッダー'  # noqa
-                '<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">&para;</a>'  # noqa
+                '<a class="headerlink" href="#unicode-ヘッダー" title="Permanent link">&para;</a>'  # noqa
            '</h1>',  # noqa
            extensions=[TocExtension(permalink=True, slugify=slugify_unicode)]
        )
@@ -545,9 +545,19 @@ class TestTOC(TestCase):
        from markdown.extensions.toc import slugify_unicode
        self.assertMarkdownRenders(
            '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'  # noqa
+            '<h1 id="unicode-ヘッダー">'  # noqa
                'Unicode ヘッダー'  # noqa
-                '<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">&para;</a>'  # noqa
+                '<a class="headerlink" href="#unicode-ヘッダー" title="パーマリンク">&para;</a>'  # noqa
            '</h1>',  # noqa
            extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)]
        )
+
+    def testPermalinkWithExtendedLatinInID(self):
+        self.assertMarkdownRenders(
+            '# Théâtre',
+            '<h1 id="theatre">'  # noqa
+                'Théâtre'  # noqa
+                '<a class="headerlink" href="#theatre" title="Permanent link">&para;</a>'  # noqa
+            '</h1>',  # noqa
+            extensions=[TocExtension(permalink=True)]
+        )
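
For reference, a hedged usage sketch showing how the fixed slugify_unicode would typically be wired into the toc extension from user code; the expected output is inferred from the patched code and the tests above, not from an actual run:

    import markdown
    from markdown.extensions.toc import TocExtension, slugify_unicode

    # With slugify_unicode, the generated heading id keeps its diacritics.
    html = markdown.markdown(
        '# Théâtre',
        extensions=[TocExtension(slugify=slugify_unicode)],
    )
    # expected: <h1 id="théâtre">Théâtre</h1>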