diff options
author | Dmitry Shachnev <mitya57@gmail.com> | 2021-03-24 20:59:28 +0300 |
---|---|---|
committer | Waylan Limberg <waylan.limberg@icloud.com> | 2021-03-24 14:35:59 -0400 |
commit | a11431539d08e14b0bd821ceb101fa59d6a74c8a (patch) | |
tree | 35b9fcd6f20e1b45ae50c84f182714f66cb8c1b7 | |
parent | 14c2fa92302084e4850e5323ed56721e43fbbdb9 (diff) | |
download | python-markdown-a11431539d08e14b0bd821ceb101fa59d6a74c8a.tar.gz |
toc: Do not remove diacritical marks when slugify_unicode is used
Update the existing test and add a new one to make sure that the
behavior of the default slugify function has not changed.
Fixes #1118.
-rw-r--r-- | docs/change_log/index.md | 4 | ++++ |
-rw-r--r-- | markdown/extensions/toc.py | 11 | +++++++---- |
-rw-r--r-- | tests/test_syntax/extensions/test_toc.py | 18 | ++++++++++++++---- |
3 files changed, 25 insertions, 8 deletions
```diff
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index d7487a3..aed19e3 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
 Python-Markdown Change Log
 =========================
 
+Under development: version 3.3.5 (a bug-fix release).
+
+* Make the `slugify_unicode` function not remove diacritical marks (#1118).
+
 Feb 24, 2021: version 3.3.4 (a bug-fix release).
 
 * Properly parse unclosed tags in code spans (#1066).
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index d64ec16..965ba4a 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -23,16 +23,19 @@ import unicodedata
 import xml.etree.ElementTree as etree
 
 
-def slugify(value, separator, encoding='ascii'):
+def slugify(value, separator, unicode=False):
     """ Slugify a string, to make it URL friendly. """
-    value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
-    value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
+    if not unicode:
+        # Replace Extended Latin characters with ASCII, i.e. žlutý → zluty
+        value = unicodedata.normalize('NFKD', value)
+        value = value.encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
     return re.sub(r'[{}\s]+'.format(separator), separator, value)
 
 
 def slugify_unicode(value, separator):
     """ Slugify a string, to make it URL friendly while preserving Unicode characters. """
-    return slugify(value, separator, 'utf-8')
+    return slugify(value, separator, unicode=True)
 
 
 IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
index 04893e3..83c990f 100644
--- a/tests/test_syntax/extensions/test_toc.py
+++ b/tests/test_syntax/extensions/test_toc.py
@@ -534,9 +534,9 @@ class TestTOC(TestCase):
         from markdown.extensions.toc import slugify_unicode
         self.assertMarkdownRenders(
             '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'  # noqa
+            '<h1 id="unicode-ヘッダー">'  # noqa
             'Unicode ヘッダー'  # noqa
-            '<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">&para;</a>'  # noqa
+            '<a class="headerlink" href="#unicode-ヘッダー" title="Permanent link">&para;</a>'  # noqa
             '</h1>',  # noqa
             extensions=[TocExtension(permalink=True, slugify=slugify_unicode)]
         )
@@ -545,9 +545,19 @@ class TestTOC(TestCase):
         from markdown.extensions.toc import slugify_unicode
         self.assertMarkdownRenders(
             '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'  # noqa
+            '<h1 id="unicode-ヘッダー">'  # noqa
             'Unicode ヘッダー'  # noqa
-            '<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">&para;</a>'  # noqa
+            '<a class="headerlink" href="#unicode-ヘッダー" title="パーマリンク">&para;</a>'  # noqa
             '</h1>',  # noqa
             extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)]
         )
+
+    def testPermalinkWithExtendedLatinInID(self):
+        self.assertMarkdownRenders(
+            '# Théâtre',
+            '<h1 id="theatre">'  # noqa
+            'Théâtre'  # noqa
+            '<a class="headerlink" href="#theatre" title="Permanent link">&para;</a>'  # noqa
+            '</h1>',  # noqa
+            extensions=[TocExtension(permalink=True)]
+        )
```