author     Dmitry Shachnev <mitya57@gmail.com>  2021-03-24 20:59:28 +0300
committer  Waylan Limberg <waylan.limberg@icloud.com>  2021-03-24 14:35:59 -0400
commit     a11431539d08e14b0bd821ceb101fa59d6a74c8a (patch)
tree       35b9fcd6f20e1b45ae50c84f182714f66cb8c1b7
parent     14c2fa92302084e4850e5323ed56721e43fbbdb9 (diff)
download   python-markdown-a11431539d08e14b0bd821ceb101fa59d6a74c8a.tar.gz
toc: Do not remove diacritical marks when slugify_unicode is used
Update the existing test and add a new one to make sure that the behavior of the default slugify function has not changed. Fixes #1118.
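A minimal sketch of the intended behavior after this patch, calling the two functions changed in markdown/extensions/toc.py below; the expected outputs are inferred from the patched code, not captured from a run:

    from markdown.extensions.toc import slugify, slugify_unicode

    # The default slugify still transliterates Extended Latin to ASCII.
    slugify('Théâtre', '-')            # expected: 'theatre'

    # slugify_unicode now keeps diacritical marks instead of dropping them
    # after NFKD decomposition.
    slugify_unicode('Théâtre', '-')    # expected: 'théâtre'
    slugify_unicode('ヘッダー', '-')    # expected: 'ヘッダー' (previously 'ヘッター': the dakuten was stripped)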
-rw-r--r--  docs/change_log/index.md                   4
-rw-r--r--  markdown/extensions/toc.py                11
-rw-r--r--  tests/test_syntax/extensions/test_toc.py  18
3 files changed, 25 insertions, 8 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index d7487a3..aed19e3 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
Python-Markdown Change Log
=========================
+Under development: version 3.3.5 (a bug-fix release).
+
+* Make the `slugify_unicode` function not remove diacritical marks (#1118).
+
Feb 24, 2021: version 3.3.4 (a bug-fix release).
* Properly parse unclosed tags in code spans (#1066).
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index d64ec16..965ba4a 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -23,16 +23,19 @@ import unicodedata
import xml.etree.ElementTree as etree
-def slugify(value, separator, encoding='ascii'):
+def slugify(value, separator, unicode=False):
""" Slugify a string, to make it URL friendly. """
- value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
- value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
+ if not unicode:
+ # Replace Extended Latin characters with ASCII, i.e. žlutý → zluty
+ value = unicodedata.normalize('NFKD', value)
+ value = value.encode('ascii', 'ignore').decode('ascii')
+ value = re.sub(r'[^\w\s-]', '', value).strip().lower()
return re.sub(r'[{}\s]+'.format(separator), separator, value)
def slugify_unicode(value, separator):
""" Slugify a string, to make it URL friendly while preserving Unicode characters. """
- return slugify(value, separator, 'utf-8')
+ return slugify(value, separator, unicode=True)
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
index 04893e3..83c990f 100644
--- a/tests/test_syntax/extensions/test_toc.py
+++ b/tests/test_syntax/extensions/test_toc.py
@@ -534,9 +534,9 @@ class TestTOC(TestCase):
        from markdown.extensions.toc import slugify_unicode
        self.assertMarkdownRenders(
            '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'  # noqa
+            '<h1 id="unicode-ヘッダー">'  # noqa
                'Unicode ヘッダー'  # noqa
-                '<a class="headerlink" href="#unicode-ヘッター" title="Permanent link">&para;</a>'  # noqa
+                '<a class="headerlink" href="#unicode-ヘッダー" title="Permanent link">&para;</a>'  # noqa
            '</h1>',  # noqa
            extensions=[TocExtension(permalink=True, slugify=slugify_unicode)]
        )
@@ -545,9 +545,19 @@ class TestTOC(TestCase):
        from markdown.extensions.toc import slugify_unicode
        self.assertMarkdownRenders(
            '# Unicode ヘッダー',
-            '<h1 id="unicode-ヘッター">'  # noqa
+            '<h1 id="unicode-ヘッダー">'  # noqa
                'Unicode ヘッダー'  # noqa
-                '<a class="headerlink" href="#unicode-ヘッター" title="パーマリンク">&para;</a>'  # noqa
+                '<a class="headerlink" href="#unicode-ヘッダー" title="パーマリンク">&para;</a>'  # noqa
            '</h1>',  # noqa
            extensions=[TocExtension(permalink=True, permalink_title="パーマリンク", slugify=slugify_unicode)]
        )
+
+    def testPermalinkWithExtendedLatinInID(self):
+        self.assertMarkdownRenders(
+            '# Théâtre',
+            '<h1 id="theatre">'  # noqa
+                'Théâtre'  # noqa
+                '<a class="headerlink" href="#theatre" title="Permanent link">&para;</a>'  # noqa
+            '</h1>',  # noqa
+            extensions=[TocExtension(permalink=True)]
+        )
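
For reference, a hedged usage sketch showing how the fixed slugify_unicode would typically be wired into the toc extension from user code; the expected output is inferred from the patched code and the tests above, not from an actual run:

    import markdown
    from markdown.extensions.toc import TocExtension, slugify_unicode

    # With slugify_unicode, the generated heading id keeps its diacritics.
    html = markdown.markdown(
        '# Théâtre',
        extensions=[TocExtension(slugify=slugify_unicode)],
    )
    # expected: <h1 id="théâtre">Théâtre</h1>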