Merge pull request #970 from jeanas/autojunk

Fix two issues with fuzzy matching
author: Aarni Koskela <akx@iki.fi> 2023-02-20 15:49:35 +0200
committer: GitHub <noreply@github.com> 2023-02-20 15:49:35 +0200
commit: c76f1d4bc574c4885b7a1afa90da3ca5026153ec (patch)
tree: e1e47f90009b8e6c964bff95e3fc89bf74a60613
parent: 08af5e2bab184c1b5d357ebde8c0efdbe6288e2c (diff)
parent: c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 (diff)
download: babel-c76f1d4bc574c4885b7a1afa90da3ca5026153ec.tar.gz
2 files changed, 60 insertions, 12 deletions
diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py
index dead4aa..a500e77 100644
--- a/babel/messages/catalog.py
+++ b/babel/messages/catalog.py
@@ -14,8 +14,9 @@ import re
 from collections import OrderedDict
 from collections.abc import Iterable, Iterator
 from copy import copy
-from difflib import get_close_matches
+from difflib import SequenceMatcher
 from email import message_from_string
+from heapq import nlargest
 from typing import TYPE_CHECKING
 
 from babel import __version__ as VERSION
@@ -31,6 +32,31 @@ if TYPE_CHECKING:
 
 __all__ = ['Message', 'Catalog', 'TranslationError']
 
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """Return up to ``n`` best matches for ``word`` from ``possibilities``.
+
+    Same as ``difflib.get_close_matches`` except ``SequenceMatcher`` is built
+    with ``autojunk=False`` (https://github.com/python/cpython/issues/90825).
+    """
+    if not n >  0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+    result = []
+    s = SequenceMatcher(autojunk=False) # only line changed from difflib.py
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and \
+           s.quick_ratio() >= cutoff and \
+           s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+
+    # Keep only the n highest-scoring (score, candidate) pairs, best first
+    result = nlargest(n, result)
+    # Drop the scores and return just the matched candidates
+    return [x for score, x in result]
+
 
 PYTHON_FORMAT = re.compile(r'''
     \%
@@ -803,10 +829,13 @@ class Catalog:
         # Prepare for fuzzy matching
         fuzzy_candidates = []
         if not no_fuzzy_matching:
-            fuzzy_candidates = {
-                self._key_for(msgid): messages[msgid].context
-                for msgid in messages if msgid and messages[msgid].string
-            }
+            fuzzy_candidates = {}
+            for msgid in messages:
+                if msgid and messages[msgid].string:
+                    key = self._key_for(msgid)
+                    ctxt = messages[msgid].context
+                    modified_key = key.lower().strip()
+                    fuzzy_candidates[modified_key] = (key, ctxt)
         fuzzy_matches = set()
 
         def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, str] | str) -> None:
@@ -861,8 +890,8 @@ class Catalog:
                         matches = get_close_matches(matchkey.lower().strip(),
                                                     fuzzy_candidates.keys(), 1)
                         if matches:
-                            newkey = matches[0]
-                            newctxt = fuzzy_candidates[newkey]
+                            modified_key = matches[0]
+                            newkey, newctxt = fuzzy_candidates[modified_key]
                             if newctxt is not None:
                                 newkey = newkey, newctxt
                             _merge(message, newkey, key)
diff --git a/tests/messages/test_catalog.py b/tests/messages/test_catalog.py
index 273c83f..b9d72bc 100644
--- a/tests/messages/test_catalog.py
+++ b/tests/messages/test_catalog.py
@@ -121,16 +121,16 @@ class CatalogTestCase(unittest.TestCase):
 
     def test_update_fuzzy_matching_with_case_change(self):
         cat = catalog.Catalog()
-        cat.add('foo', 'Voh')
+        cat.add('FOO', 'Voh')
         cat.add('bar', 'Bahr')
         tmpl = catalog.Catalog()
-        tmpl.add('Foo')
+        tmpl.add('foo')
         cat.update(tmpl)
         assert len(cat.obsolete) == 1
-        assert 'foo' not in cat
+        assert 'FOO' not in cat
 
-        assert cat['Foo'].string == 'Voh'
-        assert cat['Foo'].fuzzy is True
+        assert cat['foo'].string == 'Voh'
+        assert cat['foo'].fuzzy is True
 
     def test_update_fuzzy_matching_with_char_change(self):
         cat = catalog.Catalog()
@@ -209,6 +209,25 @@ class CatalogTestCase(unittest.TestCase):
         assert cat['fooo'].string == 'Vohe'
         assert cat['fooo'].fuzzy is True
 
+    def test_update_fuzzy_matching_long_string(self):
+        lipsum = "\
+Lorem Ipsum is simply dummy text of the printing and typesetting \
+industry. Lorem Ipsum has been the industry's standard dummy text ever \
+since the 1500s, when an unknown printer took a galley of type and \
+scrambled it to make a type specimen book. It has survived not only \
+five centuries, but also the leap into electronic typesetting, \
+remaining essentially unchanged. It was popularised in the 1960s with \
+the release of Letraset sheets containing Lorem Ipsum passages, and \
+more recently with desktop publishing software like Aldus PageMaker \
+including versions of Lorem Ipsum."
+        cat = catalog.Catalog()
+        cat.add("ZZZZZZ " + lipsum, "foo")
+        tmpl = catalog.Catalog()
+        tmpl.add(lipsum + " ZZZZZZ")
+        cat.update(tmpl)
+        assert cat[lipsum + " ZZZZZZ"].fuzzy is True
+        assert len(cat.obsolete) == 0
+
     def test_update_without_fuzzy_matching(self):
         cat = catalog.Catalog()
         cat.add('fo', 'Voh')
author	Aarni Koskela <akx@iki.fi>	2023-02-20 15:49:35 +0200
committer	GitHub <noreply@github.com>	2023-02-20 15:49:35 +0200
commit	c76f1d4bc574c4885b7a1afa90da3ca5026153ec (patch)
tree	e1e47f90009b8e6c964bff95e3fc89bf74a60613
parent	08af5e2bab184c1b5d357ebde8c0efdbe6288e2c (diff)
parent	c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 (diff)
download	babel-c76f1d4bc574c4885b7a1afa90da3ca5026153ec.tar.gz