diff options
author | Aarni Koskela <akx@iki.fi> | 2023-02-20 15:49:35 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-02-20 15:49:35 +0200 |
commit | c76f1d4bc574c4885b7a1afa90da3ca5026153ec (patch) | |
tree | e1e47f90009b8e6c964bff95e3fc89bf74a60613 | |
parent | 08af5e2bab184c1b5d357ebde8c0efdbe6288e2c (diff) | |
parent | c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 (diff) | |
download | babel-c76f1d4bc574c4885b7a1afa90da3ca5026153ec.tar.gz |
Merge pull request #970 from jeanas/autojunk
Fix two issues with fuzzy matching
-rw-r--r-- | babel/messages/catalog.py | 43 | ||||
-rw-r--r-- | tests/messages/test_catalog.py | 29 |
2 files changed, 60 insertions, 12 deletions
diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py
index dead4aa..a500e77 100644
--- a/babel/messages/catalog.py
+++ b/babel/messages/catalog.py
@@ -14,8 +14,9 @@ import re
 from collections import OrderedDict
 from collections.abc import Iterable, Iterator
 from copy import copy
-from difflib import get_close_matches
+from difflib import SequenceMatcher
 from email import message_from_string
+from heapq import nlargest
 from typing import TYPE_CHECKING
 
 from babel import __version__ as VERSION
@@ -31,6 +32,31 @@ if TYPE_CHECKING:
 
 __all__ = ['Message', 'Catalog', 'TranslationError']
 
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """A modified version of ``difflib.get_close_matches``.
+
+    It just passes ``autojunk=False`` to the ``SequenceMatcher``, to work
+    around https://github.com/python/cpython/issues/90825.
+    """
+    if not n > 0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+    result = []
+    s = SequenceMatcher(autojunk=False) # only line changed from difflib.py
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and \
+           s.quick_ratio() >= cutoff and \
+           s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+
+    # Move the best scorers to head of list
+    result = nlargest(n, result)
+    # Strip scores for the best n matches
+    return [x for score, x in result]
+
 
 PYTHON_FORMAT = re.compile(r'''
     \%
@@ -803,10 +829,13 @@ class Catalog:
         # Prepare for fuzzy matching
         fuzzy_candidates = []
         if not no_fuzzy_matching:
-            fuzzy_candidates = {
-                self._key_for(msgid): messages[msgid].context
-                for msgid in messages if msgid and messages[msgid].string
-            }
+            fuzzy_candidates = {}
+            for msgid in messages:
+                if msgid and messages[msgid].string:
+                    key = self._key_for(msgid)
+                    ctxt = messages[msgid].context
+                    modified_key = key.lower().strip()
+                    fuzzy_candidates[modified_key] = (key, ctxt)
         fuzzy_matches = set()
 
         def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, str] | str) -> None:
@@ -861,8 +890,8 @@ class Catalog:
                         matches = get_close_matches(matchkey.lower().strip(),
                                                     fuzzy_candidates.keys(), 1)
                         if matches:
-                            newkey = matches[0]
-                            newctxt = fuzzy_candidates[newkey]
+                            modified_key = matches[0]
+                            newkey, newctxt = fuzzy_candidates[modified_key]
                             if newctxt is not None:
                                 newkey = newkey, newctxt
                             _merge(message, newkey, key)
diff --git a/tests/messages/test_catalog.py b/tests/messages/test_catalog.py
index 273c83f..b9d72bc 100644
--- a/tests/messages/test_catalog.py
+++ b/tests/messages/test_catalog.py
@@ -121,16 +121,16 @@ class CatalogTestCase(unittest.TestCase):
 
     def test_update_fuzzy_matching_with_case_change(self):
         cat = catalog.Catalog()
-        cat.add('foo', 'Voh')
+        cat.add('FOO', 'Voh')
         cat.add('bar', 'Bahr')
         tmpl = catalog.Catalog()
-        tmpl.add('Foo')
+        tmpl.add('foo')
         cat.update(tmpl)
         assert len(cat.obsolete) == 1
-        assert 'foo' not in cat
+        assert 'FOO' not in cat
 
-        assert cat['Foo'].string == 'Voh'
-        assert cat['Foo'].fuzzy is True
+        assert cat['foo'].string == 'Voh'
+        assert cat['foo'].fuzzy is True
 
     def test_update_fuzzy_matching_with_char_change(self):
         cat = catalog.Catalog()
@@ -209,6 +209,25 @@ class CatalogTestCase(unittest.TestCase):
         assert cat['fooo'].string == 'Vohe'
         assert cat['fooo'].fuzzy is True
 
+    def test_update_fuzzy_matching_long_string(self):
+        lipsum = "\
+Lorem Ipsum is simply dummy text of the printing and typesetting \
+industry. Lorem Ipsum has been the industry's standard dummy text ever \
+since the 1500s, when an unknown printer took a galley of type and \
+scrambled it to make a type specimen book. It has survived not only \
+five centuries, but also the leap into electronic typesetting, \
+remaining essentially unchanged. It was popularised in the 1960s with \
+the release of Letraset sheets containing Lorem Ipsum passages, and \
+more recently with desktop publishing software like Aldus PageMaker \
+including versions of Lorem Ipsum."
+        cat = catalog.Catalog()
+        cat.add("ZZZZZZ " + lipsum, "foo")
+        tmpl = catalog.Catalog()
+        tmpl.add(lipsum + " ZZZZZZ")
+        cat.update(tmpl)
+        assert cat[lipsum + " ZZZZZZ"].fuzzy is True
+        assert len(cat.obsolete) == 0
+
     def test_update_without_fuzzy_matching(self):
         cat = catalog.Catalog()
         cat.add('fo', 'Voh') |