diff options
author | Jean Abou Samra <jean@abou-samra.fr> | 2023-02-12 02:49:28 +0100 |
---|---|---|
committer | Jean Abou Samra <jean@abou-samra.fr> | 2023-02-12 02:54:38 +0100 |
commit | c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 (patch) | |
tree | e1e47f90009b8e6c964bff95e3fc89bf74a60613 /babel | |
parent | 63bb71a5acd0d49a2ceee15098485bc34b0e8864 (diff) | |
download | babel-c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30.tar.gz |
Turn off difflib "autojunk" heuristic in fuzzy matching
difflib has a heuristic ("autojunk") that used to make fuzzy matching
unreliable for strings longer than 200 characters. See
https://github.com/python/cpython/issues/90825
Fixes #969
Diffstat (limited to 'babel')
-rw-r--r-- | babel/messages/catalog.py | 28 |
1 file changed, 27 insertions, 1 deletion
diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py
index 1902643..a500e77 100644
--- a/babel/messages/catalog.py
+++ b/babel/messages/catalog.py
@@ -14,8 +14,9 @@ import re
 from collections import OrderedDict
 from collections.abc import Iterable, Iterator
 from copy import copy
-from difflib import get_close_matches
+from difflib import SequenceMatcher
 from email import message_from_string
+from heapq import nlargest
 from typing import TYPE_CHECKING
 
 from babel import __version__ as VERSION
@@ -31,6 +32,31 @@ if TYPE_CHECKING:
 
 __all__ = ['Message', 'Catalog', 'TranslationError']
 
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """A modified version of ``difflib.get_close_matches``.
+
+    It just passes ``autojunk=False`` to the ``SequenceMatcher``, to work
+    around https://github.com/python/cpython/issues/90825.
+    """
+    if not n > 0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+    result = []
+    s = SequenceMatcher(autojunk=False)  # only line changed from difflib.py
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and \
+           s.quick_ratio() >= cutoff and \
+           s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+
+    # Move the best scorers to head of list
+    result = nlargest(n, result)
+    # Strip scores for the best n matches
+    return [x for score, x in result]
+
 
 PYTHON_FORMAT = re.compile(r'''
     \%