Turn off difflib "autojunk" heuristic in fuzzy matching

difflib has an "autojunk" heuristic that used to make fuzzy matching unreliable for strings longer than 200 characters. See https://github.com/python/cpython/issues/90825 for details. Fixes #969.
author: Jean Abou Samra <jean@abou-samra.fr> 2023-02-12 02:49:28 +0100
committer: Jean Abou Samra <jean@abou-samra.fr> 2023-02-12 02:54:38 +0100
commit: c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 (patch)
tree: e1e47f90009b8e6c964bff95e3fc89bf74a60613 /babel
parent: 63bb71a5acd0d49a2ceee15098485bc34b0e8864 (diff)
download: babel-c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30.tar.gz
1 files changed, 27 insertions, 1 deletions
diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py
index 1902643..a500e77 100644
--- a/babel/messages/catalog.py
+++ b/babel/messages/catalog.py
@@ -14,8 +14,9 @@ import re
 from collections import OrderedDict
 from collections.abc import Iterable, Iterator
 from copy import copy
-from difflib import get_close_matches
+from difflib import SequenceMatcher
 from email import message_from_string
+from heapq import nlargest
 from typing import TYPE_CHECKING
 
 from babel import __version__ as VERSION
@@ -31,6 +32,31 @@ if TYPE_CHECKING:
 
 __all__ = ['Message', 'Catalog', 'TranslationError']
 
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """A modified version of ``difflib.get_close_matches``.
+
+    It just passes ``autojunk=False`` to the ``SequenceMatcher``, to work
+    around https://github.com/python/cpython/issues/90825.
+    """
+    if not n >  0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+    result = []
+    s = SequenceMatcher(autojunk=False) # only line changed from difflib.py
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and \
+           s.quick_ratio() >= cutoff and \
+           s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+
+    # Move the best scorers to head of list
+    result = nlargest(n, result)
+    # Strip scores for the best n matches
+    return [x for score, x in result]
+
 
 PYTHON_FORMAT = re.compile(r'''
     \%
author	Jean Abou Samra <jean@abou-samra.fr>	2023-02-12 02:49:28 +0100
committer	Jean Abou Samra <jean@abou-samra.fr>	2023-02-12 02:54:38 +0100
commit	c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 (patch)
tree	e1e47f90009b8e6c964bff95e3fc89bf74a60613 /babel
parent	63bb71a5acd0d49a2ceee15098485bc34b0e8864 (diff)
download	babel-c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30.tar.gz