authorJarrod Millman <jarrod.millman@gmail.com>2021-05-30 18:08:42 -0700
committerGitHub <noreply@github.com>2021-05-30 18:08:42 -0700
commit7afb9006f3650e0746b41d44863b42fe28a224ab (patch)
tree71a6315e4d0fefbeb8b348eb0a521eb35c18d726
parentd989f687e2f83e48fb4906836c3eb5c3e00e7d78 (diff)
downloadnetworkx-7afb9006f3650e0746b41d44863b42fe28a224ab.tar.gz
Default to NumPy for simrank_similarity (#4841)
* Default to NumPy for simrank_similarity
* Fix typo: targete -> target
* Pass arg G, which was left out of a call to simrank_similarity
* Fix
* More
* Add pre/post processing of nodes in simrank_similarity
  - Added test for noninteger nodes
  - Added and updated examples in docstrings
  - Cleaned up some tests
* Parametrize tests so the new _python version is tested
* Move similarity_numpy code to a private function so deprecating requires very little code change
* Implement changes from the discussion:
  - Move warning so it doesn't show unless the _numpy() version is called
  - Add test of the difference between the python and numpy versions
  - Add exception when the methods don't converge
  - Add tests of the convergence exception
  - Remove the _is_close function
* Increase max_iterations and error handling, including for isolated nodes
* black

Co-authored-by: Dan Schult <dschult@colgate.edu>
-rw-r--r--doc/developer/deprecations.rst1
-rw-r--r--doc/release/release_dev.rst4
-rw-r--r--networkx/algorithms/similarity.py189
-rw-r--r--networkx/algorithms/tests/test_similarity.py122
4 files changed, 236 insertions, 80 deletions
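A minimal usage sketch of the behavior this commit introduces (assuming NetworkX 2.6+ with NumPy installed): the public entry point is unchanged and still returns plain dicts, but the computation is now delegated to the NumPy implementation by default.

import networkx as nx

G = nx.cycle_graph(4)

# Same public API as before; computed internally with NumPy and
# converted back to nested dicts keyed by node.
sim = nx.simrank_similarity(G)

# source/target still restrict the shape of the result.
row = nx.simrank_similarity(G, source=0)            # dict: node -> similarity to node 0
val = nx.simrank_similarity(G, source=0, target=2)  # single float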
diff --git a/doc/developer/deprecations.rst b/doc/developer/deprecations.rst
index 56d8978d..1749ab41 100644
--- a/doc/developer/deprecations.rst
+++ b/doc/developer/deprecations.rst
@@ -87,3 +87,4 @@ Version 3.0
* In ``algorithms/community/quality.py`` remove ``coverage`` and ``performance``.
* Remove ``testing``.
* In ``linalg/graphmatrix.py`` remove ``adj_matrix``.
+* In ``algorithms/similarity.py`` replace ``simrank_similarity`` with ``simrank_similarity_numpy``.
diff --git a/doc/release/release_dev.rst b/doc/release/release_dev.rst
index 14a1ef4d..6ebb3e1c 100644
--- a/doc/release/release_dev.rst
+++ b/doc/release/release_dev.rst
@@ -78,6 +78,8 @@ Improvements
Adds ``forest_str`` for string representation of trees.
- [`#4319 <https://github.com/networkx/networkx/pull/4319>`_]
pagerank uses scipy by default now.
+- [`#4841 <https://github.com/networkx/networkx/pull/4841>`_]
+ simrank_similarity uses numpy by default now.
- [`#4317 <https://github.com/networkx/networkx/pull/4317>`_]
New ``source`` argument to ``has_eulerian_path`` to look for path starting at
source.
@@ -239,6 +241,8 @@ Deprecations
Deprecate ``assert_nodes_equal``, ``assert_edges_equal``, and ``assert_graphs_equal``.
- [`#4850 <https://github.com/networkx/networkx/pull/4850>`_]
Deprecate ``adj_matrix``.
+- [`#4841 <https://github.com/networkx/networkx/pull/4841>`_]
+ Deprecate ``simrank_similarity_numpy``.
Contributors
------------
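For code that calls the now-deprecated simrank_similarity_numpy, a rough migration sketch (it mirrors the updated docstring example in the diff below and assumes the row/column order of list(G)):

import numpy as np
import networkx as nx

G = nx.cycle_graph(4)

# Before (deprecated in 2.6, removal planned for 3.0): an ndarray was returned.
# sim_array = nx.simrank_similarity_numpy(G)

# After: build the same matrix from the dict returned by simrank_similarity,
# using the graph's node order for rows and columns.
sim = nx.simrank_similarity(G)
sim_array = np.array([[sim[u][v] for v in G] for u in G])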
diff --git a/networkx/algorithms/similarity.py b/networkx/algorithms/similarity.py
index 6369012d..a4c2e961 100644
--- a/networkx/algorithms/similarity.py
+++ b/networkx/algorithms/similarity.py
@@ -1196,47 +1196,12 @@ def optimize_edit_paths(
yield list(vertex_path), list(edge_path), cost
-def _is_close(d1, d2, atolerance=0, rtolerance=0):
- """Determines whether two adjacency matrices are within
- a provided tolerance.
-
- Parameters
- ----------
- d1 : dict
- Adjacency dictionary
-
- d2 : dict
- Adjacency dictionary
-
- atolerance : float
- Some scalar tolerance value to determine closeness
-
- rtolerance : float
- A scalar tolerance value that will be some proportion
- of ``d2``'s value
-
- Returns
- -------
- closeness : bool
- If all of the nodes within ``d1`` and ``d2`` are within
- a predefined tolerance, they are considered "close" and
- this method will return True. Otherwise, this method will
- return False.
-
- """
- # Pre-condition: d1 and d2 have the same keys at each level if they
- # are dictionaries.
- if not isinstance(d1, dict) and not isinstance(d2, dict):
- return abs(d1 - d2) <= atolerance + rtolerance * abs(d2)
- return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1)
-
-
def simrank_similarity(
G,
source=None,
target=None,
importance_factor=0.9,
- max_iterations=100,
+ max_iterations=1000,
tolerance=1e-4,
):
"""Returns the SimRank similarity of nodes in the graph ``G``.
@@ -1304,16 +1269,27 @@ def simrank_similarity(
Examples
--------
- If the nodes of the graph are numbered from zero to *n - 1*, where *n*
- is the number of nodes in the graph, you can create a SimRank matrix
- from the return value of this function where the node numbers are
- the row and column indices of the matrix:
+ >>> G = nx.cycle_graph(2)
+ >>> nx.simrank_similarity(G)
+ {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
+ >>> nx.simrank_similarity(G, source=0)
+ {0: 1.0, 1: 0.0}
+ >>> nx.simrank_similarity(G, source=0, target=0)
+ 1.0
+
+ The result of this function can be converted to a numpy array
+ representing the SimRank matrix by using the node order of the
+ graph to determine which row and column represent each node.
+ Other orderings of the nodes are also possible.
>>> import numpy as np
- >>> G = nx.cycle_graph(4)
>>> sim = nx.simrank_similarity(G)
- >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
- >>> sim_array = np.array(lol)
+ >>> np.array([[sim[u][v] for v in G] for u in G])
+ array([[1., 0.],
+ [0., 1.]])
+ >>> sim_1d = nx.simrank_similarity(G, source=0)
+ >>> np.array([sim_1d[v] for v in G])
+ array([1., 0.])
References
----------
@@ -1324,8 +1300,46 @@ def simrank_similarity(
International Conference on Knowledge Discovery and Data Mining,
pp. 538--543. ACM Press, 2002.
"""
- prevsim = None
+ import numpy as np
+
+ nodelist = list(G)
+ s_indx = None if source is None else nodelist.index(source)
+ t_indx = None if target is None else nodelist.index(target)
+
+ x = _simrank_similarity_numpy(
+ G, s_indx, t_indx, importance_factor, max_iterations, tolerance
+ )
+
+ if isinstance(x, np.ndarray):
+ if x.ndim == 1:
+ return {node: val for node, val in zip(G, x)}
+ else: # x.ndim == 2:
+ return {u: dict(zip(G, row)) for u, row in zip(G, x)}
+ return x
+
+
+def _simrank_similarity_python(
+ G,
+ source=None,
+ target=None,
+ importance_factor=0.9,
+ max_iterations=1000,
+ tolerance=1e-4,
+):
+ """Returns the SimRank similarity of nodes in the graph ``G``.
+
+ This pure Python version is provided for pedagogical purposes.
+ Examples
+ --------
+ >>> G = nx.cycle_graph(2)
+ >>> nx.similarity._simrank_similarity_python(G)
+ {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
+ >>> nx.similarity._simrank_similarity_python(G, source=0)
+ {0: 1, 1: 0.0}
+ >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
+ 1
+ """
# build up our similarity adjacency dictionary output
newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}
@@ -1334,17 +1348,28 @@ def simrank_similarity(
def avg_sim(s):
return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0
+ Gadj = G.pred if G.is_directed() else G.adj
+
def sim(u, v):
- Gadj = G.pred if G.is_directed() else G.adj
return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))
- for _ in range(max_iterations):
- if prevsim and _is_close(prevsim, newsim, tolerance):
+ for its in range(max_iterations):
+ oldsim = newsim
+ newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
+ is_close = all(
+ all(
+ abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
+ for v, old in nbrs.items()
+ )
+ for u, nbrs in oldsim.items()
+ )
+ if is_close:
break
- prevsim = newsim
- newsim = {
- u: {v: sim(u, v) if u is not v else 1 for v in newsim[u]} for u in newsim
- }
+
+ if its + 1 == max_iterations:
+ raise nx.ExceededMaxIterations(
+ f"simrank did not converge after {max_iterations} iterations."
+ )
if source is not None and target is not None:
return newsim[source][target]
@@ -1353,12 +1378,12 @@ def simrank_similarity(
return newsim
-def simrank_similarity_numpy(
+def _simrank_similarity_numpy(
G,
source=None,
target=None,
importance_factor=0.9,
- max_iterations=100,
+ max_iterations=1000,
tolerance=1e-4,
):
"""Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
@@ -1409,8 +1434,14 @@ def simrank_similarity_numpy(
Examples
--------
- >>> G = nx.cycle_graph(4)
- >>> sim = nx.simrank_similarity_numpy(G)
+ >>> G = nx.cycle_graph(2)
+ >>> nx.similarity._simrank_similarity_numpy(G)
+ array([[1., 0.],
+ [0., 1.]])
+ >>> nx.similarity._simrank_similarity_numpy(G, source=0)
+ array([1., 0.])
+ >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
+ 1.0
References
----------
@@ -1431,17 +1462,24 @@ def simrank_similarity_numpy(
adjacency_matrix = nx.to_numpy_array(G)
# column-normalize the ``adjacency_matrix``
- adjacency_matrix /= adjacency_matrix.sum(axis=0)
+ s = np.array(adjacency_matrix.sum(axis=0))
+ s[s == 0] = 1
+ adjacency_matrix /= s # adjacency_matrix.sum(axis=0)
- newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64)
- for _ in range(max_iterations):
- prevsim = np.copy(newsim)
+ newsim = np.eye(len(G), dtype=np.float64)
+ for its in range(max_iterations):
+ prevsim = newsim.copy()
newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
np.fill_diagonal(newsim, 1.0)
if np.allclose(prevsim, newsim, atol=tolerance):
break
+ if its + 1 == max_iterations:
+ raise nx.ExceededMaxIterations(
+ f"simrank did not converge after {max_iterations} iterations."
+ )
+
if source is not None and target is not None:
return newsim[source, target]
if source is not None:
@@ -1449,6 +1487,39 @@ def simrank_similarity_numpy(
return newsim
+def simrank_similarity_numpy(
+ G,
+ source=None,
+ target=None,
+ importance_factor=0.9,
+ max_iterations=100,
+ tolerance=1e-4,
+):
+ """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
+
+ .. deprecated:: 2.6
+ simrank_similarity_numpy is deprecated and will be removed in NetworkX 3.0.
+ Use simrank_similarity instead.
+
+ """
+ warnings.warn(
+ (
+ "networkx.simrank_similarity_numpy is deprecated and will be removed"
+ "in NetworkX 3.0, use networkx.simrank_similarity instead."
+ ),
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return _simrank_similarity_numpy(
+ G,
+ source,
+ target,
+ importance_factor,
+ max_iterations,
+ tolerance,
+ )
+
+
# TODO replace w/ math.comb(n, k) for Python 3.8+
def _n_choose_k(n, k):
"""Pure Python implementation of the binomial coefficient
diff --git a/networkx/algorithms/tests/test_similarity.py b/networkx/algorithms/tests/test_similarity.py
index 88022f5f..08992882 100644
--- a/networkx/algorithms/tests/test_similarity.py
+++ b/networkx/algorithms/tests/test_similarity.py
@@ -523,7 +523,14 @@ class TestSimilarity:
G2.add_edge("B", "D", label="bad")
assert graph_edit_distance(G1, G2, node_match=nmatch, edge_match=ematch) == 1
- def test_simrank_no_source_no_target(self):
+ # note: nx.simrank_similarity_numpy not included because it returns an np.array
+ simrank_algs = [
+ nx.simrank_similarity,
+ nx.similarity._simrank_similarity_python,
+ ]
+
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_no_source_no_target(self, simrank_similarity):
G = nx.cycle_graph(5)
expected = {
0: {
@@ -562,8 +569,9 @@ class TestSimilarity:
4: 1,
},
}
- actual = nx.simrank_similarity(G)
- assert expected == actual
+ actual = simrank_similarity(G)
+ for k, v in expected.items():
+ assert v == pytest.approx(actual[k], abs=1e-2)
# For a DiGraph test, use the first graph from the paper cited in
# the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
@@ -595,10 +603,12 @@ class TestSimilarity:
},
}
# Use the importance_factor from the paper to get the same numbers.
- actual = nx.algorithms.similarity.simrank_similarity(G, importance_factor=0.8)
- assert expected == actual
+ actual = simrank_similarity(G, importance_factor=0.8)
+ for k, v in expected.items():
+ assert v == pytest.approx(actual[k], abs=1e-2)
- def test_simrank_source_no_target(self):
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_source_no_target(self, simrank_similarity):
G = nx.cycle_graph(5)
expected = {
0: 1,
@@ -607,8 +617,8 @@ class TestSimilarity:
3: 0.5707317069281646,
4: 0.3951219505902449,
}
- actual = nx.simrank_similarity(G, source=0)
- assert expected == actual
+ actual = simrank_similarity(G, source=0)
+ assert expected == pytest.approx(actual, abs=1e-2)
# For a DiGraph test, use the first graph from the paper cited in
# the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
@@ -622,15 +632,52 @@ class TestSimilarity:
expected = {0: 1, 1: 0.0, 2: 0.1323363991265798, 3: 0.0, 4: 0.03387811817640443}
# Use the importance_factor from the paper to get the same numbers.
- actual = nx.algorithms.similarity.simrank_similarity(
- G, importance_factor=0.8, source=0
- )
- assert expected == actual
+ actual = simrank_similarity(G, importance_factor=0.8, source=0)
+ assert expected == pytest.approx(actual, abs=1e-2)
+
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_noninteger_nodes(self, simrank_similarity):
+ G = nx.cycle_graph(5)
+ G = nx.relabel_nodes(G, dict(enumerate("abcde")))
+ expected = {
+ "a": 1,
+ "b": 0.3951219505902448,
+ "c": 0.5707317069281646,
+ "d": 0.5707317069281646,
+ "e": 0.3951219505902449,
+ }
+ actual = simrank_similarity(G, source="a")
+ assert expected == pytest.approx(actual, abs=1e-2)
+
+ # For a DiGraph test, use the first graph from the paper cited in
+ # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
+ G = nx.DiGraph()
+ G.add_node(0, label="Univ")
+ G.add_node(1, label="ProfA")
+ G.add_node(2, label="ProfB")
+ G.add_node(3, label="StudentA")
+ G.add_node(4, label="StudentB")
+ G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 4), (4, 2), (3, 0)])
+ node_labels = dict(enumerate(nx.get_node_attributes(G, "label").values()))
+ G = nx.relabel_nodes(G, node_labels)
+
+ expected = {
+ "Univ": 1,
+ "ProfA": 0.0,
+ "ProfB": 0.1323363991265798,
+ "StudentA": 0.0,
+ "StudentB": 0.03387811817640443,
+ }
+ # Use the importance_factor from the paper to get the same numbers.
+ actual = simrank_similarity(G, importance_factor=0.8, source="Univ")
+ assert expected == pytest.approx(actual, abs=1e-2)
- def test_simrank_source_and_target(self):
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_source_and_target(self, simrank_similarity):
G = nx.cycle_graph(5)
expected = 1
- actual = nx.simrank_similarity(G, source=0, target=0)
+ actual = simrank_similarity(G, source=0, target=0)
+ assert expected == pytest.approx(actual, abs=1e-2)
# For a DiGraph test, use the first graph from the paper cited in
# the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
@@ -645,10 +692,43 @@ class TestSimilarity:
expected = 0.1323363991265798
# Use the importance_factor from the paper to get the same numbers.
# Use the pair (0,2) because (0,0) and (0,1) have trivial results.
- actual = nx.algorithms.similarity.simrank_similarity(
- G, importance_factor=0.8, source=0, target=2
- )
- assert expected == actual
+ actual = simrank_similarity(G, importance_factor=0.8, source=0, target=2)
+ assert expected == pytest.approx(actual, abs=1e-5)
+
+ @pytest.mark.parametrize("alg", simrank_algs)
+ def test_simrank_max_iterations(self, alg):
+ G = nx.cycle_graph(5)
+ pytest.raises(nx.ExceededMaxIterations, alg, G, max_iterations=10)
+
+ def test_simrank_between_versions(self):
+ G = nx.cycle_graph(5)
+ # _python tolerance 1e-4
+ expected_python_tol4 = {
+ 0: 1,
+ 1: 0.394512499239852,
+ 2: 0.5703550452791322,
+ 3: 0.5703550452791323,
+ 4: 0.394512499239852,
+ }
+ # _numpy tolerance 1e-4
+ expected_numpy_tol4 = {
+ 0: 1.0,
+ 1: 0.3947180735764555,
+ 2: 0.570482097206368,
+ 3: 0.570482097206368,
+ 4: 0.3947180735764555,
+ }
+ actual = nx.simrank_similarity(G, source=0)
+ assert expected_numpy_tol4 == pytest.approx(actual, abs=1e-7)
+ # versions differ at 1e-4 level but equal at 1e-3
+ assert expected_python_tol4 != pytest.approx(actual, abs=1e-4)
+ assert expected_python_tol4 == pytest.approx(actual, abs=1e-3)
+
+ actual = nx.similarity._simrank_similarity_python(G, source=0)
+ assert expected_python_tol4 == pytest.approx(actual, abs=1e-7)
+ # versions differ at 1e-4 level but equal at 1e-3
+ assert expected_numpy_tol4 != pytest.approx(actual, abs=1e-4)
+ assert expected_numpy_tol4 == pytest.approx(actual, abs=1e-3)
def test_simrank_numpy_no_source_no_target(self):
G = nx.cycle_graph(5)
@@ -691,7 +771,7 @@ class TestSimilarity:
],
]
)
- actual = nx.simrank_similarity_numpy(G)
+ actual = nx.similarity._simrank_similarity_numpy(G)
np.testing.assert_allclose(expected, actual, atol=1e-7)
def test_simrank_numpy_source_no_target(self):
@@ -705,13 +785,13 @@ class TestSimilarity:
0.3947180735764555,
]
)
- actual = nx.simrank_similarity_numpy(G, source=0)
+ actual = nx.similarity._simrank_similarity_numpy(G, source=0)
np.testing.assert_allclose(expected, actual, atol=1e-7)
def test_simrank_numpy_source_and_target(self):
G = nx.cycle_graph(5)
expected = 1.0
- actual = nx.simrank_similarity_numpy(G, source=0, target=0)
+ actual = nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
np.testing.assert_allclose(expected, actual, atol=1e-7)
def test_n_choose_k_small_k(self):
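Finally, a small example of the failure mode the new max_iterations tests exercise: with too few iterations both implementations now raise nx.ExceededMaxIterations instead of silently returning a half-converged result.

import networkx as nx

G = nx.cycle_graph(5)
try:
    nx.simrank_similarity(G, max_iterations=10)
except nx.ExceededMaxIterations as err:
    print(err)  # e.g. "simrank did not converge after 10 iterations."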