From 7afb9006f3650e0746b41d44863b42fe28a224ab Mon Sep 17 00:00:00 2001 From: Jarrod Millman Date: Sun, 30 May 2021 18:08:42 -0700 Subject: Default to NumPy for simrank_similarity (#4841) * Default to NumPy for simrank_similarity * fix typo: targete -> target * arg G left out of call to simrank_similarity * Fix * More * Add pre/post processing of nodes in simrank_similarity Added test for noninteger nodes Added and updted examples in docstrings cleaned up some tests * parametrize tests so new _python version tested * Move similarity_numpy code to private function so deprecating requires very little code change. * implement changes from the discussion Move warning so doesn't show unless _numpy() version called Add test of difference between python and numpy version Add exception when methods don't converge Add tests of convergence exception Remove _is_close function * increase maxiterations and error handling including for isolated nodes * black Co-authored-by: Dan Schult --- doc/developer/deprecations.rst | 1 + doc/release/release_dev.rst | 4 + networkx/algorithms/similarity.py | 189 ++++++++++++++++++--------- networkx/algorithms/tests/test_similarity.py | 122 ++++++++++++++--- 4 files changed, 236 insertions(+), 80 deletions(-) diff --git a/doc/developer/deprecations.rst b/doc/developer/deprecations.rst index 56d8978d..1749ab41 100644 --- a/doc/developer/deprecations.rst +++ b/doc/developer/deprecations.rst @@ -87,3 +87,4 @@ Version 3.0 * In ``algorithms/community/quality.py`` remove ``coverage`` and ``performance``. * Remove ``testing``. * In ``linalg/graphmatrix.py`` remove ``adj_matrix``. +* In ``algorithms/similarity.py`` replace ``simrank_similarity`` with ``simrank_similarity_numpy``. diff --git a/doc/release/release_dev.rst b/doc/release/release_dev.rst index 14a1ef4d..6ebb3e1c 100644 --- a/doc/release/release_dev.rst +++ b/doc/release/release_dev.rst @@ -78,6 +78,8 @@ Improvements Adds ``forest_str`` for string representation of trees. - [`#4319 `_] pagerank uses scipy by default now. +- [`#4841 `_] + simrank_similarity uses numpy by default now. - [`#4317 `_] New ``source`` argument to ``has_eulerian_path`` to look for path starting at source. @@ -239,6 +241,8 @@ Deprecations Deprecate ``assert_nodes_equal``, ``assert_edges_equal``, and ``assert_graphs_equal``. - [`#4850 `_] Deprecate ``adj_matrix``. +- [`#4841 `_] + Deprecate ``simrank_similarity_numpy``. Contributors ------------ diff --git a/networkx/algorithms/similarity.py b/networkx/algorithms/similarity.py index 6369012d..a4c2e961 100644 --- a/networkx/algorithms/similarity.py +++ b/networkx/algorithms/similarity.py @@ -1196,47 +1196,12 @@ def optimize_edit_paths( yield list(vertex_path), list(edge_path), cost -def _is_close(d1, d2, atolerance=0, rtolerance=0): - """Determines whether two adjacency matrices are within - a provided tolerance. - - Parameters - ---------- - d1 : dict - Adjacency dictionary - - d2 : dict - Adjacency dictionary - - atolerance : float - Some scalar tolerance value to determine closeness - - rtolerance : float - A scalar tolerance value that will be some proportion - of ``d2``'s value - - Returns - ------- - closeness : bool - If all of the nodes within ``d1`` and ``d2`` are within - a predefined tolerance, they are considered "close" and - this method will return True. Otherwise, this method will - return False. - - """ - # Pre-condition: d1 and d2 have the same keys at each level if they - # are dictionaries. - if not isinstance(d1, dict) and not isinstance(d2, dict): - return abs(d1 - d2) <= atolerance + rtolerance * abs(d2) - return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1) - - def simrank_similarity( G, source=None, target=None, importance_factor=0.9, - max_iterations=100, + max_iterations=1000, tolerance=1e-4, ): """Returns the SimRank similarity of nodes in the graph ``G``. @@ -1304,16 +1269,27 @@ def simrank_similarity( Examples -------- - If the nodes of the graph are numbered from zero to *n - 1*, where *n* - is the number of nodes in the graph, you can create a SimRank matrix - from the return value of this function where the node numbers are - the row and column indices of the matrix: + >>> G = nx.cycle_graph(2) + >>> nx.simrank_similarity(G) + {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}} + >>> nx.simrank_similarity(G, source=0) + {0: 1.0, 1: 0.0} + >>> nx.simrank_similarity(G, source=0, target=0) + 1.0 + + The result of this function can be converted to a numpy array + representing the SimRank matrix by using the node order of the + graph to determine which row and column represent each node. + Other ordering of nodes is also possible. >>> import numpy as np - >>> G = nx.cycle_graph(4) >>> sim = nx.simrank_similarity(G) - >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)] - >>> sim_array = np.array(lol) + >>> np.array([[sim[u][v] for v in G] for u in G]) + array([[1., 0.], + [0., 1.]]) + >>> sim_1d = nx.simrank_similarity(G, source=0) + >>> np.array([sim[0][v] for v in G]) + array([1., 0.]) References ---------- @@ -1324,8 +1300,46 @@ def simrank_similarity( International Conference on Knowledge Discovery and Data Mining, pp. 538--543. ACM Press, 2002. """ - prevsim = None + import numpy as np + + nodelist = list(G) + s_indx = None if source is None else nodelist.index(source) + t_indx = None if target is None else nodelist.index(target) + + x = _simrank_similarity_numpy( + G, s_indx, t_indx, importance_factor, max_iterations, tolerance + ) + + if isinstance(x, np.ndarray): + if x.ndim == 1: + return {node: val for node, val in zip(G, x)} + else: # x.ndim == 2: + return {u: dict(zip(G, row)) for u, row in zip(G, x)} + return x + + +def _simrank_similarity_python( + G, + source=None, + target=None, + importance_factor=0.9, + max_iterations=1000, + tolerance=1e-4, +): + """Returns the SimRank similarity of nodes in the graph ``G``. + + This pure Python version is provided for pedagogical purposes. + Examples + -------- + >>> G = nx.cycle_graph(2) + >>> nx.similarity._simrank_similarity_python(G) + {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}} + >>> nx.similarity._simrank_similarity_python(G, source=0) + {0: 1, 1: 0.0} + >>> nx.similarity._simrank_similarity_python(G, source=0, target=0) + 1 + """ # build up our similarity adjacency dictionary output newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G} @@ -1334,17 +1348,28 @@ def simrank_similarity( def avg_sim(s): return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0 + Gadj = G.pred if G.is_directed() else G.adj + def sim(u, v): - Gadj = G.pred if G.is_directed() else G.adj return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v]))) - for _ in range(max_iterations): - if prevsim and _is_close(prevsim, newsim, tolerance): + for its in range(max_iterations): + oldsim = newsim + newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G} + is_close = all( + all( + abs(newsim[u][v] - old) <= tolerance * (1 + abs(old)) + for v, old in nbrs.items() + ) + for u, nbrs in oldsim.items() + ) + if is_close: break - prevsim = newsim - newsim = { - u: {v: sim(u, v) if u is not v else 1 for v in newsim[u]} for u in newsim - } + + if its + 1 == max_iterations: + raise nx.ExceededMaxIterations( + f"simrank did not converge after {max_iterations} iterations." + ) if source is not None and target is not None: return newsim[source][target] @@ -1353,12 +1378,12 @@ def simrank_similarity( return newsim -def simrank_similarity_numpy( +def _simrank_similarity_numpy( G, source=None, target=None, importance_factor=0.9, - max_iterations=100, + max_iterations=1000, tolerance=1e-4, ): """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. @@ -1409,8 +1434,14 @@ def simrank_similarity_numpy( Examples -------- - >>> G = nx.cycle_graph(4) - >>> sim = nx.simrank_similarity_numpy(G) + >>> G = nx.cycle_graph(2) + >>> nx.similarity._simrank_similarity_numpy(G) + array([[1., 0.], + [0., 1.]]) + >>> nx.similarity._simrank_similarity_numpy(G, source=0) + array([1., 0.]) + >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0) + 1.0 References ---------- @@ -1431,17 +1462,24 @@ def simrank_similarity_numpy( adjacency_matrix = nx.to_numpy_array(G) # column-normalize the ``adjacency_matrix`` - adjacency_matrix /= adjacency_matrix.sum(axis=0) + s = np.array(adjacency_matrix.sum(axis=0)) + s[s == 0] = 1 + adjacency_matrix /= s # adjacency_matrix.sum(axis=0) - newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64) - for _ in range(max_iterations): - prevsim = np.copy(newsim) + newsim = np.eye(len(G), dtype=np.float64) + for its in range(max_iterations): + prevsim = newsim.copy() newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix) np.fill_diagonal(newsim, 1.0) if np.allclose(prevsim, newsim, atol=tolerance): break + if its + 1 == max_iterations: + raise nx.ExceededMaxIterations( + f"simrank did not converge after {max_iterations} iterations." + ) + if source is not None and target is not None: return newsim[source, target] if source is not None: @@ -1449,6 +1487,39 @@ def simrank_similarity_numpy( return newsim +def simrank_similarity_numpy( + G, + source=None, + target=None, + importance_factor=0.9, + max_iterations=100, + tolerance=1e-4, +): + """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. + + .. deprecated:: 2.6 + simrank_similarity_numpy is deprecated and will be removed in networkx 3.0. + Use simrank_similarity + + """ + warnings.warn( + ( + "networkx.simrank_similarity_numpy is deprecated and will be removed" + "in NetworkX 3.0, use networkx.simrank_similarity instead." + ), + DeprecationWarning, + stacklevel=2, + ) + return _simrank_similarity_numpy( + G, + source, + target, + importance_factor, + max_iterations, + tolerance, + ) + + # TODO replace w/ math.comb(n, k) for Python 3.8+ def _n_choose_k(n, k): """Pure Python implementation of the binomial coefficient diff --git a/networkx/algorithms/tests/test_similarity.py b/networkx/algorithms/tests/test_similarity.py index 88022f5f..08992882 100644 --- a/networkx/algorithms/tests/test_similarity.py +++ b/networkx/algorithms/tests/test_similarity.py @@ -523,7 +523,14 @@ class TestSimilarity: G2.add_edge("B", "D", label="bad") assert graph_edit_distance(G1, G2, node_match=nmatch, edge_match=ematch) == 1 - def test_simrank_no_source_no_target(self): + # note: nx.simrank_similarity_numpy not included because returns np.array + simrank_algs = [ + nx.simrank_similarity, + nx.similarity._simrank_similarity_python, + ] + + @pytest.mark.parametrize("simrank_similarity", simrank_algs) + def test_simrank_no_source_no_target(self, simrank_similarity): G = nx.cycle_graph(5) expected = { 0: { @@ -562,8 +569,9 @@ class TestSimilarity: 4: 1, }, } - actual = nx.simrank_similarity(G) - assert expected == actual + actual = simrank_similarity(G) + for k, v in expected.items(): + assert v == pytest.approx(actual[k], abs=1e-2) # For a DiGraph test, use the first graph from the paper cited in # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126 @@ -595,10 +603,12 @@ class TestSimilarity: }, } # Use the importance_factor from the paper to get the same numbers. - actual = nx.algorithms.similarity.simrank_similarity(G, importance_factor=0.8) - assert expected == actual + actual = simrank_similarity(G, importance_factor=0.8) + for k, v in expected.items(): + assert v == pytest.approx(actual[k], abs=1e-2) - def test_simrank_source_no_target(self): + @pytest.mark.parametrize("simrank_similarity", simrank_algs) + def test_simrank_source_no_target(self, simrank_similarity): G = nx.cycle_graph(5) expected = { 0: 1, @@ -607,8 +617,8 @@ class TestSimilarity: 3: 0.5707317069281646, 4: 0.3951219505902449, } - actual = nx.simrank_similarity(G, source=0) - assert expected == actual + actual = simrank_similarity(G, source=0) + assert expected == pytest.approx(actual, abs=1e-2) # For a DiGraph test, use the first graph from the paper cited in # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126 @@ -622,15 +632,52 @@ class TestSimilarity: expected = {0: 1, 1: 0.0, 2: 0.1323363991265798, 3: 0.0, 4: 0.03387811817640443} # Use the importance_factor from the paper to get the same numbers. - actual = nx.algorithms.similarity.simrank_similarity( - G, importance_factor=0.8, source=0 - ) - assert expected == actual + actual = simrank_similarity(G, importance_factor=0.8, source=0) + assert expected == pytest.approx(actual, abs=1e-2) + + @pytest.mark.parametrize("simrank_similarity", simrank_algs) + def test_simrank_noninteger_nodes(self, simrank_similarity): + G = nx.cycle_graph(5) + G = nx.relabel_nodes(G, dict(enumerate("abcde"))) + expected = { + "a": 1, + "b": 0.3951219505902448, + "c": 0.5707317069281646, + "d": 0.5707317069281646, + "e": 0.3951219505902449, + } + actual = simrank_similarity(G, source="a") + assert expected == pytest.approx(actual, abs=1e-2) + + # For a DiGraph test, use the first graph from the paper cited in + # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126 + G = nx.DiGraph() + G.add_node(0, label="Univ") + G.add_node(1, label="ProfA") + G.add_node(2, label="ProfB") + G.add_node(3, label="StudentA") + G.add_node(4, label="StudentB") + G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 4), (4, 2), (3, 0)]) + node_labels = dict(enumerate(nx.get_node_attributes(G, "label").values())) + G = nx.relabel_nodes(G, node_labels) + + expected = { + "Univ": 1, + "ProfA": 0.0, + "ProfB": 0.1323363991265798, + "StudentA": 0.0, + "StudentB": 0.03387811817640443, + } + # Use the importance_factor from the paper to get the same numbers. + actual = simrank_similarity(G, importance_factor=0.8, source="Univ") + assert expected == pytest.approx(actual, abs=1e-2) - def test_simrank_source_and_target(self): + @pytest.mark.parametrize("simrank_similarity", simrank_algs) + def test_simrank_source_and_target(self, simrank_similarity): G = nx.cycle_graph(5) expected = 1 - actual = nx.simrank_similarity(G, source=0, target=0) + actual = simrank_similarity(G, source=0, target=0) + assert expected == pytest.approx(actual, abs=1e-2) # For a DiGraph test, use the first graph from the paper cited in # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126 @@ -645,10 +692,43 @@ class TestSimilarity: expected = 0.1323363991265798 # Use the importance_factor from the paper to get the same numbers. # Use the pair (0,2) because (0,0) and (0,1) have trivial results. - actual = nx.algorithms.similarity.simrank_similarity( - G, importance_factor=0.8, source=0, target=2 - ) - assert expected == actual + actual = simrank_similarity(G, importance_factor=0.8, source=0, target=2) + assert expected == pytest.approx(actual, abs=1e-5) + + @pytest.mark.parametrize("alg", simrank_algs) + def test_simrank_max_iterations(self, alg): + G = nx.cycle_graph(5) + pytest.raises(nx.ExceededMaxIterations, alg, G, max_iterations=10) + + def test_simrank_between_versions(self): + G = nx.cycle_graph(5) + # _python tolerance 1e-4 + expected_python_tol4 = { + 0: 1, + 1: 0.394512499239852, + 2: 0.5703550452791322, + 3: 0.5703550452791323, + 4: 0.394512499239852, + } + # _numpy tolerance 1e-4 + expected_numpy_tol4 = { + 0: 1.0, + 1: 0.3947180735764555, + 2: 0.570482097206368, + 3: 0.570482097206368, + 4: 0.3947180735764555, + } + actual = nx.simrank_similarity(G, source=0) + assert expected_numpy_tol4 == pytest.approx(actual, abs=1e-7) + # versions differ at 1e-4 level but equal at 1e-3 + assert expected_python_tol4 != pytest.approx(actual, abs=1e-4) + assert expected_python_tol4 == pytest.approx(actual, abs=1e-3) + + actual = nx.similarity._simrank_similarity_python(G, source=0) + assert expected_python_tol4 == pytest.approx(actual, abs=1e-7) + # versions differ at 1e-4 level but equal at 1e-3 + assert expected_numpy_tol4 != pytest.approx(actual, abs=1e-4) + assert expected_numpy_tol4 == pytest.approx(actual, abs=1e-3) def test_simrank_numpy_no_source_no_target(self): G = nx.cycle_graph(5) @@ -691,7 +771,7 @@ class TestSimilarity: ], ] ) - actual = nx.simrank_similarity_numpy(G) + actual = nx.similarity._simrank_similarity_numpy(G) np.testing.assert_allclose(expected, actual, atol=1e-7) def test_simrank_numpy_source_no_target(self): @@ -705,13 +785,13 @@ class TestSimilarity: 0.3947180735764555, ] ) - actual = nx.simrank_similarity_numpy(G, source=0) + actual = nx.similarity._simrank_similarity_numpy(G, source=0) np.testing.assert_allclose(expected, actual, atol=1e-7) def test_simrank_numpy_source_and_target(self): G = nx.cycle_graph(5) expected = 1.0 - actual = nx.simrank_similarity_numpy(G, source=0, target=0) + actual = nx.similarity._simrank_similarity_numpy(G, source=0, target=0) np.testing.assert_allclose(expected, actual, atol=1e-7) def test_n_choose_k_small_k(self): -- cgit v1.2.1