authorJarrod Millman <jarrod.millman@gmail.com>2021-05-30 18:08:42 -0700
committerGitHub <noreply@github.com>2021-05-30 18:08:42 -0700
commit7afb9006f3650e0746b41d44863b42fe28a224ab (patch)
tree71a6315e4d0fefbeb8b348eb0a521eb35c18d726
parentd989f687e2f83e48fb4906836c3eb5c3e00e7d78 (diff)
downloadnetworkx-7afb9006f3650e0746b41d44863b42fe28a224ab.tar.gz
Default to NumPy for simrank_similarity (#4841)
* Default to NumPy for simrank_similarity
* Fix typo: targete -> target
* Pass arg G, which was left out of a call to simrank_similarity
* Fix
* More
* Add pre/post processing of nodes in simrank_similarity
  - Added test for noninteger nodes
  - Added and updated examples in docstrings
  - Cleaned up some tests
* Parametrize tests so the new _python version is tested
* Move similarity_numpy code to a private function so deprecating requires very little code change
* Implement changes from the discussion:
  - Move warning so it doesn't show unless the _numpy() version is called
  - Add test of the difference between the python and numpy versions
  - Add exception when the methods don't converge
  - Add tests of the convergence exception
  - Remove the _is_close function
* Increase max_iterations and error handling, including for isolated nodes
* black

Co-authored-by: Dan Schult <dschult@colgate.edu>
-rw-r--r--doc/developer/deprecations.rst1
-rw-r--r--doc/release/release_dev.rst4
-rw-r--r--networkx/algorithms/similarity.py189
-rw-r--r--networkx/algorithms/tests/test_similarity.py122
4 files changed, 236 insertions, 80 deletions
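A minimal usage sketch of the behavior this commit introduces (assuming NetworkX 2.6+ with NumPy installed): the public entry point is unchanged and still returns plain dicts, but the computation is now delegated to the NumPy implementation by default.

import networkx as nx

G = nx.cycle_graph(4)

# Same public API as before; computed internally with NumPy and
# converted back to nested dicts keyed by node.
sim = nx.simrank_similarity(G)

# source/target still restrict the shape of the result.
row = nx.simrank_similarity(G, source=0)            # dict: node -> similarity to node 0
val = nx.simrank_similarity(G, source=0, target=2)  # single float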
diff --git a/doc/developer/deprecations.rst b/doc/developer/deprecations.rst
index 56d8978d..1749ab41 100644
--- a/doc/developer/deprecations.rst
+++ b/doc/developer/deprecations.rst
@@ -87,3 +87,4 @@ Version 3.0
* In ``algorithms/community/quality.py`` remove ``coverage`` and ``performance``.
* Remove ``testing``.
* In ``linalg/graphmatrix.py`` remove ``adj_matrix``.
+* In ``algorithms/similarity.py`` replace ``simrank_similarity`` with ``simrank_similarity_numpy``.
diff --git a/doc/release/release_dev.rst b/doc/release/release_dev.rst
index 14a1ef4d..6ebb3e1c 100644
--- a/doc/release/release_dev.rst
+++ b/doc/release/release_dev.rst
@@ -78,6 +78,8 @@ Improvements
Adds ``forest_str`` for string representation of trees.
- [`#4319 <https://github.com/networkx/networkx/pull/4319>`_]
pagerank uses scipy by default now.
+- [`#4841 <https://github.com/networkx/networkx/pull/4841>`_]
+ simrank_similarity uses numpy by default now.
- [`#4317 <https://github.com/networkx/networkx/pull/4317>`_]
New ``source`` argument to ``has_eulerian_path`` to look for path starting at
source.
@@ -239,6 +241,8 @@ Deprecations
Deprecate ``assert_nodes_equal``, ``assert_edges_equal``, and ``assert_graphs_equal``.
- [`#4850 <https://github.com/networkx/networkx/pull/4850>`_]
Deprecate ``adj_matrix``.
+- [`#4841 <https://github.com/networkx/networkx/pull/4841>`_]
+ Deprecate ``simrank_similarity_numpy``.
Contributors
------------
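For code that calls the now-deprecated simrank_similarity_numpy, a rough migration sketch (it mirrors the updated docstring example in the diff below and assumes the row/column order of list(G)):

import numpy as np
import networkx as nx

G = nx.cycle_graph(4)

# Before (deprecated in 2.6, removal planned for 3.0): an ndarray was returned.
# sim_array = nx.simrank_similarity_numpy(G)

# After: build the same matrix from the dict returned by simrank_similarity,
# using the graph's node order for rows and columns.
sim = nx.simrank_similarity(G)
sim_array = np.array([[sim[u][v] for v in G] for u in G])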
diff --git a/networkx/algorithms/similarity.py b/networkx/algorithms/similarity.py
index 6369012d..a4c2e961 100644
--- a/networkx/algorithms/similarity.py
+++ b/networkx/algorithms/similarity.py
@@ -1196,47 +1196,12 @@ def optimize_edit_paths(
yield list(vertex_path), list(edge_path), cost
-def _is_close(d1, d2, atolerance=0, rtolerance=0):
- """Determines whether two adjacency matrices are within
- a provided tolerance.
-
- Parameters
- ----------
- d1 : dict
- Adjacency dictionary
-
- d2 : dict
- Adjacency dictionary
-
- atolerance : float
- Some scalar tolerance value to determine closeness
-
- rtolerance : float
- A scalar tolerance value that will be some proportion
- of ``d2``'s value
-
- Returns
- -------
- closeness : bool
- If all of the nodes within ``d1`` and ``d2`` are within
- a predefined tolerance, they are considered "close" and
- this method will return True. Otherwise, this method will
- return False.
-
- """
- # Pre-condition: d1 and d2 have the same keys at each level if they
- # are dictionaries.
- if not isinstance(d1, dict) and not isinstance(d2, dict):
- return abs(d1 - d2) <= atolerance + rtolerance * abs(d2)
- return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1)
-
-
def simrank_similarity(
G,
source=None,
target=None,
importance_factor=0.9,
- max_iterations=100,
+ max_iterations=1000,
tolerance=1e-4,
):
"""Returns the SimRank similarity of nodes in the graph ``G``.
@@ -1304,16 +1269,27 @@ def simrank_similarity(
Examples
--------
- If the nodes of the graph are numbered from zero to *n - 1*, where *n*
- is the number of nodes in the graph, you can create a SimRank matrix
- from the return value of this function where the node numbers are
- the row and column indices of the matrix:
+ >>> G = nx.cycle_graph(2)
+ >>> nx.simrank_similarity(G)
+ {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
+ >>> nx.simrank_similarity(G, source=0)
+ {0: 1.0, 1: 0.0}
+ >>> nx.simrank_similarity(G, source=0, target=0)
+ 1.0
+
+ The result of this function can be converted to a numpy array
+ representing the SimRank matrix by using the node order of the
+ graph to determine which row and column represent each node.
+ Other orderings of the nodes are also possible.
>>> import numpy as np
- >>> G = nx.cycle_graph(4)
>>> sim = nx.simrank_similarity(G)
- >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
- >>> sim_array = np.array(lol)
+ >>> np.array([[sim[u][v] for v in G] for u in G])
+ array([[1., 0.],
+ [0., 1.]])
+ >>> sim_1d = nx.simrank_similarity(G, source=0)
+ >>> np.array([sim_1d[v] for v in G])
+ array([1., 0.])
References
----------
@@ -1324,8 +1300,46 @@ def simrank_similarity(
International Conference on Knowledge Discovery and Data Mining,
pp. 538--543. ACM Press, 2002.
"""
- prevsim = None
+ import numpy as np
+
+ nodelist = list(G)
+ s_indx = None if source is None else nodelist.index(source)
+ t_indx = None if target is None else nodelist.index(target)
+
+ x = _simrank_similarity_numpy(
+ G, s_indx, t_indx, importance_factor, max_iterations, tolerance
+ )
+
+ if isinstance(x, np.ndarray):
+ if x.ndim == 1:
+ return {node: val for node, val in zip(G, x)}
+ else: # x.ndim == 2:
+ return {u: dict(zip(G, row)) for u, row in zip(G, x)}
+ return x
+
+
+def _simrank_similarity_python(
+ G,
+ source=None,
+ target=None,
+ importance_factor=0.9,
+ max_iterations=1000,
+ tolerance=1e-4,
+):
+ """Returns the SimRank similarity of nodes in the graph ``G``.
+
+ This pure Python version is provided for pedagogical purposes.
+ Examples
+ --------
+ >>> G = nx.cycle_graph(2)
+ >>> nx.similarity._simrank_similarity_python(G)
+ {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
+ >>> nx.similarity._simrank_similarity_python(G, source=0)
+ {0: 1, 1: 0.0}
+ >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
+ 1
+ """
# build up our similarity adjacency dictionary output
newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}
@@ -1334,17 +1348,28 @@ def simrank_similarity(
def avg_sim(s):
return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0
+ Gadj = G.pred if G.is_directed() else G.adj
+
def sim(u, v):
- Gadj = G.pred if G.is_directed() else G.adj
return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))
- for _ in range(max_iterations):
- if prevsim and _is_close(prevsim, newsim, tolerance):
+ for its in range(max_iterations):
+ oldsim = newsim
+ newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
+ is_close = all(
+ all(
+ abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
+ for v, old in nbrs.items()
+ )
+ for u, nbrs in oldsim.items()
+ )
+ if is_close:
break
- prevsim = newsim
- newsim = {
- u: {v: sim(u, v) if u is not v else 1 for v in newsim[u]} for u in newsim
- }
+
+ if its + 1 == max_iterations:
+ raise nx.ExceededMaxIterations(
+ f"simrank did not converge after {max_iterations} iterations."
+ )
if source is not None and target is not None:
return newsim[source][target]
@@ -1353,12 +1378,12 @@ def simrank_similarity(
return newsim
-def simrank_similarity_numpy(
+def _simrank_similarity_numpy(
G,
source=None,
target=None,
importance_factor=0.9,
- max_iterations=100,
+ max_iterations=1000,
tolerance=1e-4,
):
"""Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
@@ -1409,8 +1434,14 @@ def simrank_similarity_numpy(
Examples
--------
- >>> G = nx.cycle_graph(4)
- >>> sim = nx.simrank_similarity_numpy(G)
+ >>> G = nx.cycle_graph(2)
+ >>> nx.similarity._simrank_similarity_numpy(G)
+ array([[1., 0.],
+ [0., 1.]])
+ >>> nx.similarity._simrank_similarity_numpy(G, source=0)
+ array([1., 0.])
+ >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
+ 1.0
References
----------
@@ -1431,17 +1462,24 @@ def simrank_similarity_numpy(
adjacency_matrix = nx.to_numpy_array(G)
# column-normalize the ``adjacency_matrix``
- adjacency_matrix /= adjacency_matrix.sum(axis=0)
+ s = np.array(adjacency_matrix.sum(axis=0))
+ s[s == 0] = 1
+ adjacency_matrix /= s # adjacency_matrix.sum(axis=0)
- newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64)
- for _ in range(max_iterations):
- prevsim = np.copy(newsim)
+ newsim = np.eye(len(G), dtype=np.float64)
+ for its in range(max_iterations):
+ prevsim = newsim.copy()
newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
np.fill_diagonal(newsim, 1.0)
if np.allclose(prevsim, newsim, atol=tolerance):
break
+ if its + 1 == max_iterations:
+ raise nx.ExceededMaxIterations(
+ f"simrank did not converge after {max_iterations} iterations."
+ )
+
if source is not None and target is not None:
return newsim[source, target]
if source is not None:
@@ -1449,6 +1487,39 @@ def simrank_similarity_numpy(
return newsim
+def simrank_similarity_numpy(
+ G,
+ source=None,
+ target=None,
+ importance_factor=0.9,
+ max_iterations=100,
+ tolerance=1e-4,
+):
+ """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
+
+ .. deprecated:: 2.6
+ simrank_similarity_numpy is deprecated and will be removed in NetworkX 3.0.
+ Use simrank_similarity instead.
+
+ """
+ warnings.warn(
+ (
+ "networkx.simrank_similarity_numpy is deprecated and will be removed"
+ "in NetworkX 3.0, use networkx.simrank_similarity instead."
+ ),
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return _simrank_similarity_numpy(
+ G,
+ source,
+ target,
+ importance_factor,
+ max_iterations,
+ tolerance,
+ )
+
+
# TODO replace w/ math.comb(n, k) for Python 3.8+
def _n_choose_k(n, k):
"""Pure Python implementation of the binomial coefficient
diff --git a/networkx/algorithms/tests/test_similarity.py b/networkx/algorithms/tests/test_similarity.py
index 88022f5f..08992882 100644
--- a/networkx/algorithms/tests/test_similarity.py
+++ b/networkx/algorithms/tests/test_similarity.py
@@ -523,7 +523,14 @@ class TestSimilarity:
G2.add_edge("B", "D", label="bad")
assert graph_edit_distance(G1, G2, node_match=nmatch, edge_match=ematch) == 1
- def test_simrank_no_source_no_target(self):
+ # note: nx.simrank_similarity_numpy not included because it returns an np.array
+ simrank_algs = [
+ nx.simrank_similarity,
+ nx.similarity._simrank_similarity_python,
+ ]
+
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_no_source_no_target(self, simrank_similarity):
G = nx.cycle_graph(5)
expected = {
0: {
@@ -562,8 +569,9 @@ class TestSimilarity:
4: 1,
},
}
- actual = nx.simrank_similarity(G)
- assert expected == actual
+ actual = simrank_similarity(G)
+ for k, v in expected.items():
+ assert v == pytest.approx(actual[k], abs=1e-2)
# For a DiGraph test, use the first graph from the paper cited in
# the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
@@ -595,10 +603,12 @@ class TestSimilarity:
},
}
# Use the importance_factor from the paper to get the same numbers.
- actual = nx.algorithms.similarity.simrank_similarity(G, importance_factor=0.8)
- assert expected == actual
+ actual = simrank_similarity(G, importance_factor=0.8)
+ for k, v in expected.items():
+ assert v == pytest.approx(actual[k], abs=1e-2)
- def test_simrank_source_no_target(self):
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_source_no_target(self, simrank_similarity):
G = nx.cycle_graph(5)
expected = {
0: 1,
@@ -607,8 +617,8 @@ class TestSimilarity:
3: 0.5707317069281646,
4: 0.3951219505902449,
}
- actual = nx.simrank_similarity(G, source=0)
- assert expected == actual
+ actual = simrank_similarity(G, source=0)
+ assert expected == pytest.approx(actual, abs=1e-2)
# For a DiGraph test, use the first graph from the paper cited in
# the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
@@ -622,15 +632,52 @@ class TestSimilarity:
expected = {0: 1, 1: 0.0, 2: 0.1323363991265798, 3: 0.0, 4: 0.03387811817640443}
# Use the importance_factor from the paper to get the same numbers.
- actual = nx.algorithms.similarity.simrank_similarity(
- G, importance_factor=0.8, source=0
- )
- assert expected == actual
+ actual = simrank_similarity(G, importance_factor=0.8, source=0)
+ assert expected == pytest.approx(actual, abs=1e-2)
+
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_noninteger_nodes(self, simrank_similarity):
+ G = nx.cycle_graph(5)
+ G = nx.relabel_nodes(G, dict(enumerate("abcde")))
+ expected = {
+ "a": 1,
+ "b": 0.3951219505902448,
+ "c": 0.5707317069281646,
+ "d": 0.5707317069281646,
+ "e": 0.3951219505902449,
+ }
+ actual = simrank_similarity(G, source="a")
+ assert expected == pytest.approx(actual, abs=1e-2)
+
+ # For a DiGraph test, use the first graph from the paper cited in
+ # the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
+ G = nx.DiGraph()
+ G.add_node(0, label="Univ")
+ G.add_node(1, label="ProfA")
+ G.add_node(2, label="ProfB")
+ G.add_node(3, label="StudentA")
+ G.add_node(4, label="StudentB")
+ G.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 4), (4, 2), (3, 0)])
+ node_labels = dict(enumerate(nx.get_node_attributes(G, "label").values()))
+ G = nx.relabel_nodes(G, node_labels)
+
+ expected = {
+ "Univ": 1,
+ "ProfA": 0.0,
+ "ProfB": 0.1323363991265798,
+ "StudentA": 0.0,
+ "StudentB": 0.03387811817640443,
+ }
+ # Use the importance_factor from the paper to get the same numbers.
+ actual = simrank_similarity(G, importance_factor=0.8, source="Univ")
+ assert expected == pytest.approx(actual, abs=1e-2)
- def test_simrank_source_and_target(self):
+ @pytest.mark.parametrize("simrank_similarity", simrank_algs)
+ def test_simrank_source_and_target(self, simrank_similarity):
G = nx.cycle_graph(5)
expected = 1
- actual = nx.simrank_similarity(G, source=0, target=0)
+ actual = simrank_similarity(G, source=0, target=0)
+ assert expected == pytest.approx(actual, abs=1e-2)
# For a DiGraph test, use the first graph from the paper cited in
# the docs: https://dl.acm.org/doi/pdf/10.1145/775047.775126
@@ -645,10 +692,43 @@ class TestSimilarity:
expected = 0.1323363991265798
# Use the importance_factor from the paper to get the same numbers.
# Use the pair (0,2) because (0,0) and (0,1) have trivial results.
- actual = nx.algorithms.similarity.simrank_similarity(
- G, importance_factor=0.8, source=0, target=2
- )
- assert expected == actual
+ actual = simrank_similarity(G, importance_factor=0.8, source=0, target=2)
+ assert expected == pytest.approx(actual, abs=1e-5)
+
+ @pytest.mark.parametrize("alg", simrank_algs)
+ def test_simrank_max_iterations(self, alg):
+ G = nx.cycle_graph(5)
+ pytest.raises(nx.ExceededMaxIterations, alg, G, max_iterations=10)
+
+ def test_simrank_between_versions(self):
+ G = nx.cycle_graph(5)
+ # _python tolerance 1e-4
+ expected_python_tol4 = {
+ 0: 1,
+ 1: 0.394512499239852,
+ 2: 0.5703550452791322,
+ 3: 0.5703550452791323,
+ 4: 0.394512499239852,
+ }
+ # _numpy tolerance 1e-4
+ expected_numpy_tol4 = {
+ 0: 1.0,
+ 1: 0.3947180735764555,
+ 2: 0.570482097206368,
+ 3: 0.570482097206368,
+ 4: 0.3947180735764555,
+ }
+ actual = nx.simrank_similarity(G, source=0)
+ assert expected_numpy_tol4 == pytest.approx(actual, abs=1e-7)
+ # versions differ at 1e-4 level but equal at 1e-3
+ assert expected_python_tol4 != pytest.approx(actual, abs=1e-4)
+ assert expected_python_tol4 == pytest.approx(actual, abs=1e-3)
+
+ actual = nx.similarity._simrank_similarity_python(G, source=0)
+ assert expected_python_tol4 == pytest.approx(actual, abs=1e-7)
+ # versions differ at 1e-4 level but equal at 1e-3
+ assert expected_numpy_tol4 != pytest.approx(actual, abs=1e-4)
+ assert expected_numpy_tol4 == pytest.approx(actual, abs=1e-3)
def test_simrank_numpy_no_source_no_target(self):
G = nx.cycle_graph(5)
@@ -691,7 +771,7 @@ class TestSimilarity:
],
]
)
- actual = nx.simrank_similarity_numpy(G)
+ actual = nx.similarity._simrank_similarity_numpy(G)
np.testing.assert_allclose(expected, actual, atol=1e-7)
def test_simrank_numpy_source_no_target(self):
@@ -705,13 +785,13 @@ class TestSimilarity:
0.3947180735764555,
]
)
- actual = nx.simrank_similarity_numpy(G, source=0)
+ actual = nx.similarity._simrank_similarity_numpy(G, source=0)
np.testing.assert_allclose(expected, actual, atol=1e-7)
def test_simrank_numpy_source_and_target(self):
G = nx.cycle_graph(5)
expected = 1.0
- actual = nx.simrank_similarity_numpy(G, source=0, target=0)
+ actual = nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
np.testing.assert_allclose(expected, actual, atol=1e-7)
def test_n_choose_k_small_k(self):
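Finally, a small example of the failure mode the new max_iterations tests exercise: with too few iterations both implementations now raise nx.ExceededMaxIterations instead of silently returning a half-converged result.

import networkx as nx

G = nx.cycle_graph(5)
try:
    nx.simrank_similarity(G, max_iterations=10)
except nx.ExceededMaxIterations as err:
    print(err)  # e.g. "simrank did not converge after 10 iterations."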