summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJörn Hees <joernhees@users.noreply.github.com>2017-04-09 20:35:34 +0200
committerGitHub <noreply@github.com>2017-04-09 20:35:34 +0200
commitee2c08b84b2e4c6e710a6274a8bf60d31f2e9d99 (patch)
treea5dc67142c93d4deb99284bb58805295dde544c1
parentaf230076e7796c368ec4c912404fe3baf44761e5 (diff)
parentd80b313f97d4ce9c6441184704b56e143ff08118 (diff)
downloadrdflib-ee2c08b84b2e4c6e710a6274a8bf60d31f2e9d99.tar.gz
Merge pull request #726 from joernhees/fix_canonicalization
fixes #725, huge thanks to @jimmccusker
-rw-r--r--rdflib/compare.py28
-rw-r--r--test/test_canonicalization.py201
2 files changed, 211 insertions, 18 deletions
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 5e3f5994..97de047b 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -194,6 +194,10 @@ class Color:
self.hashfunc = hashfunc
self._hash_color = None
+ def __str__(self):
+ nodes, color = self.key()
+ return "Color %s (%s nodes)" % (color, nodes)
+
def key(self):
return (len(self.nodes), self.hash_color())
@@ -277,7 +281,7 @@ class _TripleCanonicalizer(object):
others = set()
self._neighbors = defaultdict(set)
for s, p, o in self.graph:
- nodes = set([s, o])
+ nodes = set([s, p, o])
b = set([x for x in nodes if isinstance(x, BNode)])
if len(b) > 0:
others |= nodes - b
@@ -286,6 +290,9 @@ class _TripleCanonicalizer(object):
self._neighbors[s].add(o)
if isinstance(o, BNode):
self._neighbors[o].add(s)
+ if isinstance(p, BNode):
+ self._neighbors[p].add(s)
+ self._neighbors[p].add(p)
if len(bnodes) > 0:
return [
Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
@@ -317,7 +324,7 @@ class _TripleCanonicalizer(object):
while len(sequence) > 0 and not self._discrete(coloring):
W = sequence.pop()
for c in coloring[:]:
- if len(c.nodes) > 1:
+ if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode):
colors = sorted(c.distinguish(W, self.graph),
key=lambda x: x.key(),
reverse=True)
@@ -328,8 +335,17 @@ class _TripleCanonicalizer(object):
sequence = sequence[:si] + colors + sequence[si+1:]
except ValueError:
sequence = colors[1:] + sequence
-
- return coloring
+ combined_colors = []
+ combined_color_map = dict()
+ for color in coloring:
+ color_hash = color.hash_color()
+ # This is a hash collision, and be combined into a single color for individuation.
+ if color_hash in combined_color_map:
+ combined_color_map[color_hash].nodes.extend(color.nodes)
+ else:
+ combined_colors.append(color)
+ combined_color_map[color_hash] = color
+ return combined_colors
@_runtime("to_hash_runtime")
def to_hash(self, stats=None):
@@ -515,14 +531,14 @@ def isomorphic(graph1, graph2):
-def to_canonical_graph(g1):
+def to_canonical_graph(g1, stats=None):
"""Creates a canonical, read-only graph.
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical SHA-256 checksums, correlated with the graph contents.
"""
graph = Graph()
- graph += _TripleCanonicalizer(g1).canonical_triples()
+ graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats)
return ReadOnlyGraphAggregate([graph])
diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py
index b64059e7..9b094ce6 100644
--- a/test/test_canonicalization.py
+++ b/test/test_canonicalization.py
@@ -1,6 +1,10 @@
+from collections import Counter
from rdflib import Graph, RDF, BNode, URIRef, Namespace, ConjunctiveGraph, Literal
from rdflib.compare import to_isomorphic, to_canonical_graph
+
+import rdflib
from rdflib.plugins.memory import IOMemory
+
from six import text_type
from io import StringIO
@@ -220,28 +224,28 @@ def test_issue494_collapsing_bnodes():
RDF['Statement']),
]
- print('graph length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
- print('triple_bnode degrees:')
- for triple_bnode in g.subjects(RDF['type'], RDF['Statement']):
- print(len(list(g.triples([triple_bnode, None, None]))))
- print('all node degrees:')
+ # print('graph length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
+ # print('triple_bnode degrees:')
+ # for triple_bnode in g.subjects(RDF['type'], RDF['Statement']):
+ # print(len(list(g.triples([triple_bnode, None, None]))))
+ # print('all node degrees:')
g_node_degs = sorted([
len(list(g.triples([node, None, None])))
for node in g.all_nodes()
], reverse=True)
- print(g_node_degs)
+ # print(g_node_degs)
cg = to_canonical_graph(g)
- print('graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
- print('triple_bnode degrees:')
- for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']):
- print(len(list(cg.triples([triple_bnode, None, None]))))
- print('all node degrees:')
+ # print('graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
+ # print('triple_bnode degrees:')
+ # for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']):
+ # print(len(list(cg.triples([triple_bnode, None, None]))))
+ # print('all node degrees:')
cg_node_degs = sorted([
len(list(cg.triples([node, None, None])))
for node in cg.all_nodes()
], reverse=True)
- print(cg_node_degs)
+ # print(cg_node_degs)
assert len(g) == len(cg), \
'canonicalization changed number of triples in graph'
@@ -253,6 +257,24 @@ def test_issue494_collapsing_bnodes():
assert g_node_degs == cg_node_degs, \
'canonicalization changed node degrees'
+ # counter for subject, predicate and object nodes
+ g_pos_counts = Counter(), Counter(), Counter()
+ for t in g:
+ for i, node in enumerate(t):
+ g_pos_counts[i][t] += 1
+ g_count_signature = [sorted(c.values()) for c in g_pos_counts]
+
+ cg = to_canonical_graph(g)
+ cg_pos_counts = Counter(), Counter(), Counter()
+ for t in cg:
+ for i, node in enumerate(t):
+ cg_pos_counts[i][t] += 1
+ cg_count_signature = [sorted(c.values()) for c in cg_pos_counts]
+
+ assert g_count_signature == cg_count_signature, \
+ 'canonicalization changed node position counts'
+
+
def test_issue682_signing_named_graphs():
ns = Namespace("http://love.com#")
@@ -282,3 +304,158 @@ def test_issue682_signing_named_graphs():
assert len(ig) == len(g)
assert len(igmary) < len(ig)
assert ig.graph_digest() != igmary.graph_digest()
+
+
+def test_issue725_collapsing_bnodes_2():
+ g = Graph()
+ g += [
+ (BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ BNode('v2')),
+ (BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ BNode('v0')),
+ (BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ URIRef(u'urn:gp_learner:fixed_var:target')),
+ (BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ BNode('v1')),
+ (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ BNode('v0')),
+ (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ URIRef(u'urn:gp_learner:fixed_var:target')),
+ (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ BNode('v5')),
+ (BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ BNode('v4')),
+ (BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ URIRef(u'urn:gp_learner:fixed_var:target')),
+ (BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (BNode('N86ac7ca781f546ae939b8963895f672e'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ URIRef(u'urn:gp_learner:fixed_var:source')),
+ (BNode('N86ac7ca781f546ae939b8963895f672e'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ BNode('v0')),
+ (BNode('N86ac7ca781f546ae939b8963895f672e'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ URIRef(u'urn:gp_learner:fixed_var:target')),
+ (BNode('N86ac7ca781f546ae939b8963895f672e'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ BNode('v1')),
+ (BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ BNode('v3')),
+ (BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ URIRef(u'urn:gp_learner:fixed_var:target')),
+ (BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'))
+ ]
+
+ turtle = '''
+ @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+ @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+ @prefix xml: <http://www.w3.org/XML/1998/namespace> .
+ @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+ [] a rdf:Statement ;
+ rdf:object [ ] ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+ [] a rdf:Statement ;
+ rdf:object _:v1 ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+ [] a rdf:Statement ;
+ rdf:object [ ] ;
+ rdf:predicate [ ] ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+ [] a rdf:Statement ;
+ rdf:object <urn:gp_learner:fixed_var:source> ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+ [] a rdf:Statement ;
+ rdf:object _:v1 ;
+ rdf:predicate [ ] ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .'''
+
+ # g = Graph()
+ # g.parse(data=turtle, format='turtle')
+
+ stats = {}
+ cg = rdflib.compare.to_canonical_graph(g, stats=stats)
+
+ # print ('graph g length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
+ # print ('triple_bnode degrees:')
+ # for triple_bnode in g.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']):
+ # print (len(list(g.triples([triple_bnode, None, None]))))
+ # print ('all node out-degrees:')
+ # print (sorted(
+ # [len(list(g.triples([node, None, None]))) for node in g.all_nodes()]))
+ # print ('all node in-degrees:')
+ # print (sorted(
+ # [len(list(g.triples([None, None, node]))) for node in g.all_nodes()]))
+ # print(g.serialize(format='n3'))
+ #
+ # print ('graph cg length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
+ # print ('triple_bnode degrees:')
+ # for triple_bnode in cg.subjects(rdflib.RDF['type'],
+ # rdflib.RDF['Statement']):
+ # print (len(list(cg.triples([triple_bnode, None, None]))))
+ # print ('all node out-degrees:')
+ # print (sorted(
+ # [len(list(cg.triples([node, None, None]))) for node in cg.all_nodes()]))
+ # print ('all node in-degrees:')
+ # print (sorted(
+ # [len(list(cg.triples([None, None, node]))) for node in cg.all_nodes()]))
+ # print(cg.serialize(format='n3'))
+
+ assert (len(g.all_nodes()) == len(cg.all_nodes()))
+
+ cg = to_canonical_graph(g)
+ assert len(g) == len(cg), \
+ 'canonicalization changed number of triples in graph'
+ assert len(g.all_nodes()) == len(cg.all_nodes()), \
+ 'canonicalization changed number of nodes in graph'
+ assert len(list(g.subjects(RDF['type'], RDF['Statement']))) == \
+ len(list(cg.subjects(RDF['type'], RDF['Statement']))), \
+ 'canonicalization changed number of statements'
+
+ # counter for subject, predicate and object nodes
+ g_pos_counts = Counter(), Counter(), Counter()
+ for t in g:
+ for i, node in enumerate(t):
+ g_pos_counts[i][t] += 1
+ g_count_signature = [sorted(c.values()) for c in g_pos_counts]
+
+ cg_pos_counts = Counter(), Counter(), Counter()
+ for t in cg:
+ for i, node in enumerate(t):
+ cg_pos_counts[i][t] += 1
+ cg_count_signature = [sorted(c.values()) for c in cg_pos_counts]
+
+ assert g_count_signature == cg_count_signature, \
+ 'canonicalization changed node position counts'