diff options
author | Jörn Hees <joernhees@users.noreply.github.com> | 2017-04-09 20:35:34 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-04-09 20:35:34 +0200 |
commit | ee2c08b84b2e4c6e710a6274a8bf60d31f2e9d99 (patch) | |
tree | a5dc67142c93d4deb99284bb58805295dde544c1 | |
parent | af230076e7796c368ec4c912404fe3baf44761e5 (diff) | |
parent | d80b313f97d4ce9c6441184704b56e143ff08118 (diff) | |
download | rdflib-ee2c08b84b2e4c6e710a6274a8bf60d31f2e9d99.tar.gz |
Merge pull request #726 from joernhees/fix_canonicalization
fixes #725, huge thanks to @jimmccusker
-rw-r--r-- | rdflib/compare.py | 28 | ||||
-rw-r--r-- | test/test_canonicalization.py | 201 |
2 files changed, 211 insertions, 18 deletions
diff --git a/rdflib/compare.py b/rdflib/compare.py index 5e3f5994..97de047b 100644 --- a/rdflib/compare.py +++ b/rdflib/compare.py @@ -194,6 +194,10 @@ class Color: self.hashfunc = hashfunc self._hash_color = None + def __str__(self): + nodes, color = self.key() + return "Color %s (%s nodes)" % (color, nodes) + def key(self): return (len(self.nodes), self.hash_color()) @@ -277,7 +281,7 @@ class _TripleCanonicalizer(object): others = set() self._neighbors = defaultdict(set) for s, p, o in self.graph: - nodes = set([s, o]) + nodes = set([s, p, o]) b = set([x for x in nodes if isinstance(x, BNode)]) if len(b) > 0: others |= nodes - b @@ -286,6 +290,9 @@ class _TripleCanonicalizer(object): self._neighbors[s].add(o) if isinstance(o, BNode): self._neighbors[o].add(s) + if isinstance(p, BNode): + self._neighbors[p].add(s) + self._neighbors[p].add(p) if len(bnodes) > 0: return [ Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache) @@ -317,7 +324,7 @@ class _TripleCanonicalizer(object): while len(sequence) > 0 and not self._discrete(coloring): W = sequence.pop() for c in coloring[:]: - if len(c.nodes) > 1: + if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode): colors = sorted(c.distinguish(W, self.graph), key=lambda x: x.key(), reverse=True) @@ -328,8 +335,17 @@ class _TripleCanonicalizer(object): sequence = sequence[:si] + colors + sequence[si+1:] except ValueError: sequence = colors[1:] + sequence - - return coloring + combined_colors = [] + combined_color_map = dict() + for color in coloring: + color_hash = color.hash_color() + # This is a hash collision, and be combined into a single color for individuation. + if color_hash in combined_color_map: + combined_color_map[color_hash].nodes.extend(color.nodes) + else: + combined_colors.append(color) + combined_color_map[color_hash] = color + return combined_colors @_runtime("to_hash_runtime") def to_hash(self, stats=None): @@ -515,14 +531,14 @@ def isomorphic(graph1, graph2): -def to_canonical_graph(g1): +def to_canonical_graph(g1, stats=None): """Creates a canonical, read-only graph. Creates a canonical, read-only graph where all bnode id:s are based on deterministical SHA-256 checksums, correlated with the graph contents. """ graph = Graph() - graph += _TripleCanonicalizer(g1).canonical_triples() + graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats) return ReadOnlyGraphAggregate([graph]) diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py index b64059e7..9b094ce6 100644 --- a/test/test_canonicalization.py +++ b/test/test_canonicalization.py @@ -1,6 +1,10 @@ +from collections import Counter from rdflib import Graph, RDF, BNode, URIRef, Namespace, ConjunctiveGraph, Literal from rdflib.compare import to_isomorphic, to_canonical_graph + +import rdflib from rdflib.plugins.memory import IOMemory + from six import text_type from io import StringIO @@ -220,28 +224,28 @@ def test_issue494_collapsing_bnodes(): RDF['Statement']), ] - print('graph length: %d, nodes: %d' % (len(g), len(g.all_nodes()))) - print('triple_bnode degrees:') - for triple_bnode in g.subjects(RDF['type'], RDF['Statement']): - print(len(list(g.triples([triple_bnode, None, None])))) - print('all node degrees:') + # print('graph length: %d, nodes: %d' % (len(g), len(g.all_nodes()))) + # print('triple_bnode degrees:') + # for triple_bnode in g.subjects(RDF['type'], RDF['Statement']): + # print(len(list(g.triples([triple_bnode, None, None])))) + # print('all node degrees:') g_node_degs = sorted([ len(list(g.triples([node, None, None]))) for node in g.all_nodes() ], reverse=True) - print(g_node_degs) + # print(g_node_degs) cg = to_canonical_graph(g) - print('graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes()))) - print('triple_bnode degrees:') - for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']): - print(len(list(cg.triples([triple_bnode, None, None])))) - print('all node degrees:') + # print('graph length: %d, nodes: %d' % (len(cg), len(cg.all_nodes()))) + # print('triple_bnode degrees:') + # for triple_bnode in cg.subjects(RDF['type'], RDF['Statement']): + # print(len(list(cg.triples([triple_bnode, None, None])))) + # print('all node degrees:') cg_node_degs = sorted([ len(list(cg.triples([node, None, None]))) for node in cg.all_nodes() ], reverse=True) - print(cg_node_degs) + # print(cg_node_degs) assert len(g) == len(cg), \ 'canonicalization changed number of triples in graph' @@ -253,6 +257,24 @@ def test_issue494_collapsing_bnodes(): assert g_node_degs == cg_node_degs, \ 'canonicalization changed node degrees' + # counter for subject, predicate and object nodes + g_pos_counts = Counter(), Counter(), Counter() + for t in g: + for i, node in enumerate(t): + g_pos_counts[i][t] += 1 + g_count_signature = [sorted(c.values()) for c in g_pos_counts] + + cg = to_canonical_graph(g) + cg_pos_counts = Counter(), Counter(), Counter() + for t in cg: + for i, node in enumerate(t): + cg_pos_counts[i][t] += 1 + cg_count_signature = [sorted(c.values()) for c in cg_pos_counts] + + assert g_count_signature == cg_count_signature, \ + 'canonicalization changed node position counts' + + def test_issue682_signing_named_graphs(): ns = Namespace("http://love.com#") @@ -282,3 +304,158 @@ def test_issue682_signing_named_graphs(): assert len(ig) == len(g) assert len(igmary) < len(ig) assert ig.graph_digest() != igmary.graph_digest() + + +def test_issue725_collapsing_bnodes_2(): + g = Graph() + g += [ + (BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + BNode('v2')), + (BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + BNode('v0')), + (BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + URIRef(u'urn:gp_learner:fixed_var:target')), + (BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + BNode('v1')), + (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + BNode('v0')), + (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + URIRef(u'urn:gp_learner:fixed_var:target')), + (BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + BNode('v5')), + (BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + BNode('v4')), + (BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + URIRef(u'urn:gp_learner:fixed_var:target')), + (BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (BNode('N86ac7ca781f546ae939b8963895f672e'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + URIRef(u'urn:gp_learner:fixed_var:source')), + (BNode('N86ac7ca781f546ae939b8963895f672e'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + BNode('v0')), + (BNode('N86ac7ca781f546ae939b8963895f672e'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + URIRef(u'urn:gp_learner:fixed_var:target')), + (BNode('N86ac7ca781f546ae939b8963895f672e'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + BNode('v1')), + (BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + BNode('v3')), + (BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + URIRef(u'urn:gp_learner:fixed_var:target')), + (BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')) + ] + + turtle = ''' + @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . + @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . + @prefix xml: <http://www.w3.org/XML/1998/namespace> . + @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + + [] a rdf:Statement ; + rdf:object [ ] ; + rdf:predicate _:v0 ; + rdf:subject <urn:gp_learner:fixed_var:target> . + + [] a rdf:Statement ; + rdf:object _:v1 ; + rdf:predicate _:v0 ; + rdf:subject <urn:gp_learner:fixed_var:target> . + + [] a rdf:Statement ; + rdf:object [ ] ; + rdf:predicate [ ] ; + rdf:subject <urn:gp_learner:fixed_var:target> . + + [] a rdf:Statement ; + rdf:object <urn:gp_learner:fixed_var:source> ; + rdf:predicate _:v0 ; + rdf:subject <urn:gp_learner:fixed_var:target> . + + [] a rdf:Statement ; + rdf:object _:v1 ; + rdf:predicate [ ] ; + rdf:subject <urn:gp_learner:fixed_var:target> .''' + + # g = Graph() + # g.parse(data=turtle, format='turtle') + + stats = {} + cg = rdflib.compare.to_canonical_graph(g, stats=stats) + + # print ('graph g length: %d, nodes: %d' % (len(g), len(g.all_nodes()))) + # print ('triple_bnode degrees:') + # for triple_bnode in g.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']): + # print (len(list(g.triples([triple_bnode, None, None])))) + # print ('all node out-degrees:') + # print (sorted( + # [len(list(g.triples([node, None, None]))) for node in g.all_nodes()])) + # print ('all node in-degrees:') + # print (sorted( + # [len(list(g.triples([None, None, node]))) for node in g.all_nodes()])) + # print(g.serialize(format='n3')) + # + # print ('graph cg length: %d, nodes: %d' % (len(cg), len(cg.all_nodes()))) + # print ('triple_bnode degrees:') + # for triple_bnode in cg.subjects(rdflib.RDF['type'], + # rdflib.RDF['Statement']): + # print (len(list(cg.triples([triple_bnode, None, None])))) + # print ('all node out-degrees:') + # print (sorted( + # [len(list(cg.triples([node, None, None]))) for node in cg.all_nodes()])) + # print ('all node in-degrees:') + # print (sorted( + # [len(list(cg.triples([None, None, node]))) for node in cg.all_nodes()])) + # print(cg.serialize(format='n3')) + + assert (len(g.all_nodes()) == len(cg.all_nodes())) + + cg = to_canonical_graph(g) + assert len(g) == len(cg), \ + 'canonicalization changed number of triples in graph' + assert len(g.all_nodes()) == len(cg.all_nodes()), \ + 'canonicalization changed number of nodes in graph' + assert len(list(g.subjects(RDF['type'], RDF['Statement']))) == \ + len(list(cg.subjects(RDF['type'], RDF['Statement']))), \ + 'canonicalization changed number of statements' + + # counter for subject, predicate and object nodes + g_pos_counts = Counter(), Counter(), Counter() + for t in g: + for i, node in enumerate(t): + g_pos_counts[i][t] += 1 + g_count_signature = [sorted(c.values()) for c in g_pos_counts] + + cg_pos_counts = Counter(), Counter(), Counter() + for t in cg: + for i, node in enumerate(t): + cg_pos_counts[i][t] += 1 + cg_count_signature = [sorted(c.values()) for c in cg_pos_counts] + + assert g_count_signature == cg_count_signature, \ + 'canonicalization changed node position counts' |