diff options
author | Jim McCusker <mccusker@gmail.com> | 2017-04-09 13:20:20 -0400 |
---|---|---|
committer | Jim McCusker <mccusker@gmail.com> | 2017-04-09 13:20:20 -0400 |
commit | 7283c6bf896df7bc644badd5ad1a6217ab145f7a (patch) | |
tree | 9b8d33d3d440a9332464556b1798c04a783c500d | |
parent | 74357f4b91fc36504a09e3a37bc6d277dd70f966 (diff) | |
download | rdflib-7283c6bf896df7bc644badd5ad1a6217ab145f7a.tar.gz |
Fixed #725 by fully refining all bnode colors and combining color groups if there is a hash collision.
-rw-r--r-- | rdflib/compare.py | 30 | ||||
-rw-r--r-- | test/test_canonicalization.py | 125 |
2 files changed, 149 insertions, 6 deletions
diff --git a/rdflib/compare.py b/rdflib/compare.py index fb9b1902..d5c12c3b 100644 --- a/rdflib/compare.py +++ b/rdflib/compare.py @@ -191,6 +191,10 @@ class Color: self.hashfunc = hashfunc self._hash_color = None + def __str__(self): + nodes, color = self.key() + return "Color %s (%s nodes)" % (color, nodes) + def key(self): return (len(self.nodes), self.hash_color()) @@ -274,7 +278,7 @@ class _TripleCanonicalizer(object): others = set() self._neighbors = defaultdict(set) for s, p, o in self.graph: - nodes = set([s, o]) + nodes = set([s, p, o]) b = set([x for x in nodes if isinstance(x, BNode)]) if len(b) > 0: others |= nodes - b @@ -283,6 +287,9 @@ class _TripleCanonicalizer(object): self._neighbors[s].add(o) if isinstance(o, BNode): self._neighbors[o].add(s) + if isinstance(p, BNode): + self._neighbors[p].add(s) + self._neighbors[p].add(p) if len(bnodes) > 0: return [ Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache) @@ -314,7 +321,7 @@ class _TripleCanonicalizer(object): while len(sequence) > 0 and not self._discrete(coloring): W = sequence.pop() for c in coloring[:]: - if len(c.nodes) > 1: + if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode): colors = sorted(c.distinguish(W, self.graph), key=lambda x: x.key(), reverse=True) @@ -325,8 +332,17 @@ class _TripleCanonicalizer(object): sequence = sequence[:si] + colors + sequence[si+1:] except ValueError: sequence = colors[1:] + sequence - - return coloring + combined_colors = [] + combined_color_map = dict() + for color in coloring: + color_hash = color.hash_color() + # This is a hash collision, and be combined into a single color for individuation. + if color_hash in combined_color_map: + combined_color_map[color_hash].nodes.extend(color.nodes) + else: + combined_colors.append(color) + combined_color_map[color_hash] = color + return combined_colors @_runtime("to_hash_runtime") def to_hash(self, stats=None): @@ -452,6 +468,8 @@ class _TripleCanonicalizer(object): stats['color_count'] = len(coloring) bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring]) + print ("BNode Labels:") + print ("\n".join(['\t'.join(item) for item in bnode_labels.items()])) if stats is not None: stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring) for triple in self.graph: @@ -509,14 +527,14 @@ def isomorphic(graph1, graph2): -def to_canonical_graph(g1): +def to_canonical_graph(g1, stats=None): """Creates a canonical, read-only graph. Creates a canonical, read-only graph where all bnode id:s are based on deterministical SHA-256 checksums, correlated with the graph contents. """ graph = Graph() - graph += _TripleCanonicalizer(g1).canonical_triples() + graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats) return ReadOnlyGraphAggregate([graph]) diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py index d1e38d1a..97f49036 100644 --- a/test/test_canonicalization.py +++ b/test/test_canonicalization.py @@ -1,5 +1,6 @@ from rdflib import Graph, RDF, BNode, URIRef from rdflib.compare import to_isomorphic, to_canonical_graph +import rdflib from six import text_type from io import StringIO @@ -153,6 +154,130 @@ def negative_graph_match_test(): for inputs in testInputs: yield fn, inputs[0], inputs[1], inputs[2] +def test_issue725_collapsing_bnodes(): + g = rdflib.Graph() + g += [ + (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + rdflib.term.BNode('v2')), + (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + rdflib.term.BNode('v0')), + (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')), + (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + rdflib.term.BNode('v1')), + (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + rdflib.term.BNode('v0')), + (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')), + (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + rdflib.term.BNode('v5')), + (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + rdflib.term.BNode('v4')), + (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')), + (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + rdflib.term.URIRef(u'urn:gp_learner:fixed_var:source')), + (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + rdflib.term.BNode('v0')), + (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')), + (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')), + (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'), + rdflib.term.BNode('v1')), + (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'), + rdflib.term.BNode('v3')), + (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'), + rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')), + (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')) + ] + + turtle = ''' +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix xml: <http://www.w3.org/XML/1998/namespace> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . + +[] a rdf:Statement ; + rdf:object [ ] ; + rdf:predicate _:v0 ; + rdf:subject <urn:gp_learner:fixed_var:target> . + +[] a rdf:Statement ; + rdf:object _:v1 ; + rdf:predicate _:v0 ; + rdf:subject <urn:gp_learner:fixed_var:target> . + +[] a rdf:Statement ; + rdf:object [ ] ; + rdf:predicate [ ] ; + rdf:subject <urn:gp_learner:fixed_var:target> . + +[] a rdf:Statement ; + rdf:object <urn:gp_learner:fixed_var:source> ; + rdf:predicate _:v0 ; + rdf:subject <urn:gp_learner:fixed_var:target> . + +[] a rdf:Statement ; + rdf:object _:v1 ; + rdf:predicate [ ] ; + rdf:subject <urn:gp_learner:fixed_var:target> .''' + + #g = Graph() + #g.parse(data=turtle, format='turtle') + + stats = {} + cg = rdflib.compare.to_canonical_graph(g, stats=stats) + + print ('graph g length: %d, nodes: %d' % (len(g), len(g.all_nodes()))) + print ('triple_bnode degrees:') + for triple_bnode in g.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']): + print (len(list(g.triples([triple_bnode, None, None])))) + print ('all node out-degrees:') + print (sorted([len(list(g.triples([node, None, None]))) for node in g.all_nodes()])) + print ('all node in-degrees:') + print (sorted([len(list(g.triples([None, None, node]))) for node in g.all_nodes()])) + print(g.serialize(format='n3')) + + print ('graph cg length: %d, nodes: %d' % (len(cg), len(cg.all_nodes()))) + print ('triple_bnode degrees:') + for triple_bnode in cg.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']): + print (len(list(cg.triples([triple_bnode, None, None])))) + print ('all node out-degrees:') + print (sorted([len(list(cg.triples([node, None, None]))) for node in cg.all_nodes()])) + print ('all node in-degrees:') + print (sorted([len(list(cg.triples([None, None, node]))) for node in cg.all_nodes()])) + print(cg.serialize(format='n3')) + + assert(len(g.all_nodes()) == len(cg.all_nodes())) + def test_issue494_collapsing_bnodes(): """Test for https://github.com/RDFLib/rdflib/issues/494 collapsing BNodes""" g = Graph() |