summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim McCusker <mccusker@gmail.com>2017-04-09 13:20:20 -0400
committerJim McCusker <mccusker@gmail.com>2017-04-09 13:20:20 -0400
commit7283c6bf896df7bc644badd5ad1a6217ab145f7a (patch)
tree9b8d33d3d440a9332464556b1798c04a783c500d
parent74357f4b91fc36504a09e3a37bc6d277dd70f966 (diff)
downloadrdflib-7283c6bf896df7bc644badd5ad1a6217ab145f7a.tar.gz
Fixed #725 by fully refining all bnode colors and combining color groups if there is a hash collision.
-rw-r--r--rdflib/compare.py30
-rw-r--r--test/test_canonicalization.py125
2 files changed, 149 insertions, 6 deletions
diff --git a/rdflib/compare.py b/rdflib/compare.py
index fb9b1902..d5c12c3b 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -191,6 +191,10 @@ class Color:
self.hashfunc = hashfunc
self._hash_color = None
+ def __str__(self):
+ nodes, color = self.key()
+ return "Color %s (%s nodes)" % (color, nodes)
+
def key(self):
return (len(self.nodes), self.hash_color())
@@ -274,7 +278,7 @@ class _TripleCanonicalizer(object):
others = set()
self._neighbors = defaultdict(set)
for s, p, o in self.graph:
- nodes = set([s, o])
+ nodes = set([s, p, o])
b = set([x for x in nodes if isinstance(x, BNode)])
if len(b) > 0:
others |= nodes - b
@@ -283,6 +287,9 @@ class _TripleCanonicalizer(object):
self._neighbors[s].add(o)
if isinstance(o, BNode):
self._neighbors[o].add(s)
+ if isinstance(p, BNode):
+ self._neighbors[p].add(s)
+ self._neighbors[p].add(p)
if len(bnodes) > 0:
return [
Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
@@ -314,7 +321,7 @@ class _TripleCanonicalizer(object):
while len(sequence) > 0 and not self._discrete(coloring):
W = sequence.pop()
for c in coloring[:]:
- if len(c.nodes) > 1:
+ if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode):
colors = sorted(c.distinguish(W, self.graph),
key=lambda x: x.key(),
reverse=True)
@@ -325,8 +332,17 @@ class _TripleCanonicalizer(object):
sequence = sequence[:si] + colors + sequence[si+1:]
except ValueError:
sequence = colors[1:] + sequence
-
- return coloring
+ combined_colors = []
+ combined_color_map = dict()
+ for color in coloring:
+ color_hash = color.hash_color()
+ # This is a hash collision, and be combined into a single color for individuation.
+ if color_hash in combined_color_map:
+ combined_color_map[color_hash].nodes.extend(color.nodes)
+ else:
+ combined_colors.append(color)
+ combined_color_map[color_hash] = color
+ return combined_colors
@_runtime("to_hash_runtime")
def to_hash(self, stats=None):
@@ -452,6 +468,8 @@ class _TripleCanonicalizer(object):
stats['color_count'] = len(coloring)
bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
+ print ("BNode Labels:")
+ print ("\n".join(['\t'.join(item) for item in bnode_labels.items()]))
if stats is not None:
stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
for triple in self.graph:
@@ -509,14 +527,14 @@ def isomorphic(graph1, graph2):
-def to_canonical_graph(g1):
+def to_canonical_graph(g1, stats=None):
"""Creates a canonical, read-only graph.
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical SHA-256 checksums, correlated with the graph contents.
"""
graph = Graph()
- graph += _TripleCanonicalizer(g1).canonical_triples()
+ graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats)
return ReadOnlyGraphAggregate([graph])
diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py
index d1e38d1a..97f49036 100644
--- a/test/test_canonicalization.py
+++ b/test/test_canonicalization.py
@@ -1,5 +1,6 @@
from rdflib import Graph, RDF, BNode, URIRef
from rdflib.compare import to_isomorphic, to_canonical_graph
+import rdflib
from six import text_type
from io import StringIO
@@ -153,6 +154,130 @@ def negative_graph_match_test():
for inputs in testInputs:
yield fn, inputs[0], inputs[1], inputs[2]
+def test_issue725_collapsing_bnodes():
+ g = rdflib.Graph()
+ g += [
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v2')),
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v0')),
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v1')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v0')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v5')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v4')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:source')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v0')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v1')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v3')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'))
+ ]
+
+ turtle = '''
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] a rdf:Statement ;
+ rdf:object [ ] ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object _:v1 ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object [ ] ;
+ rdf:predicate [ ] ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object <urn:gp_learner:fixed_var:source> ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object _:v1 ;
+ rdf:predicate [ ] ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .'''
+
+ #g = Graph()
+ #g.parse(data=turtle, format='turtle')
+
+ stats = {}
+ cg = rdflib.compare.to_canonical_graph(g, stats=stats)
+
+ print ('graph g length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
+ print ('triple_bnode degrees:')
+ for triple_bnode in g.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']):
+ print (len(list(g.triples([triple_bnode, None, None]))))
+ print ('all node out-degrees:')
+ print (sorted([len(list(g.triples([node, None, None]))) for node in g.all_nodes()]))
+ print ('all node in-degrees:')
+ print (sorted([len(list(g.triples([None, None, node]))) for node in g.all_nodes()]))
+ print(g.serialize(format='n3'))
+
+ print ('graph cg length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
+ print ('triple_bnode degrees:')
+ for triple_bnode in cg.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']):
+ print (len(list(cg.triples([triple_bnode, None, None]))))
+ print ('all node out-degrees:')
+ print (sorted([len(list(cg.triples([node, None, None]))) for node in cg.all_nodes()]))
+ print ('all node in-degrees:')
+ print (sorted([len(list(cg.triples([None, None, node]))) for node in cg.all_nodes()]))
+ print(cg.serialize(format='n3'))
+
+ assert(len(g.all_nodes()) == len(cg.all_nodes()))
+
def test_issue494_collapsing_bnodes():
"""Test for https://github.com/RDFLib/rdflib/issues/494 collapsing BNodes"""
g = Graph()