summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoern Hees <dev@joernhees.de>2017-04-09 20:19:39 +0200
committerJoern Hees <dev@joernhees.de>2017-04-09 20:19:39 +0200
commit47b04545676fc953e96f189d3d312f42e3d64cc3 (patch)
tree9b6affba4170bda64a02938936b50e686814a9a2
parent6ab6174c7ec81bdd1bee18f2526c34fbb345e81a (diff)
parent6b5bd6367baf75d8af8022290b869e02dfb75236 (diff)
downloadrdflib-47b04545676fc953e96f189d3d312f42e3d64cc3.tar.gz
Merge remote-tracking branch 'upstream/pull/730' into fix_canonicalization
* upstream/pull/730: Removed extraneous debug statements. Fixed #725 by fully refining all bnode colors and combining color groups if there is a hash collision.
-rw-r--r--rdflib/compare.py28
-rw-r--r--test/test_canonicalization.py127
2 files changed, 149 insertions, 6 deletions
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 5e3f5994..97de047b 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -194,6 +194,10 @@ class Color:
self.hashfunc = hashfunc
self._hash_color = None
+ def __str__(self):
+ nodes, color = self.key()
+ return "Color %s (%s nodes)" % (color, nodes)
+
def key(self):
return (len(self.nodes), self.hash_color())
@@ -277,7 +281,7 @@ class _TripleCanonicalizer(object):
others = set()
self._neighbors = defaultdict(set)
for s, p, o in self.graph:
- nodes = set([s, o])
+ nodes = set([s, p, o])
b = set([x for x in nodes if isinstance(x, BNode)])
if len(b) > 0:
others |= nodes - b
@@ -286,6 +290,9 @@ class _TripleCanonicalizer(object):
self._neighbors[s].add(o)
if isinstance(o, BNode):
self._neighbors[o].add(s)
+ if isinstance(p, BNode):
+ self._neighbors[p].add(s)
+ self._neighbors[p].add(p)
if len(bnodes) > 0:
return [
Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
@@ -317,7 +324,7 @@ class _TripleCanonicalizer(object):
while len(sequence) > 0 and not self._discrete(coloring):
W = sequence.pop()
for c in coloring[:]:
- if len(c.nodes) > 1:
+ if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode):
colors = sorted(c.distinguish(W, self.graph),
key=lambda x: x.key(),
reverse=True)
@@ -328,8 +335,17 @@ class _TripleCanonicalizer(object):
sequence = sequence[:si] + colors + sequence[si+1:]
except ValueError:
sequence = colors[1:] + sequence
-
- return coloring
+ combined_colors = []
+ combined_color_map = dict()
+ for color in coloring:
+ color_hash = color.hash_color()
+ # This is a hash collision, and be combined into a single color for individuation.
+ if color_hash in combined_color_map:
+ combined_color_map[color_hash].nodes.extend(color.nodes)
+ else:
+ combined_colors.append(color)
+ combined_color_map[color_hash] = color
+ return combined_colors
@_runtime("to_hash_runtime")
def to_hash(self, stats=None):
@@ -515,14 +531,14 @@ def isomorphic(graph1, graph2):
-def to_canonical_graph(g1):
+def to_canonical_graph(g1, stats=None):
"""Creates a canonical, read-only graph.
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical SHA-256 checksums, correlated with the graph contents.
"""
graph = Graph()
- graph += _TripleCanonicalizer(g1).canonical_triples()
+ graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats)
return ReadOnlyGraphAggregate([graph])
diff --git a/test/test_canonicalization.py b/test/test_canonicalization.py
index 2cdeda2c..bb856218 100644
--- a/test/test_canonicalization.py
+++ b/test/test_canonicalization.py
@@ -1,7 +1,10 @@
from collections import Counter
from rdflib import Graph, RDF, BNode, URIRef, Namespace, ConjunctiveGraph, Literal
from rdflib.compare import to_isomorphic, to_canonical_graph
+
+import rdflib
from rdflib.plugins.memory import IOMemory
+
from six import text_type
from io import StringIO
@@ -155,6 +158,130 @@ def negative_graph_match_test():
for inputs in testInputs:
yield fn, inputs[0], inputs[1], inputs[2]
+def test_issue725_collapsing_bnodes():
+ g = rdflib.Graph()
+ g += [
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v2')),
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v0')),
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N0a76d42406b84fe4b8029d0a7fa04244'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v1')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v0')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N2f62af5936b94a8eb4b1e4bfa8e11d95'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v5')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v4')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N5ae541f93e1d4e5880450b1bdceb6404'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:source')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v0')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('N86ac7ca781f546ae939b8963895f672e'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#object'),
+ rdflib.term.BNode('v1')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate'),
+ rdflib.term.BNode('v3')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#subject'),
+ rdflib.term.URIRef(u'urn:gp_learner:fixed_var:target')),
+ (rdflib.term.BNode('Nac82b883ca3849b5ab6820b7ac15e490'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
+ rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement'))
+ ]
+
+ turtle = '''
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+[] a rdf:Statement ;
+ rdf:object [ ] ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object _:v1 ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object [ ] ;
+ rdf:predicate [ ] ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object <urn:gp_learner:fixed_var:source> ;
+ rdf:predicate _:v0 ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .
+
+[] a rdf:Statement ;
+ rdf:object _:v1 ;
+ rdf:predicate [ ] ;
+ rdf:subject <urn:gp_learner:fixed_var:target> .'''
+
+ #g = Graph()
+ #g.parse(data=turtle, format='turtle')
+
+ stats = {}
+ cg = rdflib.compare.to_canonical_graph(g, stats=stats)
+
+ print ('graph g length: %d, nodes: %d' % (len(g), len(g.all_nodes())))
+ print ('triple_bnode degrees:')
+ for triple_bnode in g.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']):
+ print (len(list(g.triples([triple_bnode, None, None]))))
+ print ('all node out-degrees:')
+ print (sorted([len(list(g.triples([node, None, None]))) for node in g.all_nodes()]))
+ print ('all node in-degrees:')
+ print (sorted([len(list(g.triples([None, None, node]))) for node in g.all_nodes()]))
+ print(g.serialize(format='n3'))
+
+ print ('graph cg length: %d, nodes: %d' % (len(cg), len(cg.all_nodes())))
+ print ('triple_bnode degrees:')
+ for triple_bnode in cg.subjects(rdflib.RDF['type'], rdflib.RDF['Statement']):
+ print (len(list(cg.triples([triple_bnode, None, None]))))
+ print ('all node out-degrees:')
+ print (sorted([len(list(cg.triples([node, None, None]))) for node in cg.all_nodes()]))
+ print ('all node in-degrees:')
+ print (sorted([len(list(cg.triples([None, None, node]))) for node in cg.all_nodes()]))
+ print(cg.serialize(format='n3'))
+
+ assert(len(g.all_nodes()) == len(cg.all_nodes()))
+
def test_issue494_collapsing_bnodes():
"""Test for https://github.com/RDFLib/rdflib/issues/494 collapsing BNodes"""
g = Graph()