summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim McCusker <jmccusker@5amsolutions.com>2014-12-08 23:34:45 -0500
committerJim McCusker <jmccusker@5amsolutions.com>2014-12-08 23:34:45 -0500
commita4f2999df25f77f1d89846fff011a50bd5ff8a09 (patch)
tree70af5106489d54572b91a7bc58a4089550da380f
parent4be615ca52cd2487aab3c538ee706835083ba7ac (diff)
parent8e78708e0dd3a36d8e54796f78fd27957255b47a (diff)
downloadrdflib-a4f2999df25f77f1d89846fff011a50bd5ff8a09.tar.gz
Merge remote-tracking branch 'joernhees/canonicalization' into canonicalization
Conflicts: rdflib/compare.py
-rw-r--r--examples/graph_digest_benchmark.py50
-rw-r--r--rdflib/compare.py169
2 files changed, 124 insertions, 95 deletions
diff --git a/examples/graph_digest_benchmark.py b/examples/graph_digest_benchmark.py
index c7c374f8..d987cf18 100644
--- a/examples/graph_digest_benchmark.py
+++ b/examples/graph_digest_benchmark.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
'''
-This benchmark will produce graph digests for all of the
+This benchmark will produce graph digests for all of the
downloadable ontologies available in Bioportal.
'''
@@ -25,36 +25,38 @@ select distinct ?ontology ?title ?download where {
metadata:links ?links.
?links metadata:Ontology ?download.
filter(regex(?download, "/download"))
-}
+}
'''
stat_cols = [
'id',
'ontology',
'download_url',
- 'tree_depth',
- 'color_count',
- 'individuations',
+ 'tree_depth',
+ 'color_count',
+ 'individuations',
'prunings',
- 'initial_color_count',
- 'adjacent_nodes',
- 'initial_coloring_runtime',
- 'triple_count',
+ 'initial_color_count',
+ 'adjacent_nodes',
+ 'initial_coloring_runtime',
+ 'triple_count',
'graph_digest',
'to_hash_runtime',
'canonicalize_triples_runtime',
'error',
- ]
+ ]
+
def files_benchmark(ontologies, output_file, threads):
w = open(output_file, 'w')
- writer = csv.DictWriter(w,stat_cols)
+ writer = csv.DictWriter(w, stat_cols)
writer.writeheader()
tasks = Queue()
finished_tasks = Queue()
dl_lock = Semaphore(4)
task_count = len(ontologies)
- def worker(q,finished_tasks, dl_lock):
+
+ def worker(q, finished_tasks, dl_lock):
try:
while True:
stats = q.get()
@@ -73,13 +75,13 @@ def files_benchmark(ontologies, output_file, threads):
pass
for i in range(int(threads)):
print "Starting worker", i
- t = Process(target=worker, args=[tasks,finished_tasks, dl_lock])
+ t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
t.daemon = True
t.start()
for download in ontologies:
stats = defaultdict(str)
stats.update({
- "id":download.split("/")[-1].split(".")[0],
+ "id": download.split("/")[-1].split(".")[0],
"ontology": download.split("/")[-1].split(".")[0],
"download_url": download
})
@@ -88,27 +90,29 @@ def files_benchmark(ontologies, output_file, threads):
written_tasks = 0
while written_tasks < task_count:
stats = finished_tasks.get()
- #print "Writing", stats['ontology']
+ # print "Writing", stats['ontology']
writer.writerow(stats)
w.flush()
written_tasks += 1
+
def bioportal_benchmark(apikey, output_file, threads):
metadata = Namespace("http://data.bioontology.org/metadata/")
- url = 'http://data.bioontology.org/ontologies?apikey=%s'%apikey
+ url = 'http://data.bioontology.org/ontologies?apikey=%s' % apikey
ontology_graph = Graph()
print url
ontology_list_json = urlopen(url).read()
ontology_graph.parse(StringIO(unicode(ontology_list_json)), format="json-ld")
ontologies = ontology_graph.query(bioportal_query)
w = open(output_file, 'w')
- writer = csv.DictWriter(w,stat_cols)
+ writer = csv.DictWriter(w, stat_cols)
writer.writeheader()
tasks = Queue()
finished_tasks = Queue()
dl_lock = Semaphore(4)
task_count = len(ontologies)
- def worker(q,finished_tasks, dl_lock):
+
+ def worker(q, finished_tasks, dl_lock):
try:
while True:
stats = q.get()
@@ -116,7 +120,7 @@ def bioportal_benchmark(apikey, output_file, threads):
try:
try:
dl_lock.acquire()
- og.load(stats['download_url']+"?apikey=%s"%apikey)
+ og.load(stats['download_url'] + "?apikey=%s" % apikey)
finally:
dl_lock.release()
print stats['ontology'], stats['id']
@@ -131,13 +135,13 @@ def bioportal_benchmark(apikey, output_file, threads):
pass
for i in range(int(threads)):
print "Starting worker", i
- t = Process(target=worker, args=[tasks,finished_tasks, dl_lock])
+ t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
t.daemon = True
t.start()
for ontology, title, download in ontologies:
stats = defaultdict(str)
stats.update({
- "id":ontology,
+ "id": ontology,
"ontology": title,
"download_url": download
})
@@ -146,13 +150,13 @@ def bioportal_benchmark(apikey, output_file, threads):
written_tasks = 0
while written_tasks < task_count:
stats = finished_tasks.get()
- #print "Writing", stats['ontology']
+ # print "Writing", stats['ontology']
writer.writerow(stats)
w.flush()
written_tasks += 1
if __name__ == '__main__':
if len(sys.argv) > 4:
- files_benchmark(sys.argv[1:-2],sys.argv[-2],sys.argv[-1])
+ files_benchmark(sys.argv[1:-2], sys.argv[-2], sys.argv[-1])
else:
bioportal_benchmark(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 488acaf2..e23b3de3 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -5,8 +5,8 @@ A collection of utilities for canonicalizing and inspecting graphs.
Among other things, they solve of the problem of deterministic bnode
comparisons.
-Warning: the time to canonicalize bnodes may increase exponentially on
-degnerate larger graphs. Use with care!
+Warning: the time to canonicalize bnodes may increase exponentially on
+degenerate larger graphs. Use with care!
Example of comparing two graphs::
@@ -96,37 +96,42 @@ except ImportError:
from datetime import datetime
from collections import defaultdict
+
def _total_seconds(td):
result = td.days * 24 * 60 * 60
result += td.seconds
result += td.microseconds / 1000000.0
return result
+
class _runtime(object):
def __init__(self, label):
self.label = label
- def __call__(self,f):
- if self.label == None:
- self.label = f.__name__+"_runtime"
- def wrapped_f(*args,**kwargs):
+ def __call__(self, f):
+ if self.label is None:
+ self.label = f.__name__ + "_runtime"
+
+ def wrapped_f(*args, **kwargs):
start = datetime.now()
result = f(*args, **kwargs)
- if 'stats' in kwargs and kwargs['stats'] != None:
+ if 'stats' in kwargs and kwargs['stats'] is not None:
stats = kwargs['stats']
stats[self.label] = _total_seconds(datetime.now() - start)
return result
return wrapped_f
+
class _call_count(object):
def __init__(self, label):
self.label = label
- def __call__(self,f):
- if self.label == None:
- self.label = f.__name__+"_call_count"
- def wrapped_f(*args,**kwargs):
- if 'stats' in kwargs and kwargs['stats'] != None:
+ def __call__(self, f):
+ if self.label is None:
+            self.label = f.__name__ + "_call_count"
+
+ def wrapped_f(*args, **kwargs):
+ if 'stats' in kwargs and kwargs['stats'] is not None:
stats = kwargs['stats']
if self.label not in stats:
stats[self.label] = 0
@@ -134,10 +139,12 @@ class _call_count(object):
return f(*args, **kwargs)
return wrapped_f
+
class IsomorphicGraph(ConjunctiveGraph):
- """
+ """An implementation of the RGDA1 graph digest algorithm.
+
An implementation of RGDA1 (publication forthcoming),
- a combination of Sayers & Karp's graph digest algorithm using
+ a combination of Sayers & Karp's graph digest algorithm using
sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf>
and traces <http://pallini.di.uniroma1.it>, an average case
polynomial time algorithm for graph canonicalization.
@@ -162,7 +169,7 @@ class IsomorphicGraph(ConjunctiveGraph):
"""Synonym for IsomorphicGraph.internal_hash."""
return self.internal_hash(stats=stats)
- def internal_hash(self,stats=None):
+ def internal_hash(self, stats=None):
"""
This is defined instead of __hash__ to avoid a circular recursion
scenario with the Memory store for rdflib which requires a hash lookup
@@ -170,6 +177,7 @@ class IsomorphicGraph(ConjunctiveGraph):
"""
return _TripleCanonicalizer(self).to_hash(stats=stats)
+
class Color:
def __init__(self, nodes, hashfunc, color=(), hash_cache={}):
self.color = color
@@ -179,21 +187,23 @@ class Color:
self._hash_cache = hash_cache
def key(self):
- return (len(self.nodes),self.hash_color())
+ return (len(self.nodes), self.hash_color())
def hash_color(self, color=None):
- if color == None:
+ if color is None:
color = self.color
if color in self._hash_cache:
return self._hash_cache[color]
+
def stringify(x):
- if isinstance(x,Node):
+ if isinstance(x, Node):
return x.n3()
- else: return unicode(x)
+ else:
+ return unicode(x)
if isinstance(color, Node):
return stringify(color)
value = sum(map(self.hashfunc, ' '.join([stringify(x) for x in color])))
- val = u"%x"% value
+ val = u"%x" % value
self._hash_cache[color] = val
return val
@@ -202,16 +212,19 @@ class Color:
for n in self.nodes:
new_color = list(self.color)
for node in W.nodes:
- new_color += [(1,p,W.hash_color()) for s,p,o in graph.triples((n,None,node))]
- new_color += [(W.hash_color(),p,3) for s,p,o in graph.triples((node,None,n))]
+ new_color += [
+ (1, p, W.hash_color())
+ for s, p, o in graph.triples((n, None, node))]
+ new_color += [
+ (W.hash_color(), p, 3)
+ for s, p, o in graph.triples((node, None, n))]
new_color = tuple(new_color)
new_hash_color = self.hash_color(new_color)
- #print new_hash_color, n
- #print self.color, n
- #print '\t' + "\n\t".join([str(n) for n in new_color])
if new_hash_color not in colors:
- c = Color([],self.hashfunc, new_color, hash_cache=self._hash_cache)
+ c = Color(
+ [], self.hashfunc, new_color,
+ hash_cache=self._hash_cache)
colors[new_hash_color] = c
colors[new_hash_color].nodes.append(n)
return colors.values()
@@ -220,16 +233,20 @@ class Color:
return len(self.nodes) == 1
def copy(self):
- return Color(self.nodes[:], self.hashfunc, self.color,hash_cache=self._hash_cache)
+ return Color(
+ self.nodes[:], self.hashfunc, self.color,
+ hash_cache=self._hash_cache)
+
class _TripleCanonicalizer(object):
def __init__(self, graph, hashfunc=sha256):
self.graph = graph
+
def _hashfunc(s):
h = hashfunc()
h.update(unicode(s).encode("utf8"))
- return int(h.hexdigest(),16)
+ return int(h.hexdigest(), 16)
self._hash_cache = {}
self.hashfunc = _hashfunc
@@ -239,26 +256,33 @@ class _TripleCanonicalizer(object):
def _initial_color(self):
- '''Finds an initial color for the graph by finding all blank nodes and
+ """Finds an initial color for the graph.
+
+    Finds an initial color for the graph by finding all blank nodes and
non-blank nodes that are adjacent. Nodes that are not adjacent to blank
- nodes are not included, as they are a) already colored (by URI or literal)
- and b) do not factor into the color of any blank node.'''
+ nodes are not included, as they are a) already colored (by URI or literal)
+ and b) do not factor into the color of any blank node.
+ """
bnodes = set()
others = set()
self._neighbors = defaultdict(set)
- for s,p,o in self.graph:
- nodes = set([s,o])
- b = set([x for x in nodes if isinstance(x,BNode)])
+ for s, p, o in self.graph:
+ nodes = set([s, o])
+ b = set([x for x in nodes if isinstance(x, BNode)])
if len(b) > 0:
others |= nodes - b
bnodes |= b
- if isinstance(s,BNode):
+ if isinstance(s, BNode):
self._neighbors[s].add(o)
- if isinstance(o,BNode):
+ if isinstance(o, BNode):
self._neighbors[o].add(s)
if len(bnodes) > 0:
- return [Color(list(bnodes),self.hashfunc,hash_cache=self._hash_cache)]+[
- Color([x],self.hashfunc, x,hash_cache=self._hash_cache) for x in others]
+ return [
+ Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
+ ] + [
+ Color([x], self.hashfunc, x, hash_cache=self._hash_cache)
+ for x in others
+ ]
else:
return []
@@ -267,7 +291,7 @@ class _TripleCanonicalizer(object):
new_color.append((len(color.nodes)))
color.nodes.remove(individual)
- c = Color([individual],self.hashfunc, tuple(new_color),
+        c = Color([individual], self.hashfunc, tuple(new_color),
hash_cache=self._hash_cache)
return c
@@ -301,8 +325,8 @@ class _TripleCanonicalizer(object):
result = 0
for triple in self.canonical_triples(stats=stats):
result += self.hashfunc(' '.join([x.n3() for x in triple]))
- if stats != None:
- stats['graph_digest'] = "%x"% result
+ if stats is not None:
+ stats['graph_digest'] = "%x" % result
return result
def _experimental_path(self, coloring):
@@ -312,10 +336,10 @@ class _TripleCanonicalizer(object):
node = color.nodes[0]
new_color = self._individuate(color, node)
coloring.append(new_color)
- coloring = self._refine(coloring,[new_color])
+ coloring = self._refine(coloring, [new_color])
return coloring
- def _create_generator(self, colorings, groupings = None):
+ def _create_generator(self, colorings, groupings=None):
if not groupings:
groupings = defaultdict(set)
for group in zip(*colorings):
@@ -328,7 +352,7 @@ class _TripleCanonicalizer(object):
@_call_count("individuations")
def _traces(self, coloring, stats=None, depth=[0]):
- if stats != None and 'prunings' not in stats:
+ if stats is not None and 'prunings' not in stats:
stats['prunings'] = 0
depth[0] += 1
candidates = self._get_candidates(coloring)
@@ -355,28 +379,29 @@ class _TripleCanonicalizer(object):
color_copy = c_copy
new_color = self._individuate(color_copy, candidate)
coloring_copy.append(new_color)
- refined_coloring = self._refine(coloring_copy,[new_color])
+ refined_coloring = self._refine(coloring_copy, [new_color])
color_score = tuple([c.key() for c in refined_coloring])
experimental = self._experimental_path(coloring_copy)
experimental_score = set([c.key() for c in experimental])
if last_coloring:
- generator = self._create_generator([last_coloring,experimental],
- generator)
+ generator = self._create_generator(
+ [last_coloring, experimental],
+ generator)
last_coloring = experimental
- if best_score == None or best_score < color_score:
+ if best_score is None or best_score < color_score:
best = [refined_coloring]
best_score = color_score
best_experimental = experimental
best_experimental_score = experimental_score
elif best_score > color_score:
# prune this branch.
- if stats != None:
+ if stats is not None:
stats['prunings'] += 1
elif experimental_score != best_experimental_score:
best.append(refined_coloring)
else:
# prune this branch.
- if stats != None:
+ if stats is not None:
stats['prunings'] += 1
discrete = [x for x in best if self._discrete(x)]
if len(discrete) == 0:
@@ -387,7 +412,7 @@ class _TripleCanonicalizer(object):
d = [depth[0]]
new_color = self._traces(coloring, stats=stats, depth=d)
color_score = tuple([c.key() for c in refined_coloring])
- if best_score == None or color_score > best_score:
+ if best_score is None or color_score > best_score:
discrete = [new_color]
best_score = color_score
best_depth = d[0]
@@ -395,32 +420,32 @@ class _TripleCanonicalizer(object):
return discrete[0]
def canonical_triples(self, stats=None):
- if stats != None:
+ if stats is not None:
start_canonicalization = datetime.now()
- if stats != None:
+        if stats is not None:
start_coloring = datetime.now()
coloring = self._initial_color()
- if stats != None:
+ if stats is not None:
stats['triple_count'] = len(self.graph)
- stats['adjacent_nodes'] = max(0, len(coloring) - 1)
- coloring = self._refine(coloring,coloring[:])
- if stats != None:
- stats['initial_coloring__runtime'] = _total_seconds(datetime.now() - start_coloring)
+ stats['adjacent_nodes'] = max(0, len(coloring) - 1)
+ coloring = self._refine(coloring, coloring[:])
+ if stats is not None:
+ stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
stats['initial_color_count'] = len(coloring)
if not self._discrete(coloring):
depth = [0]
coloring = self._traces(coloring, stats=stats, depth=depth)
- if stats != None:
+ if stats is not None:
stats['tree_depth'] = depth[0]
- elif stats != None:
+ elif stats is not None:
stats['individuations'] = 0
stats['tree_depth'] = 0
- if stats != None:
+ if stats is not None:
stats['color_count'] = len(coloring)
bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
- if stats != None:
- stats["canonicalize_triples__runtime"] = _total_seconds(datetime.now() - start_coloring)
+ if stats is not None:
+ stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
for triple in self.graph:
result = tuple(self._canonicalize_bnodes(triple, bnode_labels))
yield result
@@ -440,9 +465,9 @@ def to_isomorphic(graph):
def isomorphic(graph1, graph2):
- """
- Compare graph for equality. Uses an algorithm to compute unique hashes
- which takes bnodes into account.
+ """Compare graph for equality.
+
+ Uses an algorithm to compute unique hashes which takes bnodes into account.
Examples::
@@ -473,11 +498,12 @@ def isomorphic(graph1, graph2):
gd1 = _TripleCanonicalizer(graph1).to_hash()
gd2 = _TripleCanonicalizer(graph2).to_hash()
return gd1 == gd2
-
+
def to_canonical_graph(g1):
- """
+ """Creates a canonical, read-only graph.
+
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical SHA-256 checksums, correlated with the graph contents.
"""
@@ -487,9 +513,7 @@ def to_canonical_graph(g1):
def graph_diff(g1, g2):
- """
- Returns three sets of triples: "in both", "in first" and "in second".
- """
+ """Returns three sets of triples: "in both", "in first" and "in second"."""
# bnodes have deterministic values in canonical graphs:
cg1 = to_canonical_graph(g1)
cg2 = to_canonical_graph(g2)
@@ -504,7 +528,8 @@ _MOCK_BNODE = BNode()
def similar(g1, g2):
- """
+ """Checks if the two graphs are "similar".
+
Checks if the two graphs are "similar", by comparing sorted triples where
all bnodes have been replaced by a singular mock bnode (the
``_MOCK_BNODE``).