| field | value | date |
|---|---|---|
| author | Jim McCusker <jmccusker@5amsolutions.com> | 2014-12-08 23:34:45 -0500 |
| committer | Jim McCusker <jmccusker@5amsolutions.com> | 2014-12-08 23:34:45 -0500 |
| commit | a4f2999df25f77f1d89846fff011a50bd5ff8a09 (patch) | |
| tree | 70af5106489d54572b91a7bc58a4089550da380f | |
| parent | 4be615ca52cd2487aab3c538ee706835083ba7ac (diff) | |
| parent | 8e78708e0dd3a36d8e54796f78fd27957255b47a (diff) | |
| download | rdflib-a4f2999df25f77f1d89846fff011a50bd5ff8a09.tar.gz | |
Merge remote-tracking branch 'joernhees/canonicalization' into canonicalization
Conflicts:
	rdflib/compare.py
| -rw-r--r-- | examples/graph_digest_benchmark.py | 50 |
| -rw-r--r-- | rdflib/compare.py | 169 |
2 files changed, 124 insertions, 95 deletions
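Before the hunks themselves, it helps to see how the touched benchmark script is driven. Its `__main__` block (the last hunk of the first file) dispatches on argument count. A minimal usage sketch of the two entry points, assuming the script is importable; the module path, file names, and API key below are placeholders, not part of the patch:

```python
# Hypothetical driver for examples/graph_digest_benchmark.py (Python 2,
# matching the script). File names and the API key are placeholders.
from graph_digest_benchmark import files_benchmark, bioportal_benchmark

# Digest a set of local ontology files into a CSV of per-graph statistics:
files_benchmark(['pizza.owl', 'go.owl'], 'digest_stats.csv', 4)

# Or benchmark every downloadable BioPortal ontology (requires an API key):
bioportal_benchmark('YOUR-BIOPORTAL-APIKEY', 'digest_stats.csv', 4)
```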
```diff
diff --git a/examples/graph_digest_benchmark.py b/examples/graph_digest_benchmark.py
index c7c374f8..d987cf18 100644
--- a/examples/graph_digest_benchmark.py
+++ b/examples/graph_digest_benchmark.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 '''
-This benchmark will produce graph digests for all of the 
+This benchmark will produce graph digests for all of the
 downloadable ontologies available in Bioportal.
 '''
@@ -25,36 +25,38 @@ select distinct ?ontology ?title ?download where {
               metadata:links ?links.
     ?links metadata:Ontology ?download.
     filter(regex(?download, "/download"))
-} 
+}
 '''
 
 stat_cols = [
     'id',
     'ontology',
     'download_url',
-    'tree_depth', 
-    'color_count', 
-    'individuations', 
+    'tree_depth',
+    'color_count',
+    'individuations',
     'prunings',
-    'initial_color_count', 
-    'adjacent_nodes', 
-    'initial_coloring_runtime', 
-    'triple_count', 
+    'initial_color_count',
+    'adjacent_nodes',
+    'initial_coloring_runtime',
+    'triple_count',
     'graph_digest',
     'to_hash_runtime',
     'canonicalize_triples_runtime',
     'error',
-    ]
+]
+
 
 def files_benchmark(ontologies, output_file, threads):
     w = open(output_file, 'w')
-    writer = csv.DictWriter(w,stat_cols)
+    writer = csv.DictWriter(w, stat_cols)
     writer.writeheader()
     tasks = Queue()
     finished_tasks = Queue()
     dl_lock = Semaphore(4)
     task_count = len(ontologies)
-    def worker(q,finished_tasks, dl_lock):
+
+    def worker(q, finished_tasks, dl_lock):
         try:
             while True:
                 stats = q.get()
@@ -73,13 +75,13 @@ def files_benchmark(ontologies, output_file, threads):
             pass
     for i in range(int(threads)):
         print "Starting worker", i
-        t = Process(target=worker, args=[tasks,finished_tasks, dl_lock])
+        t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
         t.daemon = True
         t.start()
     for download in ontologies:
         stats = defaultdict(str)
         stats.update({
-            "id":download.split("/")[-1].split(".")[0],
+            "id": download.split("/")[-1].split(".")[0],
             "ontology": download.split("/")[-1].split(".")[0],
             "download_url": download
         })
@@ -88,27 +90,29 @@ def files_benchmark(ontologies, output_file, threads):
     written_tasks = 0
     while written_tasks < task_count:
         stats = finished_tasks.get()
-        #print "Writing", stats['ontology']
+        # print "Writing", stats['ontology']
         writer.writerow(stats)
         w.flush()
         written_tasks += 1
 
+
 def bioportal_benchmark(apikey, output_file, threads):
     metadata = Namespace("http://data.bioontology.org/metadata/")
-    url = 'http://data.bioontology.org/ontologies?apikey=%s'%apikey
+    url = 'http://data.bioontology.org/ontologies?apikey=%s' % apikey
     ontology_graph = Graph()
     print url
     ontology_list_json = urlopen(url).read()
     ontology_graph.parse(StringIO(unicode(ontology_list_json)), format="json-ld")
     ontologies = ontology_graph.query(bioportal_query)
     w = open(output_file, 'w')
-    writer = csv.DictWriter(w,stat_cols)
+    writer = csv.DictWriter(w, stat_cols)
     writer.writeheader()
     tasks = Queue()
     finished_tasks = Queue()
     dl_lock = Semaphore(4)
     task_count = len(ontologies)
-    def worker(q,finished_tasks, dl_lock):
+
+    def worker(q, finished_tasks, dl_lock):
         try:
             while True:
                 stats = q.get()
@@ -116,7 +120,7 @@ def bioportal_benchmark(apikey, output_file, threads):
                 try:
                     try:
                         dl_lock.acquire()
-                        og.load(stats['download_url']+"?apikey=%s"%apikey)
+                        og.load(stats['download_url'] + "?apikey=%s" % apikey)
                     finally:
                         dl_lock.release()
                     print stats['ontology'], stats['id']
@@ -131,13 +135,13 @@ def bioportal_benchmark(apikey, output_file, threads):
             pass
     for i in range(int(threads)):
         print "Starting worker", i
-        t = Process(target=worker, args=[tasks,finished_tasks, dl_lock])
+        t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
         t.daemon = True
         t.start()
     for ontology, title, download in ontologies:
         stats = defaultdict(str)
         stats.update({
-            "id":ontology,
+            "id": ontology,
             "ontology": title,
             "download_url": download
         })
@@ -146,13 +150,13 @@ def bioportal_benchmark(apikey, output_file, threads):
     written_tasks = 0
     while written_tasks < task_count:
         stats = finished_tasks.get()
-        #print "Writing", stats['ontology']
+        # print "Writing", stats['ontology']
         writer.writerow(stats)
         w.flush()
         written_tasks += 1
 
 if __name__ == '__main__':
     if len(sys.argv) > 4:
-        files_benchmark(sys.argv[1:-2],sys.argv[-2],sys.argv[-1])
+        files_benchmark(sys.argv[1:-2], sys.argv[-2], sys.argv[-1])
     else:
         bioportal_benchmark(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 488acaf2..e23b3de3 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -5,8 +5,8 @@ A collection of utilities for canonicalizing and inspecting graphs.
 Among other things, they solve of the problem of deterministic bnode
 comparisons.
 
-Warning: the time to canonicalize bnodes may increase exponentially on 
-degnerate larger graphs. Use with care!
+Warning: the time to canonicalize bnodes may increase exponentially on
+degenerate larger graphs. Use with care!
 
 Example of comparing two graphs::
@@ -96,37 +96,42 @@ except ImportError:
 from datetime import datetime
 from collections import defaultdict
 
+
 def _total_seconds(td):
     result = td.days * 24 * 60 * 60
     result += td.seconds
     result += td.microseconds / 1000000.0
     return result
 
+
 class _runtime(object):
     def __init__(self, label):
         self.label = label
 
-    def __call__(self,f):
-        if self.label == None:
-            self.label = f.__name__+"_runtime"
-        def wrapped_f(*args,**kwargs):
+    def __call__(self, f):
+        if self.label is None:
+            self.label = f.__name__ + "_runtime"
+
+        def wrapped_f(*args, **kwargs):
             start = datetime.now()
             result = f(*args, **kwargs)
-            if 'stats' in kwargs and kwargs['stats'] != None:
+            if 'stats' in kwargs and kwargs['stats'] is not None:
                 stats = kwargs['stats']
                 stats[self.label] = _total_seconds(datetime.now() - start)
             return result
         return wrapped_f
 
+
 class _call_count(object):
     def __init__(self, label):
         self.label = label
 
-    def __call__(self,f):
-        if self.label == None:
-            self.label = f.__name__+"_call_count"
-        def wrapped_f(*args,**kwargs):
-            if 'stats' in kwargs and kwargs['stats'] != None:
+    def __call__(self, f):
+        if self.label is None:
+            self.label = f.__name__ + "_runtime"
+
+        def wrapped_f(*args, **kwargs):
+            if 'stats' in kwargs and kwargs['stats'] is not None:
                 stats = kwargs['stats']
                 if self.label not in stats:
                     stats[self.label] = 0
@@ -134,10 +139,12 @@ class _call_count(object):
                 return f(*args, **kwargs)
         return wrapped_f
 
+
 class IsomorphicGraph(ConjunctiveGraph):
-    """
+    """An implementation of the RGDA1 graph digest algorithm.
+
     An implementation of RGDA1 (publication forthcoming),
-    a combination of Sayers & Karp's graph digest algorithm using 
+    a combination of Sayers & Karp's graph digest algorithm using
     sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf>
     and traces <http://pallini.di.uniroma1.it>, an average case
     polynomial time algorithm for graph canonicalization.
@@ -162,7 +169,7 @@ class IsomorphicGraph(ConjunctiveGraph):
         """Synonym for IsomorphicGraph.internal_hash."""
         return self.internal_hash(stats=stats)
 
-    def internal_hash(self,stats=None):
+    def internal_hash(self, stats=None):
         """
         This is defined instead of __hash__ to avoid a circular recursion
         scenario with the Memory store for rdflib which requires a hash lookup
@@ -170,6 +177,7 @@
         """
         return _TripleCanonicalizer(self).to_hash(stats=stats)
 
+
 class Color:
     def __init__(self, nodes, hashfunc, color=(), hash_cache={}):
         self.color = color
@@ -179,21 +187,23 @@ class Color:
         self._hash_cache = hash_cache
 
     def key(self):
-        return (len(self.nodes),self.hash_color())
+        return (len(self.nodes), self.hash_color())
 
     def hash_color(self, color=None):
-        if color == None:
+        if color is None:
             color = self.color
         if color in self._hash_cache:
             return self._hash_cache[color]
+
        def stringify(x):
-            if isinstance(x,Node):
+            if isinstance(x, Node):
                 return x.n3()
-            else: return unicode(x)
+            else:
+                return unicode(x)
         if isinstance(color, Node):
             return stringify(color)
         value = sum(map(self.hashfunc, ' '.join([stringify(x) for x in color])))
-        val = u"%x"% value
+        val = u"%x" % value
         self._hash_cache[color] = val
         return val
@@ -202,16 +212,19 @@ class Color:
         for n in self.nodes:
             new_color = list(self.color)
             for node in W.nodes:
-                new_color += [(1,p,W.hash_color()) for s,p,o in graph.triples((n,None,node))]
-                new_color += [(W.hash_color(),p,3) for s,p,o in graph.triples((node,None,n))]
+                new_color += [
+                    (1, p, W.hash_color())
+                    for s, p, o in graph.triples((n, None, node))]
+                new_color += [
+                    (W.hash_color(), p, 3)
+                    for s, p, o in graph.triples((node, None, n))]
             new_color = tuple(new_color)
             new_hash_color = self.hash_color(new_color)
-            #print new_hash_color, n
-            #print self.color, n
-            #print '\t' + "\n\t".join([str(n) for n in new_color])
             if new_hash_color not in colors:
-                c = Color([],self.hashfunc, new_color, hash_cache=self._hash_cache)
+                c = Color(
+                    [], self.hashfunc, new_color,
+                    hash_cache=self._hash_cache)
                 colors[new_hash_color] = c
             colors[new_hash_color].nodes.append(n)
         return colors.values()
@@ -220,16 +233,20 @@ class Color:
         return len(self.nodes) == 1
 
     def copy(self):
-        return Color(self.nodes[:], self.hashfunc, self.color,hash_cache=self._hash_cache)
+        return Color(
+            self.nodes[:], self.hashfunc, self.color,
+            hash_cache=self._hash_cache)
+
 
 class _TripleCanonicalizer(object):
 
     def __init__(self, graph, hashfunc=sha256):
         self.graph = graph
+
         def _hashfunc(s):
             h = hashfunc()
             h.update(unicode(s).encode("utf8"))
-            return int(h.hexdigest(),16)
+            return int(h.hexdigest(), 16)
         self._hash_cache = {}
         self.hashfunc = _hashfunc
@@ -239,26 +256,33 @@ class _TripleCanonicalizer(object):
 
     def _initial_color(self):
-        '''Finds an initial color for the graph by finding all blank nodes and
+        """Finds an initial color for the graph.
+
+        Finds an initial color for the graph by finding all blank nodes and
         non-blank nodes that are adjacent. Nodes that are not adjacent to blank
-        nodes are not included, as they are a) already colored (by URI or literal)
-        and b) do not factor into the color of any blank node.'''
+        nodes are not included, as they are a) already colored (by URI or literal)
+        and b) do not factor into the color of any blank node.
+        """
         bnodes = set()
         others = set()
         self._neighbors = defaultdict(set)
-        for s,p,o in self.graph:
-            nodes = set([s,o])
-            b = set([x for x in nodes if isinstance(x,BNode)])
+        for s, p, o in self.graph:
+            nodes = set([s, o])
+            b = set([x for x in nodes if isinstance(x, BNode)])
             if len(b) > 0:
                 others |= nodes - b
                 bnodes |= b
-                if isinstance(s,BNode):
+                if isinstance(s, BNode):
                     self._neighbors[s].add(o)
-                if isinstance(o,BNode):
+                if isinstance(o, BNode):
                     self._neighbors[o].add(s)
         if len(bnodes) > 0:
-            return [Color(list(bnodes),self.hashfunc,hash_cache=self._hash_cache)]+[
-                Color([x],self.hashfunc, x,hash_cache=self._hash_cache) for x in others]
+            return [
+                Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
+            ] + [
+                Color([x], self.hashfunc, x, hash_cache=self._hash_cache)
+                for x in others
+            ]
         else:
             return []
@@ -267,7 +291,7 @@ class _TripleCanonicalizer(object):
         new_color.append((len(color.nodes)))
         color.nodes.remove(individual)
-        c = Color([individual],self.hashfunc, tuple(new_color),
+        c = Color([individual], self.hashfunc, tuple(new_color),
                   hash_cache=self._hash_cache)
         return c
@@ -301,8 +325,8 @@ class _TripleCanonicalizer(object):
         result = 0
         for triple in self.canonical_triples(stats=stats):
             result += self.hashfunc(' '.join([x.n3() for x in triple]))
-        if stats != None:
-            stats['graph_digest'] = "%x"% result
+        if stats is not None:
+            stats['graph_digest'] = "%x" % result
         return result
 
     def _experimental_path(self, coloring):
@@ -312,10 +336,10 @@
             node = color.nodes[0]
             new_color = self._individuate(color, node)
             coloring.append(new_color)
-            coloring = self._refine(coloring,[new_color])
+            coloring = self._refine(coloring, [new_color])
         return coloring
 
-    def _create_generator(self, colorings, groupings = None):
+    def _create_generator(self, colorings, groupings=None):
         if not groupings:
             groupings = defaultdict(set)
         for group in zip(*colorings):
@@ -328,7 +352,7 @@
 
     @_call_count("individuations")
     def _traces(self, coloring, stats=None, depth=[0]):
-        if stats != None and 'prunings' not in stats:
+        if stats is not None and 'prunings' not in stats:
             stats['prunings'] = 0
         depth[0] += 1
         candidates = self._get_candidates(coloring)
@@ -355,28 +379,29 @@
                     color_copy = c_copy
             new_color = self._individuate(color_copy, candidate)
             coloring_copy.append(new_color)
-            refined_coloring = self._refine(coloring_copy,[new_color])
+            refined_coloring = self._refine(coloring_copy, [new_color])
             color_score = tuple([c.key() for c in refined_coloring])
             experimental = self._experimental_path(coloring_copy)
             experimental_score = set([c.key() for c in experimental])
             if last_coloring:
-                generator = self._create_generator([last_coloring,experimental],
-                                                   generator)
+                generator = self._create_generator(
+                    [last_coloring, experimental],
+                    generator)
             last_coloring = experimental
-            if best_score == None or best_score < color_score:
+            if best_score is None or best_score < color_score:
                 best = [refined_coloring]
                 best_score = color_score
                 best_experimental = experimental
                 best_experimental_score = experimental_score
             elif best_score > color_score:
                 # prune this branch.
-                if stats != None:
+                if stats is not None:
                     stats['prunings'] += 1
             elif experimental_score != best_experimental_score:
                 best.append(refined_coloring)
             else:
                 # prune this branch.
-                if stats != None:
+                if stats is not None:
                     stats['prunings'] += 1
         discrete = [x for x in best if self._discrete(x)]
         if len(discrete) == 0:
@@ -387,7 +412,7 @@
                 d = [depth[0]]
                 new_color = self._traces(coloring, stats=stats, depth=d)
                 color_score = tuple([c.key() for c in refined_coloring])
-                if best_score == None or color_score > best_score:
+                if best_score is None or color_score > best_score:
                     discrete = [new_color]
                     best_score = color_score
                     best_depth = d[0]
@@ -395,32 +420,32 @@
         return discrete[0]
 
     def canonical_triples(self, stats=None):
-        if stats != None:
+        if stats is not None:
             start_canonicalization = datetime.now()
-        if stats != None:
+        if stats is not None:
             start_coloring = datetime.now()
         coloring = self._initial_color()
-        if stats != None:
+        if stats is not None:
             stats['triple_count'] = len(self.graph)
-            stats['adjacent_nodes'] = max(0, len(coloring) - 1) 
-        coloring = self._refine(coloring,coloring[:])
-        if stats != None:
-            stats['initial_coloring__runtime'] = _total_seconds(datetime.now() - start_coloring)
+            stats['adjacent_nodes'] = max(0, len(coloring) - 1)
+        coloring = self._refine(coloring, coloring[:])
+        if stats is not None:
+            stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
             stats['initial_color_count'] = len(coloring)
         if not self._discrete(coloring):
             depth = [0]
             coloring = self._traces(coloring, stats=stats, depth=depth)
-            if stats != None:
+            if stats is not None:
                 stats['tree_depth'] = depth[0]
-        elif stats != None:
+        elif stats is not None:
             stats['individuations'] = 0
             stats['tree_depth'] = 0
-        if stats != None:
+        if stats is not None:
             stats['color_count'] = len(coloring)
 
         bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
-        if stats != None:
-            stats["canonicalize_triples__runtime"] = _total_seconds(datetime.now() - start_coloring)
+        if stats is not None:
+            stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
         for triple in self.graph:
             result = tuple(self._canonicalize_bnodes(triple, bnode_labels))
             yield result
@@ -440,9 +465,9 @@ def to_isomorphic(graph):
 
 
 def isomorphic(graph1, graph2):
-    """
-    Compare graph for equality. Uses an algorithm to compute unique hashes
-    which takes bnodes into account.
+    """Compare graph for equality.
+
+    Uses an algorithm to compute unique hashes which takes bnodes into account.
 
     Examples::
@@ -473,11 +498,12 @@ def isomorphic(graph1, graph2):
     gd1 = _TripleCanonicalizer(graph1).to_hash()
     gd2 = _TripleCanonicalizer(graph2).to_hash()
     return gd1 == gd2
-    
+
 
 def to_canonical_graph(g1):
-    """
+    """Creates a canonical, read-only graph.
+
     Creates a canonical, read-only graph where all bnode id:s are based on
     deterministical SHA-256 checksums, correlated with the graph contents.
     """
@@ -487,9 +513,7 @@ def graph_diff(g1, g2):
-    """
-    Returns three sets of triples: "in both", "in first" and "in second".
-    """
+    """Returns three sets of triples: "in both", "in first" and "in second"."""
     # bnodes have deterministic values in canonical graphs:
     cg1 = to_canonical_graph(g1)
     cg2 = to_canonical_graph(g2)
@@ -504,7 +528,8 @@ _MOCK_BNODE = BNode()
 
 
 def similar(g1, g2):
-    """
+    """Checks if the two graphs are "similar".
+
     Checks if the two graphs are "similar", by comparing sorted triples where
     all bnodes have been replaced by a singular mock bnode (the
     ``_MOCK_BNODE``).
```
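The public surface of `rdflib.compare` touched by this merge is easiest to see in use. A minimal sketch, assuming the merged code above, with `graph_digest` taken as the stats-aware synonym for `internal_hash` whose body appears in the `IsomorphicGraph` hunks; the Turtle data is illustrative only:

```python
# Minimal sketch of the rdflib.compare API exercised by this patch
# (Python 2 era, matching the codebase). The Turtle data is illustrative.
from rdflib import Graph
from rdflib.compare import to_isomorphic, isomorphic, graph_diff

g1 = Graph().parse(data='<urn:a> <urn:p> [ <urn:q> "x" ] .', format='turtle')
g2 = Graph().parse(data='<urn:a> <urn:p> [ <urn:q> "y" ] .', format='turtle')

# Digest-based equality, stable under blank-node relabeling.
print isomorphic(g1, g1)  # True
print isomorphic(g1, g2)  # False

# graph_diff canonicalizes internally, so bnode ids line up across graphs.
in_both, in_first, in_second = graph_diff(g1, g2)

# The stats dict threaded through the canonicalizer collects the counters
# the benchmark writes out (graph_digest, prunings, individuations, ...).
stats = {}
digest = to_isomorphic(g1).graph_digest(stats=stats)
print stats['graph_digest']
```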