summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim McCusker <jmccusker@5amsolutions.com>2014-12-08 23:34:45 -0500
committerJim McCusker <jmccusker@5amsolutions.com>2014-12-08 23:34:45 -0500
commita4f2999df25f77f1d89846fff011a50bd5ff8a09 (patch)
tree70af5106489d54572b91a7bc58a4089550da380f
parent4be615ca52cd2487aab3c538ee706835083ba7ac (diff)
parent8e78708e0dd3a36d8e54796f78fd27957255b47a (diff)
downloadrdflib-a4f2999df25f77f1d89846fff011a50bd5ff8a09.tar.gz
Merge remote-tracking branch 'joernhees/canonicalization' into canonicalization
Conflicts: rdflib/compare.py
-rw-r--r--examples/graph_digest_benchmark.py50
-rw-r--r--rdflib/compare.py169
2 files changed, 124 insertions, 95 deletions
diff --git a/examples/graph_digest_benchmark.py b/examples/graph_digest_benchmark.py
index c7c374f8..d987cf18 100644
--- a/examples/graph_digest_benchmark.py
+++ b/examples/graph_digest_benchmark.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
'''
-This benchmark will produce graph digests for all of the
+This benchmark will produce graph digests for all of the
downloadable ontologies available in Bioportal.
'''
@@ -25,36 +25,38 @@ select distinct ?ontology ?title ?download where {
metadata:links ?links.
?links metadata:Ontology ?download.
filter(regex(?download, "/download"))
-}
+}
'''
stat_cols = [
'id',
'ontology',
'download_url',
- 'tree_depth',
- 'color_count',
- 'individuations',
+ 'tree_depth',
+ 'color_count',
+ 'individuations',
'prunings',
- 'initial_color_count',
- 'adjacent_nodes',
- 'initial_coloring_runtime',
- 'triple_count',
+ 'initial_color_count',
+ 'adjacent_nodes',
+ 'initial_coloring_runtime',
+ 'triple_count',
'graph_digest',
'to_hash_runtime',
'canonicalize_triples_runtime',
'error',
- ]
+ ]
+
def files_benchmark(ontologies, output_file, threads):
w = open(output_file, 'w')
- writer = csv.DictWriter(w,stat_cols)
+ writer = csv.DictWriter(w, stat_cols)
writer.writeheader()
tasks = Queue()
finished_tasks = Queue()
dl_lock = Semaphore(4)
task_count = len(ontologies)
- def worker(q,finished_tasks, dl_lock):
+
+ def worker(q, finished_tasks, dl_lock):
try:
while True:
stats = q.get()
@@ -73,13 +75,13 @@ def files_benchmark(ontologies, output_file, threads):
pass
for i in range(int(threads)):
print "Starting worker", i
- t = Process(target=worker, args=[tasks,finished_tasks, dl_lock])
+ t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
t.daemon = True
t.start()
for download in ontologies:
stats = defaultdict(str)
stats.update({
- "id":download.split("/")[-1].split(".")[0],
+ "id": download.split("/")[-1].split(".")[0],
"ontology": download.split("/")[-1].split(".")[0],
"download_url": download
})
@@ -88,27 +90,29 @@ def files_benchmark(ontologies, output_file, threads):
written_tasks = 0
while written_tasks < task_count:
stats = finished_tasks.get()
- #print "Writing", stats['ontology']
+ # print "Writing", stats['ontology']
writer.writerow(stats)
w.flush()
written_tasks += 1
+
def bioportal_benchmark(apikey, output_file, threads):
metadata = Namespace("http://data.bioontology.org/metadata/")
- url = 'http://data.bioontology.org/ontologies?apikey=%s'%apikey
+ url = 'http://data.bioontology.org/ontologies?apikey=%s' % apikey
ontology_graph = Graph()
print url
ontology_list_json = urlopen(url).read()
ontology_graph.parse(StringIO(unicode(ontology_list_json)), format="json-ld")
ontologies = ontology_graph.query(bioportal_query)
w = open(output_file, 'w')
- writer = csv.DictWriter(w,stat_cols)
+ writer = csv.DictWriter(w, stat_cols)
writer.writeheader()
tasks = Queue()
finished_tasks = Queue()
dl_lock = Semaphore(4)
task_count = len(ontologies)
- def worker(q,finished_tasks, dl_lock):
+
+ def worker(q, finished_tasks, dl_lock):
try:
while True:
stats = q.get()
@@ -116,7 +120,7 @@ def bioportal_benchmark(apikey, output_file, threads):
try:
try:
dl_lock.acquire()
- og.load(stats['download_url']+"?apikey=%s"%apikey)
+ og.load(stats['download_url'] + "?apikey=%s" % apikey)
finally:
dl_lock.release()
print stats['ontology'], stats['id']
@@ -131,13 +135,13 @@ def bioportal_benchmark(apikey, output_file, threads):
pass
for i in range(int(threads)):
print "Starting worker", i
- t = Process(target=worker, args=[tasks,finished_tasks, dl_lock])
+ t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
t.daemon = True
t.start()
for ontology, title, download in ontologies:
stats = defaultdict(str)
stats.update({
- "id":ontology,
+ "id": ontology,
"ontology": title,
"download_url": download
})
@@ -146,13 +150,13 @@ def bioportal_benchmark(apikey, output_file, threads):
written_tasks = 0
while written_tasks < task_count:
stats = finished_tasks.get()
- #print "Writing", stats['ontology']
+ # print "Writing", stats['ontology']
writer.writerow(stats)
w.flush()
written_tasks += 1
if __name__ == '__main__':
if len(sys.argv) > 4:
- files_benchmark(sys.argv[1:-2],sys.argv[-2],sys.argv[-1])
+ files_benchmark(sys.argv[1:-2], sys.argv[-2], sys.argv[-1])
else:
bioportal_benchmark(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/rdflib/compare.py b/rdflib/compare.py
index 488acaf2..e23b3de3 100644
--- a/rdflib/compare.py
+++ b/rdflib/compare.py
@@ -5,8 +5,8 @@ A collection of utilities for canonicalizing and inspecting graphs.
Among other things, they solve of the problem of deterministic bnode
comparisons.
-Warning: the time to canonicalize bnodes may increase exponentially on
-degnerate larger graphs. Use with care!
+Warning: the time to canonicalize bnodes may increase exponentially on
+degenerate larger graphs. Use with care!
Example of comparing two graphs::
@@ -96,37 +96,42 @@ except ImportError:
from datetime import datetime
from collections import defaultdict
+
def _total_seconds(td):
result = td.days * 24 * 60 * 60
result += td.seconds
result += td.microseconds / 1000000.0
return result
+
class _runtime(object):
def __init__(self, label):
self.label = label
- def __call__(self,f):
- if self.label == None:
- self.label = f.__name__+"_runtime"
- def wrapped_f(*args,**kwargs):
+ def __call__(self, f):
+ if self.label is None:
+ self.label = f.__name__ + "_runtime"
+
+ def wrapped_f(*args, **kwargs):
start = datetime.now()
result = f(*args, **kwargs)
- if 'stats' in kwargs and kwargs['stats'] != None:
+ if 'stats' in kwargs and kwargs['stats'] is not None:
stats = kwargs['stats']
stats[self.label] = _total_seconds(datetime.now() - start)
return result
return wrapped_f
+
class _call_count(object):
def __init__(self, label):
self.label = label
- def __call__(self,f):
- if self.label == None:
- self.label = f.__name__+"_call_count"
- def wrapped_f(*args,**kwargs):
- if 'stats' in kwargs and kwargs['stats'] != None:
+ def __call__(self, f):
+ if self.label is None:
+            self.label = f.__name__ + "_call_count"
+
+ def wrapped_f(*args, **kwargs):
+ if 'stats' in kwargs and kwargs['stats'] is not None:
stats = kwargs['stats']
if self.label not in stats:
stats[self.label] = 0
@@ -134,10 +139,12 @@ class _call_count(object):
return f(*args, **kwargs)
return wrapped_f
+
class IsomorphicGraph(ConjunctiveGraph):
- """
+ """An implementation of the RGDA1 graph digest algorithm.
+
An implementation of RGDA1 (publication forthcoming),
- a combination of Sayers & Karp's graph digest algorithm using
+ a combination of Sayers & Karp's graph digest algorithm using
sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf>
and traces <http://pallini.di.uniroma1.it>, an average case
polynomial time algorithm for graph canonicalization.
@@ -162,7 +169,7 @@ class IsomorphicGraph(ConjunctiveGraph):
"""Synonym for IsomorphicGraph.internal_hash."""
return self.internal_hash(stats=stats)
- def internal_hash(self,stats=None):
+ def internal_hash(self, stats=None):
"""
This is defined instead of __hash__ to avoid a circular recursion
scenario with the Memory store for rdflib which requires a hash lookup
@@ -170,6 +177,7 @@ class IsomorphicGraph(ConjunctiveGraph):
"""
return _TripleCanonicalizer(self).to_hash(stats=stats)
+
class Color:
def __init__(self, nodes, hashfunc, color=(), hash_cache={}):
self.color = color
@@ -179,21 +187,23 @@ class Color:
self._hash_cache = hash_cache
def key(self):
- return (len(self.nodes),self.hash_color())
+ return (len(self.nodes), self.hash_color())
def hash_color(self, color=None):
- if color == None:
+ if color is None:
color = self.color
if color in self._hash_cache:
return self._hash_cache[color]
+
def stringify(x):
- if isinstance(x,Node):
+ if isinstance(x, Node):
return x.n3()
- else: return unicode(x)
+ else:
+ return unicode(x)
if isinstance(color, Node):
return stringify(color)
value = sum(map(self.hashfunc, ' '.join([stringify(x) for x in color])))
- val = u"%x"% value
+ val = u"%x" % value
self._hash_cache[color] = val
return val
@@ -202,16 +212,19 @@ class Color:
for n in self.nodes:
new_color = list(self.color)
for node in W.nodes:
- new_color += [(1,p,W.hash_color()) for s,p,o in graph.triples((n,None,node))]
- new_color += [(W.hash_color(),p,3) for s,p,o in graph.triples((node,None,n))]
+ new_color += [
+ (1, p, W.hash_color())
+ for s, p, o in graph.triples((n, None, node))]
+ new_color += [
+ (W.hash_color(), p, 3)
+ for s, p, o in graph.triples((node, None, n))]
new_color = tuple(new_color)
new_hash_color = self.hash_color(new_color)
- #print new_hash_color, n
- #print self.color, n
- #print '\t' + "\n\t".join([str(n) for n in new_color])
if new_hash_color not in colors:
- c = Color([],self.hashfunc, new_color, hash_cache=self._hash_cache)
+ c = Color(
+ [], self.hashfunc, new_color,
+ hash_cache=self._hash_cache)
colors[new_hash_color] = c
colors[new_hash_color].nodes.append(n)
return colors.values()
@@ -220,16 +233,20 @@ class Color:
return len(self.nodes) == 1
def copy(self):
- return Color(self.nodes[:], self.hashfunc, self.color,hash_cache=self._hash_cache)
+ return Color(
+ self.nodes[:], self.hashfunc, self.color,
+ hash_cache=self._hash_cache)
+
class _TripleCanonicalizer(object):
def __init__(self, graph, hashfunc=sha256):
self.graph = graph
+
def _hashfunc(s):
h = hashfunc()
h.update(unicode(s).encode("utf8"))
- return int(h.hexdigest(),16)
+ return int(h.hexdigest(), 16)
self._hash_cache = {}
self.hashfunc = _hashfunc
@@ -239,26 +256,33 @@ class _TripleCanonicalizer(object):
def _initial_color(self):
- '''Finds an initial color for the graph by finding all blank nodes and
+ """Finds an initial color for the graph.
+
+    Finds an initial color for the graph by finding all blank nodes and
non-blank nodes that are adjacent. Nodes that are not adjacent to blank
- nodes are not included, as they are a) already colored (by URI or literal)
- and b) do not factor into the color of any blank node.'''
+ nodes are not included, as they are a) already colored (by URI or literal)
+ and b) do not factor into the color of any blank node.
+ """
bnodes = set()
others = set()
self._neighbors = defaultdict(set)
- for s,p,o in self.graph:
- nodes = set([s,o])
- b = set([x for x in nodes if isinstance(x,BNode)])
+ for s, p, o in self.graph:
+ nodes = set([s, o])
+ b = set([x for x in nodes if isinstance(x, BNode)])
if len(b) > 0:
others |= nodes - b
bnodes |= b
- if isinstance(s,BNode):
+ if isinstance(s, BNode):
self._neighbors[s].add(o)
- if isinstance(o,BNode):
+ if isinstance(o, BNode):
self._neighbors[o].add(s)
if len(bnodes) > 0:
- return [Color(list(bnodes),self.hashfunc,hash_cache=self._hash_cache)]+[
- Color([x],self.hashfunc, x,hash_cache=self._hash_cache) for x in others]
+ return [
+ Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
+ ] + [
+ Color([x], self.hashfunc, x, hash_cache=self._hash_cache)
+ for x in others
+ ]
else:
return []
@@ -267,7 +291,7 @@ class _TripleCanonicalizer(object):
new_color.append((len(color.nodes)))
color.nodes.remove(individual)
- c = Color([individual],self.hashfunc, tuple(new_color),
+        c = Color([individual], self.hashfunc, tuple(new_color),
hash_cache=self._hash_cache)
return c
@@ -301,8 +325,8 @@ class _TripleCanonicalizer(object):
result = 0
for triple in self.canonical_triples(stats=stats):
result += self.hashfunc(' '.join([x.n3() for x in triple]))
- if stats != None:
- stats['graph_digest'] = "%x"% result
+ if stats is not None:
+ stats['graph_digest'] = "%x" % result
return result
def _experimental_path(self, coloring):
@@ -312,10 +336,10 @@ class _TripleCanonicalizer(object):
node = color.nodes[0]
new_color = self._individuate(color, node)
coloring.append(new_color)
- coloring = self._refine(coloring,[new_color])
+ coloring = self._refine(coloring, [new_color])
return coloring
- def _create_generator(self, colorings, groupings = None):
+ def _create_generator(self, colorings, groupings=None):
if not groupings:
groupings = defaultdict(set)
for group in zip(*colorings):
@@ -328,7 +352,7 @@ class _TripleCanonicalizer(object):
@_call_count("individuations")
def _traces(self, coloring, stats=None, depth=[0]):
- if stats != None and 'prunings' not in stats:
+ if stats is not None and 'prunings' not in stats:
stats['prunings'] = 0
depth[0] += 1
candidates = self._get_candidates(coloring)
@@ -355,28 +379,29 @@ class _TripleCanonicalizer(object):
color_copy = c_copy
new_color = self._individuate(color_copy, candidate)
coloring_copy.append(new_color)
- refined_coloring = self._refine(coloring_copy,[new_color])
+ refined_coloring = self._refine(coloring_copy, [new_color])
color_score = tuple([c.key() for c in refined_coloring])
experimental = self._experimental_path(coloring_copy)
experimental_score = set([c.key() for c in experimental])
if last_coloring:
- generator = self._create_generator([last_coloring,experimental],
- generator)
+ generator = self._create_generator(
+ [last_coloring, experimental],
+ generator)
last_coloring = experimental
- if best_score == None or best_score < color_score:
+ if best_score is None or best_score < color_score:
best = [refined_coloring]
best_score = color_score
best_experimental = experimental
best_experimental_score = experimental_score
elif best_score > color_score:
# prune this branch.
- if stats != None:
+ if stats is not None:
stats['prunings'] += 1
elif experimental_score != best_experimental_score:
best.append(refined_coloring)
else:
# prune this branch.
- if stats != None:
+ if stats is not None:
stats['prunings'] += 1
discrete = [x for x in best if self._discrete(x)]
if len(discrete) == 0:
@@ -387,7 +412,7 @@ class _TripleCanonicalizer(object):
d = [depth[0]]
new_color = self._traces(coloring, stats=stats, depth=d)
color_score = tuple([c.key() for c in refined_coloring])
- if best_score == None or color_score > best_score:
+ if best_score is None or color_score > best_score:
discrete = [new_color]
best_score = color_score
best_depth = d[0]
@@ -395,32 +420,32 @@ class _TripleCanonicalizer(object):
return discrete[0]
def canonical_triples(self, stats=None):
- if stats != None:
+ if stats is not None:
start_canonicalization = datetime.now()
- if stats != None:
+        if stats is not None:
start_coloring = datetime.now()
coloring = self._initial_color()
- if stats != None:
+ if stats is not None:
stats['triple_count'] = len(self.graph)
- stats['adjacent_nodes'] = max(0, len(coloring) - 1)
- coloring = self._refine(coloring,coloring[:])
- if stats != None:
- stats['initial_coloring__runtime'] = _total_seconds(datetime.now() - start_coloring)
+ stats['adjacent_nodes'] = max(0, len(coloring) - 1)
+ coloring = self._refine(coloring, coloring[:])
+ if stats is not None:
+ stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
stats['initial_color_count'] = len(coloring)
if not self._discrete(coloring):
depth = [0]
coloring = self._traces(coloring, stats=stats, depth=depth)
- if stats != None:
+ if stats is not None:
stats['tree_depth'] = depth[0]
- elif stats != None:
+ elif stats is not None:
stats['individuations'] = 0
stats['tree_depth'] = 0
- if stats != None:
+ if stats is not None:
stats['color_count'] = len(coloring)
bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
- if stats != None:
- stats["canonicalize_triples__runtime"] = _total_seconds(datetime.now() - start_coloring)
+ if stats is not None:
+ stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
for triple in self.graph:
result = tuple(self._canonicalize_bnodes(triple, bnode_labels))
yield result
@@ -440,9 +465,9 @@ def to_isomorphic(graph):
def isomorphic(graph1, graph2):
- """
- Compare graph for equality. Uses an algorithm to compute unique hashes
- which takes bnodes into account.
+ """Compare graph for equality.
+
+ Uses an algorithm to compute unique hashes which takes bnodes into account.
Examples::
@@ -473,11 +498,12 @@ def isomorphic(graph1, graph2):
gd1 = _TripleCanonicalizer(graph1).to_hash()
gd2 = _TripleCanonicalizer(graph2).to_hash()
return gd1 == gd2
-
+
def to_canonical_graph(g1):
- """
+ """Creates a canonical, read-only graph.
+
Creates a canonical, read-only graph where all bnode id:s are based on
deterministical SHA-256 checksums, correlated with the graph contents.
"""
@@ -487,9 +513,7 @@ def to_canonical_graph(g1):
def graph_diff(g1, g2):
- """
- Returns three sets of triples: "in both", "in first" and "in second".
- """
+ """Returns three sets of triples: "in both", "in first" and "in second"."""
# bnodes have deterministic values in canonical graphs:
cg1 = to_canonical_graph(g1)
cg2 = to_canonical_graph(g2)
@@ -504,7 +528,8 @@ _MOCK_BNODE = BNode()
def similar(g1, g2):
- """
+ """Checks if the two graphs are "similar".
+
Checks if the two graphs are "similar", by comparing sorted triples where
all bnodes have been replaced by a singular mock bnode (the
``_MOCK_BNODE``).