# topological.py # Copyright (C) 2005,2006 Michael Bayer mike_mp@zzzcomputing.com # # This module is part of SQLAlchemy and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php """topological sorting algorithms. the key to the unit of work is to assemble a list of dependencies amongst all the different mappers that have been defined for classes. Related tables with foreign key constraints have a definite insert order, deletion order, objects need dependent properties from parent objects set up before saved, etc. These are all encoded as dependencies, in the form "mapper X is dependent on mapper Y", meaning mapper Y's objects must be saved before those of mapper X, and mapper X's objects must be deleted before those of mapper Y. The topological sort is an algorithm that receives this list of dependencies as a "partial ordering", that is a list of pairs which might say, "X is dependent on Y", "Q is dependent on Z", but does not necessarily tell you anything about Q being dependent on X. Therefore, its not a straight sort where every element can be compared to another...only some of the elements have any sorting preference, and then only towards just some of the other elements. For a particular partial ordering, there can be many possible sorts that satisfy the conditions. An intrinsic "gotcha" to this algorithm is that since there are many possible outcomes to sorting a partial ordering, the algorithm can return any number of different results for the same input; just running it on a different machine architecture, or just random differences in the ordering of dictionaries, can change the result that is returned. While this result is guaranteed to be true to the incoming partial ordering, if the partial ordering itself does not properly represent the dependencies, code that works fine will suddenly break, then work again, then break, etc. Most of the bugs I've chased down while developing the "unit of work" have been of this nature - very tricky to reproduce and track down, particularly before I realized this characteristic of the algorithm. """ import string, StringIO from sets import * import sqlalchemy.util as util from sqlalchemy.exceptions import * class QueueDependencySorter(object): """this is a topological sort from wikipedia. its very stable. it creates a straight-line list of elements, then a second pass groups non-dependent actions together to build more of a tree structure with siblings.""" class Node: """represents a node in a tree. stores an 'item' which represents the dependent thing we are talking about. if node 'a' is an ancestor node of node 'b', it means 'a's item is *not* dependent on that of 'b'.""" def __init__(self, item): self.item = item self.edges = {} self.dependencies = {} self.children = [] self.cycles = None def __str__(self): return self.safestr() def safestr(self, indent=0): return (' ' * indent) + "%s (idself=%s)" % (str(self.item), repr(id(self))) + repr(self.cycles) + "\n" + string.join([n.safestr(indent + 1) for n in self.children], '') def describe(self): return "%s (idself=%s)" % (str(self.item), repr(id(self))) def __repr__(self): return self.describe() def __init__(self, tuples, allitems): self.tuples = tuples self.allitems = allitems def _dump_edges(self, edges): s = StringIO.StringIO() for key, value in edges.iteritems(): for c in value.keys(): s.write("%s->%s\n" % (repr(key), repr(c))) return s.getvalue() def sort(self, allow_self_cycles=True, allow_all_cycles=False): (tuples, allitems) = (self.tuples, self.allitems) #print "\n---------------------------------\n" #print repr([t for t in tuples]) #print repr([a for a in allitems]) #print "\n---------------------------------\n" nodes = {} edges = {} for item in allitems + [t[0] for t in tuples] + [t[1] for t in tuples]: if not nodes.has_key(item): node = QueueDependencySorter.Node(item) nodes[item] = node edges[node] = {} for t in tuples: if t[0] is t[1]: if allow_self_cycles: n = nodes[t[0]] n.cycles = Set([n]) continue else: raise CommitError("Self-referential dependency detected " + repr(t)) childnode = nodes[t[1]] parentnode = nodes[t[0]] self._add_edge(edges, (parentnode, childnode)) queue = [] for n in nodes.values(): if len(n.edges) == 0: queue.append(n) cycles = {} output = [] while len(edges) > 0: #print self._dump_edges(edges) if len(queue) == 0: # edges remain but no edgeless nodes to remove; this indicates # a cycle if allow_all_cycles: cycle = self._find_cycle(edges) lead = cycle[0][0] lead.cycles = Set() for edge in cycle: n = self._remove_edge(edges, edge) lead.cycles.add(edge[0]) lead.cycles.add(edge[1]) if n is not None: queue.append(n) if n is not lead: n._cyclical = True # loop through cycle # remove edges from the edge dictionary # install the cycled nodes in the "cycle" list of one of the nodes continue else: # long cycles not allowed raise CommitError("Circular dependency detected " + repr(edges) + repr(queue)) node = queue.pop() if not hasattr(node, '_cyclical'): output.append(node) nodeedges = edges.pop(node, None) if nodeedges is None: continue for childnode in nodeedges.keys(): del childnode.edges[node] if len(childnode.edges) == 0: queue.append(childnode) #print repr(output) head = None node = None for o in output: if head is None: head = o node = o else: for x in node.children: if x.dependencies.has_key(o): node = x node.children.append(o) return head def _add_edge(self, edges, edge): (parentnode, childnode) = edge edges[parentnode][childnode] = True parentnode.dependencies[childnode] = True childnode.edges[parentnode] = True def _remove_edge(self, edges, edge): (parentnode, childnode) = edge del edges[parentnode][childnode] del childnode.edges[parentnode] del parentnode.dependencies[childnode] if len(childnode.edges) == 0: return childnode def _find_cycle(self, edges): """given a structure of edges, locates a cycle in the strucure and returns as a list of tuples representing edges involved in the cycle.""" seen = Set() cycled_edges = [] def traverse(d, parent=None): for key in d.keys(): if not edges.has_key(key): continue if key in seen: if parent is not None: cycled_edges.append((parent, key)) return key seen.add(key) x = traverse(edges[key], parent=key) if x is None: seen.remove(key) else: if parent is not None: cycled_edges.append((parent, key)) return x else: return None s = traverse(edges) if s is None: return None else: return cycled_edges class TreeDependencySorter(object): """ this is my first topological sorting algorithm. its crazy, but matched my thinking at the time. it also creates the kind of structure I want. but, I am not 100% sure it works in all cases since I always did really poorly in linear algebra. anyway, I got the other one above to produce a tree structure too so we should be OK. """ class Node: """represents a node in a tree. stores an 'item' which represents the dependent thing we are talking about. if node 'a' is an ancestor node of node 'b', it means 'a's item is *not* dependent on that of 'b'.""" def __init__(self, item): #print "new node on " + str(item) self.item = item self.children = HashSet() self.parent = None def append(self, node): """appends the given node as a child on this node. removes the node from its preexisting parent.""" if node.parent is not None: del node.parent.children[node] self.children.append(node) node.parent = self def is_descendant_of(self, node): """returns true if this node is a descendant of the given node""" n = self while n is not None: if n is node: return True else: n = n.parent return False def get_root(self): """returns the highest ancestor node of this node, i.e. which has no parent""" n = self while n.parent is not None: n = n.parent return n def get_sibling_ancestor(self, node): """returns the node which is: - an ancestor of this node - is a sibling of the given node - not an ancestor of the given node - else returns this node's root node.""" n = self while n.parent is not None and n.parent is not node.parent and not node.is_descendant_of(n.parent): n = n.parent return n def __str__(self): return self.safestr({}) def safestr(self, hash, indent = 0): if hash.has_key(self): return (' ' * indent) + "RECURSIVE:%s(%s, %s)" % (str(self.item), repr(id(self)), self.parent and repr(id(self.parent)) or 'None') hash[self] = True return (' ' * indent) + "%s (idself=%s, idparent=%s)" % (str(self.item), repr(id(self)), self.parent and repr(id(self.parent)) or "None") + "\n" + string.join([n.safestr(hash, indent + 1) for n in self.children], '') def describe(self): return "%s (idself=%s)" % (str(self.item), repr(id(self))) def __init__(self, tuples, allitems): self.tuples = tuples self.allitems = allitems def sort(self): (tuples, allitems) = (self.tuples, self.allitems) nodes = {} # make nodes for all the items and store in the hash for item in allitems + [t[0] for t in tuples] + [t[1] for t in tuples]: if not nodes.has_key(item): nodes[item] = TreeDependencySorter.Node(item) # loop through tuples for tup in tuples: (parent, child) = (tup[0], tup[1]) # get parent node parentnode = nodes[parent] # if parent is child, mark "circular" attribute on the node if parent is child: parentnode.circular = True # and just continue continue # get child node childnode = nodes[child] if parentnode.parent is childnode: # check for "a switch" t = parentnode.item parentnode.item = childnode.item childnode.item = t nodes[parentnode.item] = parentnode nodes[childnode.item] = childnode elif parentnode.is_descendant_of(childnode): # check for a line thats backwards with nodes in between, this is a # circular dependency (although confirmation on this would be helpful) raise CommitError("Circular dependency detected") elif not childnode.is_descendant_of(parentnode): # if relationship doesnt exist, connect nodes together root = childnode.get_sibling_ancestor(parentnode) parentnode.append(root) # now we have a collection of subtrees which represent dependencies. # go through the collection root nodes wire them together into one tree head = None for node in nodes.values(): if node.parent is None: if head is not None: head.append(node) else: head = node #print str(head) return head