integrated xml/html literal code into normal literal class

author: Gunnar Aastrand Grimnes <gromgull@gmail.com> 2013-03-28 11:03:31 +0100
committer: Gunnar Aastrand Grimnes <gromgull@gmail.com> 2013-03-28 11:03:31 +0100
commit: 78d504c44fc3a44d1e753d95737bb72a8684db4a (patch)
tree: d2948c99dd95d07e99aec23f1da9f28cca08a93a /rdflib/term.py
parent: 01a8ddaac3cab4fc99e6dd0c9504b58697b0c916 (diff)
download: rdflib-78d504c44fc3a44d1e753d95737bb72a8684db4a.tar.gz
1 files changed, 124 insertions, 172 deletions
diff --git a/rdflib/term.py b/rdflib/term.py
index 0c0307a7..cdc4cc6b 100644
--- a/rdflib/term.py
+++ b/rdflib/term.py
@@ -29,9 +29,7 @@ __all__ = [
 
     'URIRef',
     'BNode',
-    'HTMLLiteral',
     'Literal',
-    'XMLLiteral',
 
     'Variable',
     'Statement',
@@ -43,6 +41,7 @@ import warnings
 _LOGGER = logging.getLogger(__name__)
 
 import base64
+import xml.dom.minidom 
 
 from urlparse import urlparse, urljoin, urldefrag
 from datetime import date, time, datetime
@@ -657,11 +656,15 @@ class Literal(Identifier):
         """
 
         py = self.toPython()
-        if isinstance(py, Literal):
-            s = super(Literal, self).__add__(val)
-            return Literal(s, self.language, self.datatype)
-        else:
-            return Literal(py + val)
+        if not isinstance(py, Literal):
+            try: 
+                return Literal(py + val)
+            except TypeError:
+                pass # fall-through
+
+        s = unicode.__add__(self, val)
+        return Literal(s, self.language, self.datatype)
+                
 
     def __nonzero__(self):
         """
@@ -1016,9 +1019,8 @@ class Literal(Identifier):
                 else:
                     if unicode.__eq__(self, other):
                         return True
-                    raise TypeError(
-                        'I cannot know that these two lexical forms do not map to the same value: %s and %s' % (self, other))
-
+                    raise TypeError('I cannot know that these two lexical forms do not map to the same value: %s and %s' % (self, other))
+                
             if (self.language or "").lower() != (other.language or "").lower():
                 return False
 
@@ -1036,9 +1038,15 @@ class Literal(Identifier):
                 else:
                     return False
 
-            # matching non-string DTs
+            # matching non-string DTs now - do we compare values or
+            # lexical form first?  comparing two ints is far quicker -
+            # maybe there are counter examples
 
             if self.value != None and other.value != None:
+
+                if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL):
+                    return _isEqualXMLNode(self.value, other.value)
+                
                 return self.value == other.value
             else:
 
@@ -1278,10 +1286,49 @@ class Literal(Identifier):
         return d.hexdigest()
 
 
+def _parseXML(xmlstring):
+    retval = xml.dom.minidom.parseString(
+        "<rdflibtoplevelelement>%s</rdflibtoplevelelement>" % xmlstring)
+    retval.normalize()
+    return retval
+
+def _parseHTML(htmltext):
+    try:
+        import html5lib
+        parser = html5lib.HTMLParser(
+            tree=html5lib.treebuilders.getTreeBuilder("dom"))
+        retval = parser.parseFragment(htmltext)
+        retval.normalize()
+        return retval
+    except ImportError:
+        raise ImportError(
+            "HTML5 parser not available. Try installing" +
+            " html5lib <http://code.google.com/p/html5lib>")
+
+
+def _writeXML(xmlnode): 
+    if isinstance(xmlnode, xml.dom.minidom.DocumentFragment):
+        d=xml.dom.minidom.Document()
+        d.childNodes+=xmlnode.childNodes
+        xmlnode=d
+    s=xmlnode.toxml('utf-8')
+    # for clean round-tripping, remove headers -- I have great and
+    # specific worries that this will blow up later, but this margin
+    # is too narrow to contain them
+    if s.startswith(u'<?xml version="1.0" encoding="utf-8"?>'): 
+        s=s[38:]
+    if s.startswith('<rdflibtoplevelelement>'):
+        s=s[23:-24]
+    if s=='<rdflibtoplevelelement/>': s=''
+    return s
+
 # Cannot import Namespace/XSD because of circular dependencies
 _XSD_PFX = 'http://www.w3.org/2001/XMLSchema#'
 _RDF_PFX = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
 
+_RDF_XMLLITERAL = URIRef(_RDF_PFX+'XMLLiteral')
+_RDF_HTMLLITERAL = URIRef(_RDF_PFX+'HTML')
+
 _XSD_STRING = URIRef(_XSD_PFX + 'string')
 
 _XSD_FLOAT = URIRef(_XSD_PFX + 'float')
@@ -1365,6 +1412,12 @@ _PythonToXSD = [
     (datetime, (lambda i:i.isoformat(), _XSD_DATETIME)),
     (date, (lambda i:i.isoformat(), _XSD_DATE)),
     (time, (lambda i:i.isoformat(), _XSD_TIME)),
+    (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
+    # this is a bit dirty - by accident the html5lib parser produces
+    # DocumentFragments, and the xml parser Documents, letting this
+    # decide what datatype to use makes roundtripping easier, but it a
+    # bit random
+    (xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)) 
 ]
 
 XSDToPython = {
@@ -1395,6 +1448,8 @@ XSDToPython = {
     URIRef(
         _XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)),
     URIRef(_XSD_PFX + 'anyURI'): None,
+    _RDF_XMLLITERAL: _parseXML, 
+    _RDF_HTMLLITERAL: _parseHTML
 }
 
 _toPythonMapping = {}
@@ -1483,183 +1538,80 @@ class Statement(Node, tuple):
 _ORDERING = dict(map(reversed, enumerate([BNode, Variable, URIRef, Literal])))
 
 
-class XMLOrHTMLLiteral(Literal):
-    def __add__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __neg__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __abs__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __invert__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __lt__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __le__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __gt__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def __ge__(self, val):
-        raise TypeError("Not a number; %s" % self)
-
-    def toPython(self):
-        """
-        Returns an appropriate python datatype derived from this RDF Literal
-        """
-        return self
-
-    @staticmethod
-    def _isEqualNode(node, other):
-        from xml.dom.minidom import Node
-
-        def recurse():
-            # Recursion through the children
-            # In Python2, the semantics of 'map' is such that the check on
-            # length would be unnecessary. In Python 3,
-            # the semantics of map has changed (why, oh why???) and the check
-            # for the length becomes necessary...
-            if len(node.childNodes) != len(other.childNodes):
-                return False
-            for (nc, oc) in map(
-                    lambda x, y: (x, y), node.childNodes, other.childNodes):
-                if not XMLOrHTMLLiteral._isEqualNode(nc, oc):
-                    return False
-            # if we got here then everything is fine:
-            return True
 
-        if node is None or other is None:
-            return False
+def _isEqualXMLNode(node, other):
+    from xml.dom.minidom import Node
 
-        if node.nodeType != other.nodeType:
+    def recurse():
+        # Recursion through the children
+        # In Python2, the semantics of 'map' is such that the check on
+        # length would be unnecessary. In Python 3,
+        # the semantics of map has changed (why, oh why???) and the check
+        # for the length becomes necessary...
+        if len(node.childNodes) != len(other.childNodes):
             return False
-
-        if node.nodeType in [Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE]:
-            return recurse()
-
-        elif node.nodeType == Node.ELEMENT_NODE:
-            # Get the basics right
-            if not (node.tagName == other.tagName
-                    and node.namespaceURI == other.namespaceURI):
-                return False
-
-            # Handle the (namespaced) attributes; the namespace setting key
-            # should be ignored, though
-            # Note that the minidom orders the keys already, so we do not have
-            # to worry about that, which is a bonus...
-            n_keys = [
-                k for k in node.attributes.keysNS()
-                if k[0] != 'http://www.w3.org/2000/xmlns/']
-            o_keys = [
-                k for k in other.attributes.keysNS()
-                if k[0] != 'http://www.w3.org/2000/xmlns/']
-            if len(n_keys) != len(o_keys):
+        for (nc, oc) in map(
+                lambda x, y: (x, y), node.childNodes, other.childNodes):
+            if not _isEqualXMLNode(nc, oc):
                 return False
-            for k in n_keys:
-                if not (k in o_keys
-                        and node.getAttributeNS(k[0], k[1]) ==
-                        other.getAttributeNS(k[0], k[1])):
-                    return False
-
-            # if we got here, the attributes are all right, we can go down
-            # the tree recursively
-            return recurse()
-
-        elif node.nodeType in [
-                Node.TEXT_NODE, Node.COMMENT_NODE, Node.CDATA_SECTION_NODE,
-                Node.NOTATION_NODE]:
-            return node.data == other.data
+        # if we got here then everything is fine:
+        return True
 
-        elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
-            return node.data == other.data and node.target == other.target
+    if node is None or other is None:
+        return False
 
-        elif node.nodeType == Node.ENTITY_NODE:
-            return node.nodeValue == other.nodeValue
+    if node.nodeType != other.nodeType:
+        return False
 
-        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            return node.publicId == other.publicId \
-                and node.systemId == other.system.Id
+    if node.nodeType in [Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE]:
+        return recurse()
 
-        else:
-            # should not happen, in fact
+    elif node.nodeType == Node.ELEMENT_NODE:
+        # Get the basics right
+        if not (node.tagName == other.tagName
+                and node.namespaceURI == other.namespaceURI):
             return False
 
-    def __hash__(self):
-        """
-        Cargo-culted from rdflib.term.Literal
-
-        "Called for the key object for dictionary operations,
-        and by the built-in function hash(). Should return
-        a 32-bit integer usable as a hash value for
-        dictionary operations. The only required property
-        is that objects which compare equal have the same
-        hash value; it is advised to somehow mix together
-        (e.g., using exclusive or) the hash values for the
-        components of the object that also play a part in
-        comparison of objects." -- 3.4.1 Basic customization (Python)
-
-        "Two literals are equal if and only if all of the following hold:
-        * The strings of the two lexical forms compare equal, character by
-        character.
-        * Either both or neither have language tags.
-        * The language tags, if any, compare equal.
-        * Either both or neither have datatype URIs.
-        * The two datatype URIs, if any, compare equal, character by
-        character."
-        -- 6.5.1 Literal Equality (RDF: Concepts and Abstract Syntax)
-
-        """
-
-        return Identifier.__hash__(self)
-
-
-class XMLLiteral(XMLOrHTMLLiteral):
-    def _toCompareValue(self):
-        from xml.dom.minidom import parseString
-        retval = parseString(
-            "<rdflibtoplevelelement>%s</rdflibtoplevelelement>" % self)
-        retval.normalize()
-        return retval
-
-    def __eq__(self, other):
-        if other is not None and isinstance(other, XMLLiteral):
-            return XMLOrHTMLLiteral._isEqualNode(
-                self._cmp_value, other._cmp_value)
-        else:
+        # Handle the (namespaced) attributes; the namespace setting key
+        # should be ignored, though
+        # Note that the minidom orders the keys already, so we do not have
+        # to worry about that, which is a bonus...
+        n_keys = [
+            k for k in node.attributes.keysNS()
+            if k[0] != 'http://www.w3.org/2000/xmlns/']
+        o_keys = [
+            k for k in other.attributes.keysNS()
+            if k[0] != 'http://www.w3.org/2000/xmlns/']
+        if len(n_keys) != len(o_keys):
             return False
+        for k in n_keys:
+            if not (k in o_keys
+                    and node.getAttributeNS(k[0], k[1]) ==
+                    other.getAttributeNS(k[0], k[1])):
+                return False
 
-    def __hash__(self):
-        return XMLOrHTMLLiteral.__hash__(self)
+        # if we got here, the attributes are all right, we can go down
+        # the tree recursively
+        return recurse()
 
+    elif node.nodeType in [
+            Node.TEXT_NODE, Node.COMMENT_NODE, Node.CDATA_SECTION_NODE,
+            Node.NOTATION_NODE]:
+        return node.data == other.data
 
-class HTMLLiteral(XMLOrHTMLLiteral):
-    def _toCompareValue(self):
-        try:
-            import html5lib
-            parser = html5lib.HTMLParser(
-                tree=html5lib.treebuilders.getTreeBuilder("dom"))
-            retval = parser.parseFragment("%s" % self)
-            retval.normalize()
-            return retval
-        except ImportError:
-            raise ImportError(
-                "HTML5 parser not available. Try installing" +
-                " html5lib <http://code.google.com/p/html5lib>")
+    elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
+        return node.data == other.data and node.target == other.target
 
-    def __eq__(self, other):
-        if other is not None and isinstance(other, HTMLLiteral):
-            return XMLOrHTMLLiteral._isEqualNode(
-                self._cmp_value, other._cmp_value)
-        else:
-            return False
+    elif node.nodeType == Node.ENTITY_NODE:
+        return node.nodeValue == other.nodeValue
 
-    def __hash__(self):
-        return XMLOrHTMLLiteral.__hash__(self)
+    elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
+        return node.publicId == other.publicId \
+            and node.systemId == other.system.Id
+
+    else:
+        # should not happen, in fact
+        raise Exception('I dont know how to compare XML Node type: %s'%node.nodeType)
 
 if __name__ == '__main__':
     import doctest
author	Gunnar Aastrand Grimnes <gromgull@gmail.com>	2013-03-28 11:03:31 +0100
committer	Gunnar Aastrand Grimnes <gromgull@gmail.com>	2013-03-28 11:03:31 +0100
commit	78d504c44fc3a44d1e753d95737bb72a8684db4a (patch)
tree	d2948c99dd95d07e99aec23f1da9f28cca08a93a /rdflib/term.py
parent	01a8ddaac3cab4fc99e6dd0c9504b58697b0c916 (diff)
download	rdflib-78d504c44fc3a44d1e753d95737bb72a8684db4a.tar.gz