diff options
author | Gunnar Aastrand Grimnes <gromgull@gmail.com> | 2013-03-28 11:03:31 +0100 |
---|---|---|
committer | Gunnar Aastrand Grimnes <gromgull@gmail.com> | 2013-03-28 11:03:31 +0100 |
commit | 78d504c44fc3a44d1e753d95737bb72a8684db4a (patch) | |
tree | d2948c99dd95d07e99aec23f1da9f28cca08a93a /rdflib/term.py | |
parent | 01a8ddaac3cab4fc99e6dd0c9504b58697b0c916 (diff) | |
download | rdflib-78d504c44fc3a44d1e753d95737bb72a8684db4a.tar.gz |
integrated xml/html literal code into normal literal class
Diffstat (limited to 'rdflib/term.py')
-rw-r--r-- | rdflib/term.py | 296 |
1 files changed, 124 insertions, 172 deletions
diff --git a/rdflib/term.py b/rdflib/term.py index 0c0307a7..cdc4cc6b 100644 --- a/rdflib/term.py +++ b/rdflib/term.py @@ -29,9 +29,7 @@ __all__ = [ 'URIRef', 'BNode', - 'HTMLLiteral', 'Literal', - 'XMLLiteral', 'Variable', 'Statement', @@ -43,6 +41,7 @@ import warnings _LOGGER = logging.getLogger(__name__) import base64 +import xml.dom.minidom from urlparse import urlparse, urljoin, urldefrag from datetime import date, time, datetime @@ -657,11 +656,15 @@ class Literal(Identifier): """ py = self.toPython() - if isinstance(py, Literal): - s = super(Literal, self).__add__(val) - return Literal(s, self.language, self.datatype) - else: - return Literal(py + val) + if not isinstance(py, Literal): + try: + return Literal(py + val) + except TypeError: + pass # fall-through + + s = unicode.__add__(self, val) + return Literal(s, self.language, self.datatype) + def __nonzero__(self): """ @@ -1016,9 +1019,8 @@ class Literal(Identifier): else: if unicode.__eq__(self, other): return True - raise TypeError( - 'I cannot know that these two lexical forms do not map to the same value: %s and %s' % (self, other)) - + raise TypeError('I cannot know that these two lexical forms do not map to the same value: %s and %s' % (self, other)) + if (self.language or "").lower() != (other.language or "").lower(): return False @@ -1036,9 +1038,15 @@ class Literal(Identifier): else: return False - # matching non-string DTs + # matching non-string DTs now - do we compare values or + # lexical form first? comparing two ints is far quicker - + # maybe there are counter examples if self.value != None and other.value != None: + + if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL): + return _isEqualXMLNode(self.value, other.value) + return self.value == other.value else: @@ -1278,10 +1286,49 @@ class Literal(Identifier): return d.hexdigest() +def _parseXML(xmlstring): + retval = xml.dom.minidom.parseString( + "<rdflibtoplevelelement>%s</rdflibtoplevelelement>" % xmlstring) + retval.normalize() + return retval + +def _parseHTML(htmltext): + try: + import html5lib + parser = html5lib.HTMLParser( + tree=html5lib.treebuilders.getTreeBuilder("dom")) + retval = parser.parseFragment(htmltext) + retval.normalize() + return retval + except ImportError: + raise ImportError( + "HTML5 parser not available. Try installing" + + " html5lib <http://code.google.com/p/html5lib>") + + +def _writeXML(xmlnode): + if isinstance(xmlnode, xml.dom.minidom.DocumentFragment): + d=xml.dom.minidom.Document() + d.childNodes+=xmlnode.childNodes + xmlnode=d + s=xmlnode.toxml('utf-8') + # for clean round-tripping, remove headers -- I have great and + # specific worries that this will blow up later, but this margin + # is too narrow to contain them + if s.startswith(u'<?xml version="1.0" encoding="utf-8"?>'): + s=s[38:] + if s.startswith('<rdflibtoplevelelement>'): + s=s[23:-24] + if s=='<rdflibtoplevelelement/>': s='' + return s + # Cannot import Namespace/XSD because of circular dependencies _XSD_PFX = 'http://www.w3.org/2001/XMLSchema#' _RDF_PFX = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' +_RDF_XMLLITERAL = URIRef(_RDF_PFX+'XMLLiteral') +_RDF_HTMLLITERAL = URIRef(_RDF_PFX+'HTML') + _XSD_STRING = URIRef(_XSD_PFX + 'string') _XSD_FLOAT = URIRef(_XSD_PFX + 'float') @@ -1365,6 +1412,12 @@ _PythonToXSD = [ (datetime, (lambda i:i.isoformat(), _XSD_DATETIME)), (date, (lambda i:i.isoformat(), _XSD_DATE)), (time, (lambda i:i.isoformat(), _XSD_TIME)), + (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)), + # this is a bit dirty - by accident the html5lib parser produces + # DocumentFragments, and the xml parser Documents, letting this + # decide what datatype to use makes roundtripping easier, but it a + # bit random + (xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)) ] XSDToPython = { @@ -1395,6 +1448,8 @@ XSDToPython = { URIRef( _XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)), URIRef(_XSD_PFX + 'anyURI'): None, + _RDF_XMLLITERAL: _parseXML, + _RDF_HTMLLITERAL: _parseHTML } _toPythonMapping = {} @@ -1483,183 +1538,80 @@ class Statement(Node, tuple): _ORDERING = dict(map(reversed, enumerate([BNode, Variable, URIRef, Literal]))) -class XMLOrHTMLLiteral(Literal): - def __add__(self, val): - raise TypeError("Not a number; %s" % self) - - def __neg__(self, val): - raise TypeError("Not a number; %s" % self) - - def __abs__(self, val): - raise TypeError("Not a number; %s" % self) - - def __invert__(self, val): - raise TypeError("Not a number; %s" % self) - - def __lt__(self, val): - raise TypeError("Not a number; %s" % self) - - def __le__(self, val): - raise TypeError("Not a number; %s" % self) - - def __gt__(self, val): - raise TypeError("Not a number; %s" % self) - - def __ge__(self, val): - raise TypeError("Not a number; %s" % self) - - def toPython(self): - """ - Returns an appropriate python datatype derived from this RDF Literal - """ - return self - - @staticmethod - def _isEqualNode(node, other): - from xml.dom.minidom import Node - - def recurse(): - # Recursion through the children - # In Python2, the semantics of 'map' is such that the check on - # length would be unnecessary. In Python 3, - # the semantics of map has changed (why, oh why???) and the check - # for the length becomes necessary... - if len(node.childNodes) != len(other.childNodes): - return False - for (nc, oc) in map( - lambda x, y: (x, y), node.childNodes, other.childNodes): - if not XMLOrHTMLLiteral._isEqualNode(nc, oc): - return False - # if we got here then everything is fine: - return True - if node is None or other is None: - return False +def _isEqualXMLNode(node, other): + from xml.dom.minidom import Node - if node.nodeType != other.nodeType: + def recurse(): + # Recursion through the children + # In Python2, the semantics of 'map' is such that the check on + # length would be unnecessary. In Python 3, + # the semantics of map has changed (why, oh why???) and the check + # for the length becomes necessary... + if len(node.childNodes) != len(other.childNodes): return False - - if node.nodeType in [Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE]: - return recurse() - - elif node.nodeType == Node.ELEMENT_NODE: - # Get the basics right - if not (node.tagName == other.tagName - and node.namespaceURI == other.namespaceURI): - return False - - # Handle the (namespaced) attributes; the namespace setting key - # should be ignored, though - # Note that the minidom orders the keys already, so we do not have - # to worry about that, which is a bonus... - n_keys = [ - k for k in node.attributes.keysNS() - if k[0] != 'http://www.w3.org/2000/xmlns/'] - o_keys = [ - k for k in other.attributes.keysNS() - if k[0] != 'http://www.w3.org/2000/xmlns/'] - if len(n_keys) != len(o_keys): + for (nc, oc) in map( + lambda x, y: (x, y), node.childNodes, other.childNodes): + if not _isEqualXMLNode(nc, oc): return False - for k in n_keys: - if not (k in o_keys - and node.getAttributeNS(k[0], k[1]) == - other.getAttributeNS(k[0], k[1])): - return False - - # if we got here, the attributes are all right, we can go down - # the tree recursively - return recurse() - - elif node.nodeType in [ - Node.TEXT_NODE, Node.COMMENT_NODE, Node.CDATA_SECTION_NODE, - Node.NOTATION_NODE]: - return node.data == other.data + # if we got here then everything is fine: + return True - elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE: - return node.data == other.data and node.target == other.target + if node is None or other is None: + return False - elif node.nodeType == Node.ENTITY_NODE: - return node.nodeValue == other.nodeValue + if node.nodeType != other.nodeType: + return False - elif node.nodeType == Node.DOCUMENT_TYPE_NODE: - return node.publicId == other.publicId \ - and node.systemId == other.system.Id + if node.nodeType in [Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE]: + return recurse() - else: - # should not happen, in fact + elif node.nodeType == Node.ELEMENT_NODE: + # Get the basics right + if not (node.tagName == other.tagName + and node.namespaceURI == other.namespaceURI): return False - def __hash__(self): - """ - Cargo-culted from rdflib.term.Literal - - "Called for the key object for dictionary operations, - and by the built-in function hash(). Should return - a 32-bit integer usable as a hash value for - dictionary operations. The only required property - is that objects which compare equal have the same - hash value; it is advised to somehow mix together - (e.g., using exclusive or) the hash values for the - components of the object that also play a part in - comparison of objects." -- 3.4.1 Basic customization (Python) - - "Two literals are equal if and only if all of the following hold: - * The strings of the two lexical forms compare equal, character by - character. - * Either both or neither have language tags. - * The language tags, if any, compare equal. - * Either both or neither have datatype URIs. - * The two datatype URIs, if any, compare equal, character by - character." - -- 6.5.1 Literal Equality (RDF: Concepts and Abstract Syntax) - - """ - - return Identifier.__hash__(self) - - -class XMLLiteral(XMLOrHTMLLiteral): - def _toCompareValue(self): - from xml.dom.minidom import parseString - retval = parseString( - "<rdflibtoplevelelement>%s</rdflibtoplevelelement>" % self) - retval.normalize() - return retval - - def __eq__(self, other): - if other is not None and isinstance(other, XMLLiteral): - return XMLOrHTMLLiteral._isEqualNode( - self._cmp_value, other._cmp_value) - else: + # Handle the (namespaced) attributes; the namespace setting key + # should be ignored, though + # Note that the minidom orders the keys already, so we do not have + # to worry about that, which is a bonus... + n_keys = [ + k for k in node.attributes.keysNS() + if k[0] != 'http://www.w3.org/2000/xmlns/'] + o_keys = [ + k for k in other.attributes.keysNS() + if k[0] != 'http://www.w3.org/2000/xmlns/'] + if len(n_keys) != len(o_keys): return False + for k in n_keys: + if not (k in o_keys + and node.getAttributeNS(k[0], k[1]) == + other.getAttributeNS(k[0], k[1])): + return False - def __hash__(self): - return XMLOrHTMLLiteral.__hash__(self) + # if we got here, the attributes are all right, we can go down + # the tree recursively + return recurse() + elif node.nodeType in [ + Node.TEXT_NODE, Node.COMMENT_NODE, Node.CDATA_SECTION_NODE, + Node.NOTATION_NODE]: + return node.data == other.data -class HTMLLiteral(XMLOrHTMLLiteral): - def _toCompareValue(self): - try: - import html5lib - parser = html5lib.HTMLParser( - tree=html5lib.treebuilders.getTreeBuilder("dom")) - retval = parser.parseFragment("%s" % self) - retval.normalize() - return retval - except ImportError: - raise ImportError( - "HTML5 parser not available. Try installing" + - " html5lib <http://code.google.com/p/html5lib>") + elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE: + return node.data == other.data and node.target == other.target - def __eq__(self, other): - if other is not None and isinstance(other, HTMLLiteral): - return XMLOrHTMLLiteral._isEqualNode( - self._cmp_value, other._cmp_value) - else: - return False + elif node.nodeType == Node.ENTITY_NODE: + return node.nodeValue == other.nodeValue - def __hash__(self): - return XMLOrHTMLLiteral.__hash__(self) + elif node.nodeType == Node.DOCUMENT_TYPE_NODE: + return node.publicId == other.publicId \ + and node.systemId == other.system.Id + + else: + # should not happen, in fact + raise Exception('I dont know how to compare XML Node type: %s'%node.nodeType) if __name__ == '__main__': import doctest |