summaryrefslogtreecommitdiff
path: root/rdflib/term.py
diff options
context:
space:
mode:
authorGunnar Aastrand Grimnes <gromgull@gmail.com>2013-03-28 11:03:31 +0100
committerGunnar Aastrand Grimnes <gromgull@gmail.com>2013-03-28 11:03:31 +0100
commit78d504c44fc3a44d1e753d95737bb72a8684db4a (patch)
treed2948c99dd95d07e99aec23f1da9f28cca08a93a /rdflib/term.py
parent01a8ddaac3cab4fc99e6dd0c9504b58697b0c916 (diff)
downloadrdflib-78d504c44fc3a44d1e753d95737bb72a8684db4a.tar.gz
integrated xml/html literal code into normal literal class
Diffstat (limited to 'rdflib/term.py')
-rw-r--r--rdflib/term.py296
1 files changed, 124 insertions, 172 deletions
diff --git a/rdflib/term.py b/rdflib/term.py
index 0c0307a7..cdc4cc6b 100644
--- a/rdflib/term.py
+++ b/rdflib/term.py
@@ -29,9 +29,7 @@ __all__ = [
'URIRef',
'BNode',
- 'HTMLLiteral',
'Literal',
- 'XMLLiteral',
'Variable',
'Statement',
@@ -43,6 +41,7 @@ import warnings
_LOGGER = logging.getLogger(__name__)
import base64
+import xml.dom.minidom
from urlparse import urlparse, urljoin, urldefrag
from datetime import date, time, datetime
@@ -657,11 +656,15 @@ class Literal(Identifier):
"""
py = self.toPython()
- if isinstance(py, Literal):
- s = super(Literal, self).__add__(val)
- return Literal(s, self.language, self.datatype)
- else:
- return Literal(py + val)
+ if not isinstance(py, Literal):
+ try:
+ return Literal(py + val)
+ except TypeError:
+ pass # fall-through
+
+ s = unicode.__add__(self, val)
+ return Literal(s, self.language, self.datatype)
+
def __nonzero__(self):
"""
@@ -1016,9 +1019,8 @@ class Literal(Identifier):
else:
if unicode.__eq__(self, other):
return True
- raise TypeError(
- 'I cannot know that these two lexical forms do not map to the same value: %s and %s' % (self, other))
-
+ raise TypeError('I cannot know that these two lexical forms do not map to the same value: %s and %s' % (self, other))
+
if (self.language or "").lower() != (other.language or "").lower():
return False
@@ -1036,9 +1038,15 @@ class Literal(Identifier):
else:
return False
- # matching non-string DTs
+ # matching non-string DTs now - do we compare values or
+ # lexical form first? comparing two ints is far quicker -
+ # maybe there are counter examples
if self.value != None and other.value != None:
+
+ if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL):
+ return _isEqualXMLNode(self.value, other.value)
+
return self.value == other.value
else:
@@ -1278,10 +1286,49 @@ class Literal(Identifier):
return d.hexdigest()
+def _parseXML(xmlstring):
+    """
+    Parse a string of XML markup into a normalized minidom Document.
+
+    The markup is wrapped in a synthetic <rdflibtoplevelelement> element
+    before parsing, so the parser always sees exactly one root element.
+    """
+    wrapped = "<rdflibtoplevelelement>%s</rdflibtoplevelelement>" % xmlstring
+    document = xml.dom.minidom.parseString(wrapped)
+    document.normalize()
+    return document
+
+def _parseHTML(htmltext):
+    """
+    Parse an HTML fragment with html5lib into a normalized DOM tree.
+
+    Raises ImportError (with an installation hint) if html5lib cannot
+    be imported.
+    """
+    # Guard only the import: an ImportError raised while building the
+    # parser or parsing must not be reported as "html5lib is missing".
+    try:
+        import html5lib
+    except ImportError:
+        raise ImportError(
+            "HTML5 parser not available. Try installing" +
+            " html5lib <http://code.google.com/p/html5lib>")
+    parser = html5lib.HTMLParser(
+        tree=html5lib.treebuilders.getTreeBuilder("dom"))
+    retval = parser.parseFragment(htmltext)
+    retval.normalize()
+    return retval
+
+
+def _writeXML(xmlnode):
+    """
+    Serialize a minidom node to UTF-8 encoded markup.
+
+    For a DocumentFragment, its children are appended to a fresh
+    Document and that Document is serialized instead. The XML
+    declaration and the synthetic <rdflibtoplevelelement> wrapper are
+    stripped from the result.
+    """
+    xml_header = u'<?xml version="1.0" encoding="utf-8"?>'
+    open_tag = '<rdflibtoplevelelement>'
+    close_tag = '</rdflibtoplevelelement>'
+
+    if isinstance(xmlnode, xml.dom.minidom.DocumentFragment):
+        holder = xml.dom.minidom.Document()
+        holder.childNodes += xmlnode.childNodes
+        xmlnode = holder
+    markup = xmlnode.toxml('utf-8')
+
+    # For clean round-tripping, remove the header and the wrapper.
+    # NOTE: this depends on the exact text the serializer emits and is
+    # a known fragile spot.
+    if markup.startswith(xml_header):
+        markup = markup[len(xml_header):]
+    if markup.startswith(open_tag):
+        markup = markup[len(open_tag):-len(close_tag)]
+    if markup == '<rdflibtoplevelelement/>':
+        markup = ''
+    return markup
+
# Cannot import Namespace/XSD because of circular dependencies
_XSD_PFX = 'http://www.w3.org/2001/XMLSchema#'
_RDF_PFX = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+_RDF_XMLLITERAL = URIRef(_RDF_PFX+'XMLLiteral')
+_RDF_HTMLLITERAL = URIRef(_RDF_PFX+'HTML')
+
_XSD_STRING = URIRef(_XSD_PFX + 'string')
_XSD_FLOAT = URIRef(_XSD_PFX + 'float')
@@ -1365,6 +1412,12 @@ _PythonToXSD = [
(datetime, (lambda i:i.isoformat(), _XSD_DATETIME)),
(date, (lambda i:i.isoformat(), _XSD_DATE)),
(time, (lambda i:i.isoformat(), _XSD_TIME)),
+ (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
+ # This is a bit dirty - by accident the html5lib parser produces
+ # DocumentFragments and the xml parser produces Documents. Letting
+ # this decide which datatype to use makes roundtripping easier, but
+ # it is a bit random.
+ (xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL))
]
XSDToPython = {
@@ -1395,6 +1448,8 @@ XSDToPython = {
URIRef(
_XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)),
URIRef(_XSD_PFX + 'anyURI'): None,
+ _RDF_XMLLITERAL: _parseXML,
+ _RDF_HTMLLITERAL: _parseHTML
}
_toPythonMapping = {}
@@ -1483,183 +1538,80 @@ class Statement(Node, tuple):
_ORDERING = dict(map(reversed, enumerate([BNode, Variable, URIRef, Literal])))
-class XMLOrHTMLLiteral(Literal):
- def __add__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __neg__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __abs__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __invert__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __lt__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __le__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __gt__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def __ge__(self, val):
- raise TypeError("Not a number; %s" % self)
-
- def toPython(self):
- """
- Returns an appropriate python datatype derived from this RDF Literal
- """
- return self
-
- @staticmethod
- def _isEqualNode(node, other):
- from xml.dom.minidom import Node
-
- def recurse():
- # Recursion through the children
- # In Python2, the semantics of 'map' is such that the check on
- # length would be unnecessary. In Python 3,
- # the semantics of map has changed (why, oh why???) and the check
- # for the length becomes necessary...
- if len(node.childNodes) != len(other.childNodes):
- return False
- for (nc, oc) in map(
- lambda x, y: (x, y), node.childNodes, other.childNodes):
- if not XMLOrHTMLLiteral._isEqualNode(nc, oc):
- return False
- # if we got here then everything is fine:
- return True
- if node is None or other is None:
- return False
+def _isEqualXMLNode(node, other):
+    """
+    Compare two minidom nodes, including their subtrees, for equality.
+
+    Returns False if either node is None or the node types differ.
+    Documents and fragments are compared child by child; elements are
+    also compared by tag name, namespace and attributes, ignoring
+    namespace declarations. Raises Exception for a node type this
+    function does not know how to compare.
+    """
+    from xml.dom.minidom import Node
-        if node.nodeType != other.nodeType:
+    def recurse():
+        # Recursion through the children.
+        # zip() stops at the shorter sequence, so the lengths must be
+        # compared first; otherwise extra trailing children on one side
+        # would be ignored.
+        if len(node.childNodes) != len(other.childNodes):
             return False
-
-        if node.nodeType in [Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE]:
-            return recurse()
-
-        elif node.nodeType == Node.ELEMENT_NODE:
-            # Get the basics right
-            if not (node.tagName == other.tagName
-                    and node.namespaceURI == other.namespaceURI):
-                return False
-
-            # Handle the (namespaced) attributes; the namespace setting key
-            # should be ignored, though
-            # Note that the minidom orders the keys already, so we do not have
-            # to worry about that, which is a bonus...
-            n_keys = [
-                k for k in node.attributes.keysNS()
-                if k[0] != 'http://www.w3.org/2000/xmlns/']
-            o_keys = [
-                k for k in other.attributes.keysNS()
-                if k[0] != 'http://www.w3.org/2000/xmlns/']
-            if len(n_keys) != len(o_keys):
+        for nc, oc in zip(node.childNodes, other.childNodes):
+            if not _isEqualXMLNode(nc, oc):
                 return False
-            for k in n_keys:
-                if not (k in o_keys
-                        and node.getAttributeNS(k[0], k[1]) ==
-                        other.getAttributeNS(k[0], k[1])):
-                    return False
-
-            # if we got here, the attributes are all right, we can go down
-            # the tree recursively
-            return recurse()
-
-        elif node.nodeType in [
-                Node.TEXT_NODE, Node.COMMENT_NODE, Node.CDATA_SECTION_NODE,
-                Node.NOTATION_NODE]:
-            return node.data == other.data
+        # if we got here then everything is fine:
+        return True
-        elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
-            return node.data == other.data and node.target == other.target
+    if node is None or other is None:
+        return False
-        elif node.nodeType == Node.ENTITY_NODE:
-            return node.nodeValue == other.nodeValue
+    if node.nodeType != other.nodeType:
+        return False
-        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            return node.publicId == other.publicId \
-                and node.systemId == other.system.Id
+    if node.nodeType in [Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE]:
+        return recurse()
-        else:
-            # should not happen, in fact
+    elif node.nodeType == Node.ELEMENT_NODE:
+        # Get the basics right
+        if not (node.tagName == other.tagName
+                and node.namespaceURI == other.namespaceURI):
             return False
-    def __hash__(self):
-        """
-        Cargo-culted from rdflib.term.Literal
-
-        "Called for the key object for dictionary operations,
-        and by the built-in function hash(). Should return
-        a 32-bit integer usable as a hash value for
-        dictionary operations. The only required property
-        is that objects which compare equal have the same
-        hash value; it is advised to somehow mix together
-        (e.g., using exclusive or) the hash values for the
-        components of the object that also play a part in
-        comparison of objects." -- 3.4.1 Basic customization (Python)
-
-        "Two literals are equal if and only if all of the following hold:
-        * The strings of the two lexical forms compare equal, character by
-        character.
-        * Either both or neither have language tags.
-        * The language tags, if any, compare equal.
-        * Either both or neither have datatype URIs.
-        * The two datatype URIs, if any, compare equal, character by
-        character."
-        -- 6.5.1 Literal Equality (RDF: Concepts and Abstract Syntax)
-
-        """
-
-        return Identifier.__hash__(self)
-
-
-class XMLLiteral(XMLOrHTMLLiteral):
-    def _toCompareValue(self):
-        from xml.dom.minidom import parseString
-        retval = parseString(
-            "<rdflibtoplevelelement>%s</rdflibtoplevelelement>" % self)
-        retval.normalize()
-        return retval
-
-    def __eq__(self, other):
-        if other is not None and isinstance(other, XMLLiteral):
-            return XMLOrHTMLLiteral._isEqualNode(
-                self._cmp_value, other._cmp_value)
-        else:
+        # Handle the (namespaced) attributes; the namespace setting key
+        # should be ignored, though
+        # Note that the minidom orders the keys already, so we do not have
+        # to worry about that, which is a bonus...
+        n_keys = [
+            k for k in node.attributes.keysNS()
+            if k[0] != 'http://www.w3.org/2000/xmlns/']
+        o_keys = [
+            k for k in other.attributes.keysNS()
+            if k[0] != 'http://www.w3.org/2000/xmlns/']
+        if len(n_keys) != len(o_keys):
             return False
+        for k in n_keys:
+            if not (k in o_keys
+                    and node.getAttributeNS(k[0], k[1]) ==
+                    other.getAttributeNS(k[0], k[1])):
+                return False
-    def __hash__(self):
-        return XMLOrHTMLLiteral.__hash__(self)
+        # if we got here, the attributes are all right, we can go down
+        # the tree recursively
+        return recurse()
+    elif node.nodeType in [
+            Node.TEXT_NODE, Node.COMMENT_NODE, Node.CDATA_SECTION_NODE,
+            Node.NOTATION_NODE]:
+        return node.data == other.data
-class HTMLLiteral(XMLOrHTMLLiteral):
-    def _toCompareValue(self):
-        try:
-            import html5lib
-            parser = html5lib.HTMLParser(
-                tree=html5lib.treebuilders.getTreeBuilder("dom"))
-            retval = parser.parseFragment("%s" % self)
-            retval.normalize()
-            return retval
-        except ImportError:
-            raise ImportError(
-                "HTML5 parser not available. Try installing" +
-                " html5lib <http://code.google.com/p/html5lib>")
+    elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
+        return node.data == other.data and node.target == other.target
-    def __eq__(self, other):
-        if other is not None and isinstance(other, HTMLLiteral):
-            return XMLOrHTMLLiteral._isEqualNode(
-                self._cmp_value, other._cmp_value)
-        else:
-            return False
+    elif node.nodeType == Node.ENTITY_NODE:
+        return node.nodeValue == other.nodeValue
-    def __hash__(self):
-        return XMLOrHTMLLiteral.__hash__(self)
+    elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
+        # Both sides must read the systemId attribute; the previous
+        # "other.system.Id" raised AttributeError.
+        return node.publicId == other.publicId \
+            and node.systemId == other.systemId
+
+    else:
+        # should not happen, in fact
+        raise Exception('I dont know how to compare XML Node type: %s'%node.nodeType)
if __name__ == '__main__':
import doctest