| author | Ivan Herman <ivan@ivan-herman.net> | 2012-08-31 16:28:50 +0200 |
|---|---|---|
| committer | Ivan Herman <ivan@ivan-herman.net> | 2012-08-31 16:28:50 +0200 |
| commit | 068f21b442452bdf1a245fd2bba24cf845419e9e (patch) | |
| tree | 0f7a81fe0ec61d8a07b38280a4a07318cd6d617c | |
| parent | 49bbca6409f35d7764f7d77d5b483e2e873d2084 (diff) | |
| download | rdflib-068f21b442452bdf1a245fd2bba24cf845419e9e.tar.gz | |
Added the updated RDFa 1.1 parser, the microdata parser, and a common 'structured data' parser as a separate branch. The code is, hopefully, prepared for Python 3 as well, though no proper testing can be done until a Python 3 version of the html5 parser becomes available.
30 files changed, 8790 insertions(+), 6 deletions(-)
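Before the diff itself, a note on how these changes surface to users: the `rdflib/plugin.py` hunk below re-routes the `rdfa` format key and the `text/html` / `application/xhtml+xml` media types, and adds the new `rdfa1.0`, `rdfa1.1`, `mdata`, `microdata`, and `html` keys. A minimal sketch of the intended use through rdflib's ordinary `Graph.parse()` entry point (the file names are hypothetical, Python 2 style to match the code in this commit):

```python
import rdflib

graph = rdflib.Graph()
# 'microdata' (alias 'mdata') is served by the new MicrodataParser wrapper
graph.parse("page.html", format="microdata")
# 'rdfa1.1' routes to the new structured-data module; plain 'rdfa' stays on
# the old RDFa 1.0 parser for backward compatibility (see the comment below)
graph.parse("page.xhtml", format="rdfa1.1")
print graph.serialize(format="turtle")
```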
diff --git a/rdflib/plugin.py b/rdflib/plugin.py index 7a012e9f..95fd357a 100644 --- a/rdflib/plugin.py +++ b/rdflib/plugin.py @@ -147,10 +147,6 @@ register("nquads", Serializer, register('application/rdf+xml', Parser, 'rdflib.plugins.parsers.rdfxml', 'RDFXMLParser') -register('text/html', Parser, - 'rdflib.plugins.parsers.rdfa', 'RDFaParser') -register('application/xhtml+xml', Parser, - 'rdflib.plugins.parsers.rdfa', 'RDFaParser') register('xml', Parser, 'rdflib.plugins.parsers.rdfxml', 'RDFXMLParser') register('n3', Parser, @@ -163,5 +159,17 @@ register('nquads', Parser, 'rdflib.plugins.parsers.nquads', 'NQuadsParser') register('trix', Parser, 'rdflib.plugins.parsers.trix', 'TriXParser') -register('rdfa', Parser, - 'rdflib.plugins.parsers.rdfa', 'RDFaParser') + +# Various combinations of structured data in files: microdata, RDFa +# Big question whether the distribution should point at the RDFa 1.1 parser for 'rdfa' or stay, +# for backward compatibility, with the old 1.0 version... +register('rdfa', Parser, 'rdflib.plugins.parsers.rdfa', 'RDFaParser') +register('rdfa1.0', Parser, 'rdflib.plugins.parsers.rdfa', 'RDFaParser') +# Various combinations with 1.1 +register('text/html', Parser, 'rdflib.plugins.parsers.structureddata', 'StructuredDataParser') +register('application/xhtml+xml', Parser, 'rdflib.plugins.parsers.structureddata', 'RDFaParser') +register('application/svg+xml', Parser, 'rdflib.plugins.parsers.structureddata', 'RDFaParser') +register('rdfa1.1', Parser, 'rdflib.plugins.parsers.structureddata', 'RDFaParser') +register('mdata', Parser, 'rdflib.plugins.parsers.structureddata', 'MicrodataParser') +register('microdata', Parser, 'rdflib.plugins.parsers.structureddata', 'MicrodataParser') +register('html', Parser, 'rdflib.plugins.parsers.structureddata', 'StructuredDataParser') diff --git a/rdflib/plugins/parsers/pyMicrodata/__init__.py b/rdflib/plugins/parsers/pyMicrodata/__init__.py new file mode 100644 index 00000000..38c14ce1 --- /dev/null +++ b/rdflib/plugins/parsers/pyMicrodata/__init__.py @@ -0,0 +1,423 @@ +# -*- coding: utf-8 -*- +""" + +This module implements the microdata->RDF algorithm, as documented by the U{W3C Semantic Web Interest Group +Note<http://www.w3.org/TR/2012/NOTE-microdata-rdf-20120308/>}. + +The module can be used via a stand-alone script (an example is part of the distribution) or bound to a CGI script as a Web Service. An example CGI script is also added to the distribution. Both the local script and the distribution may have to be adapted to local circumstances. + +(Simple) Usage +============== +From a Python file, expecting a Turtle output:: + from pyMicrodata import pyMicrodata + print pyMicrodata().rdf_from_source('filename') +Other output formats are also possible. E.g., to produce RDF/XML output, one could use:: + from pyMicrodata import pyMicrodata + print pyMicrodata().rdf_from_source('filename', outputFormat='pretty-xml') +It is also possible to embed the microdata processing into a larger application. E.g., using:: + from pyMicrodata import pyMicrodata + graph = pyMicrodata().graph_from_source('filename') +returns an RDFLib.Graph object instead of a serialization thereof. See the description of the +L{pyMicrodata class<pyMicrodata.pyMicrodata>} for further details on possible entry points. + +There is also, as part of this module, a L{separate entry for CGI calls<processURI>}. + +Return formats +-------------- + +By default, the output format for the graph is RDF/XML.
At present, the following formats are also available (with the corresponding key to be used in the package entry points): + + - "xml": U{RDF/XML<http://www.w3.org/TR/rdf-syntax-grammar/>} + - "turtle": U{Turtle<http://www.w3.org/TR/turtle/>} (default) + - "nt": U{N-triple<http://www.w3.org/TR/rdf-testcases/#ntriples>} + - "json": U{JSON-LD<http://json-ld.org/spec/latest/json-ld-syntax/>} + +@summary: Microdata parser (distiller) +@requires: Python version 2.5 or up +@requires: U{RDFLib<http://rdflib.net>} +@requires: U{html5lib<http://code.google.com/p/html5lib/>} for the HTML5 parsing; note possible dependencies on Python's version on the project's web site +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: __init__.py,v 1.14 2012/08/22 12:08:52 ivan Exp $ $Date: 2012/08/22 12:08:52 $ + + +""" + +__version__ = "1.1" +__author__ = 'Ivan Herman' +__contact__ = 'Ivan Herman, ivan@w3.org' + +import sys +PY3 = (sys.version_info[0] >= 3) + +if PY3 : + from io import StringIO +else : + from StringIO import StringIO + +import datetime +import os + +import rdflib +from rdflib import URIRef +from rdflib import Literal +from rdflib import BNode +from rdflib import Namespace +if rdflib.__version__ >= "3.0.0" : + from rdflib import RDF as ns_rdf + from rdflib import RDFS as ns_rdfs +else : + from rdflib.RDFS import RDFSNS as ns_rdfs + from rdflib.RDF import RDFNS as ns_rdf + +if PY3 : + from urllib.parse import urlparse +else : + from urlparse import urlparse + +debug = False + +from pyMicrodata.utils import URIOpener +from pyMicrodata.microdata import MicrodataConversion + +ns_micro = Namespace("http://www.w3.org/2012/pyMicrodata/vocab#") +ns_dc = Namespace("http://purl.org/dc/terms/") +ns_xsd = Namespace('http://www.w3.org/2001/XMLSchema#') +ns_ht = Namespace("http://www.w3.org/2006/http#") + +class MicrodataError(Exception) : + """Superclass of exceptions representing error conditions raised by the microdata processing. + It does not add any new functionality to the + Exception class.""" + def __init__(self, msg) : + self.msg = msg + Exception.__init__(self) + +class HTTPError(MicrodataError) : + """Raised when HTTP problems are detected. It does not add any new functionality to the + Exception class.""" + def __init__(self, http_msg, http_code) : + self.msg = http_msg + self.http_code = http_code + MicrodataError.__init__(self,http_msg) + + +# Default bindings. This is just for the beauty of things: bindings are added to the graph to make the output nicer. If this is not done, RDFLib defines prefixes like "_1:", "_2:" which is, though correct, ugly...
+ +_bindings = { + 'owl' : 'http://www.w3.org/2002/07/owl#', + 'gr' : 'http://purl.org/goodrelations/v1#', + 'cc' : 'http://creativecommons.org/ns#', + 'sioc' : 'http://rdfs.org/sioc/ns#', + 'skos' : 'http://www.w3.org/2004/02/skos/core#', + 'rdfs' : 'http://www.w3.org/2000/01/rdf-schema#', + 'foaf' : 'http://xmlns.com/foaf/0.1/', + 'void' : 'http://rdfs.org/ns/void#', + 'ical' : 'http://www.w3.org/2002/12/cal/icaltzd#', + 'vcard' : 'http://www.w3.org/2006/vcard/ns#', + 'og' : 'http://ogp.me/ns#', + 'rdf' : 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'ma' : 'http://www.w3.org/ns/ma-ont#', +} + +######################################################################################################### +class pyMicrodata : + """Main processing class for the distiller + @ivar base: the base value for processing + @ivar http_status: HTTP Status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers + """ + + def __init__(self, base = "") : + """ + @keyword base: URI for the default "base" value (usually the URI of the file to be processed) + """ + self.http_status = 200 + self.base = base + + def _generate_error_graph(self, pgraph, full_msg, uri = None) : + """ + Generate an error message into the graph. This method is usually used reacting on exceptions. + + Later versions of pyMicrodata may have more detailed error conditions on which it wishes to react. At the moment, this + is fairly crude... + """ + if pgraph == None : + retval = Graph() + else : + retval = pgraph + + pgraph.bind( "dc","http://purl.org/dc/terms/" ) + pgraph.bind( "xsd",'http://www.w3.org/2001/XMLSchema#' ) + pgraph.bind( "ht",'http://www.w3.org/2006/http#' ) + pgraph.bind( "pyMicrodata",'http://www.w3.org/2012/pyMicrodata/vocab#' ) + + bnode = BNode() + retval.add((bnode, ns_rdf["type"], ns_micro["Error"])) + retval.add((bnode, ns_dc["description"], Literal(full_msg))) + retval.add((bnode, ns_dc["date"], Literal(datetime.datetime.utcnow().isoformat(),datatype=ns_xsd["dateTime"]))) + + if uri != None : + htbnode = BNode() + retval.add( (bnode, ns_micro["context"],htbnode) ) + retval.add( (htbnode, ns_rdf["type"], ns_ht["Request"]) ) + retval.add( (htbnode, ns_ht["requestURI"], Literal(uri)) ) + + if self.http_status != None and self.http_status != 200: + htbnode = BNode() + retval.add( (bnode, ns_micro["context"],htbnode) ) + retval.add( (htbnode, ns_rdf["type"], ns_ht["Response"]) ) + retval.add( (htbnode, ns_ht["responseCode"], URIRef("http://www.w3.org/2006/http#%s" % self.http_status)) ) + + return retval + + def _get_input(self, name) : + """ + Trying to guess whether "name" is a URI, a string; it then tries to open these as such accordingly, + returning a file-like object. 
If name is a plain string then it returns the input argument (that should + be, supposedly, a file-like object already) + @param name: identifier of the input source + @type name: string or a file-like object + @return: a file-like object if opening "name" is possible and successful, "name" otherwise + """ + try : + # Python 2 branch + isstring = isinstance(name, basestring) + except : + # Python 3 branch + isstring = isinstance(name, str) + + if isstring : + # check if this is a URI, ie, if there is a valid 'scheme' part + # otherwise it is considered to be a simple file + if urlparse(name)[0] != "" : + url_request = URIOpener(name) + self.base = url_request.location + return url_request.data + else : + self.base = name + return file(name) + else : + return name + + #################################################################################################################### + # Externally used methods + # + def graph_from_DOM(self, dom, graph = None) : + """ + Extract the RDF Graph from a DOM tree. + @param dom: a DOM Node element, the top level entry node for the whole tree (to make it clear, a dom.documentElement is used to initiate processing) + @keyword graph: an RDF Graph (if None, then a new one is created) + @type graph: rdflib Graph instance. If None, a new one is created. + @return: an RDF Graph + @rtype: rdflib Graph instance + """ + if graph == None : + # Create the RDF Graph, that will contain the return triples... + graph = Graph() + + MicrodataConversion(dom.documentElement, graph, base=self.base).convert() + return graph + + def graph_from_source(self, name, graph = None, rdfOutput = False) : + """ + Extract an RDF graph from a microdata source. The source is parsed, the RDF extracted, and the RDF Graph is + returned. This is a front-end to the L{pyMicrodata.graph_from_DOM} method. + + @param name: a URI, a file name, or a file-like object + @return: an RDF Graph + @rtype: rdflib Graph instance + """ + # First, open the source... + try : + # First, open the source... Possible HTTP errors are returned as error triples + input = None + try : + input = self._get_input(name) + except HTTPError : + h = sys.exc_info()[1] + self.http_status = h.http_code + if not rdfOutput : raise h + return self._generate_error_graph(graph, "HTTP Error: %s (%s)" % (h.http_code,h.msg), uri=name) + except Exception : + # Something nasty happened:-( + e = sys.exc_info()[1] + self.http_status = 500 + if not rdfOutput : raise e + return self._generate_error_graph(graph, str(e), uri=name) + + dom = None + try : + import warnings + warnings.filterwarnings("ignore", category=DeprecationWarning) + import html5lib + parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom")) + dom = parser.parse(input) + return self.graph_from_DOM(dom, graph) + except ImportError : + msg = "HTML5 parser not available.
Try installing html5lib <http://code.google.com/p/html5lib>" + raise ImportError(msg) + except Exception : + # Something nasty happened:-( + e = sys.exc_info()[1] + self.http_status = 400 + if not rdfOutput : raise e + return self._generate_error_graph(graph, str(e), uri=name) + + except Exception : + # Something nasty happened:-( + e = sys.exc_info()[1] + if isinstance(e, ImportError) : + self.http_status = None + else : + self.http_status = 500 + if not rdfOutput : raise e + return self._generate_error_graph(graph, str(e), uri=name) + + def rdf_from_sources(self, names, outputFormat = "pretty-xml", rdfOutput = False) : + """ + Extract an RDF graph from a list of microdata sources and serialize them in one graph. The sources are parsed, the RDF + extracted, and serialization is done in the specified format. + @param names: list of sources, each can be a URI, a file name, or a file-like object + @keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt". "xml" and "pretty-xml", as well as "turtle" and "n3" are synonyms. + @return: a serialized RDF Graph + @rtype: string + """ + try : + from pyRdfaExtras import MyGraph + graph = MyGraph() + except : + graph = Graph() + + for prefix in _bindings : + graph.bind(prefix,Namespace(_bindings[prefix])) + + # the value of rdfOutput determines the reaction on exceptions... + for name in names : + self.graph_from_source(name, graph, rdfOutput) + return graph.serialize(format=outputFormat) + + def rdf_from_source(self, name, outputFormat = "pretty-xml", rdfOutput = False) : + """ + Extract an RDF graph from a microdata source and serialize it in one graph. The source is parsed, the RDF + extracted, and serialization is done in the specified format. + @param name: a URI, a file name, or a file-like object + @keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt". "xml" and "pretty-xml", as well as "turtle" and "n3" are synonyms. + @return: a serialized RDF Graph + @rtype: string + """ + return self.rdf_from_sources([name], outputFormat, rdfOutput) + +################################################# CGI Entry point +def processURI(uri, outputFormat, form) : + """The standard processing of a microdata URI, with options in a form, ie, as an entry point from a CGI call. + + The call accepts extra form options (eg, HTTP GET options) as follows: + + @param uri: URI to access. Note that the "text:" and "uploaded:" values are treated separately; the former is for textual input (in which case a StringIO is used to get the data) and the latter is for an uploaded file, where the form gives access to the file directly. + @param outputFormat: serialization formats, as understood by RDFLib. Note that though "turtle" is + a possible parameter value, some versions of the RDFLib turtle generation do funny (though legal) things with + namespaces, defining unusual and unwanted prefixes... + @param form: extra call options (from the CGI call) to set up the local options (if any) + @type form: cgi FieldStorage instance + @return: serialized graph + @rtype: string + """ + if uri == "uploaded:" : + input = form["uploaded"].file + base = "" + elif uri == "text:" : + input = StringIO(form.getfirst("text")) + base = "" + else : + input = uri + base = uri + + processor = pyMicrodata(base = base) + + # Decide the output format; the issue is what should happen in case of a top level error like an inaccessibility of + # the html source: should a graph be returned or an HTML page with an error message?
+ + # decide whether HTML or RDF should be sent. + htmlOutput = False + #if 'HTTP_ACCEPT' in os.environ : + # acc = os.environ['HTTP_ACCEPT'] + # possibilities = ['text/html', + # 'application/rdf+xml', + # 'text/turtle; charset=utf-8', + # 'application/json', + # 'application/ld+json', + # 'text/rdf+n3'] + # + # # this nice module does content negotiation and returns the preferred format + # sg = httpheader.acceptable_content_type(acc, possibilities) + # htmlOutput = (sg != None and sg[0] == httpheader.content_type('text/html')) + # os.environ['rdfaerror'] = 'true' + + try : + graph = processor.rdf_from_source(input, outputFormat, rdfOutput = ("forceRDFOutput" in list(form.keys())) or not htmlOutput) + if outputFormat == "n3" : + retval = 'Content-Type: text/rdf+n3; charset=utf-8\n' + elif outputFormat == "nt" or outputFormat == "turtle" : + retval = 'Content-Type: text/turtle; charset=utf-8\n' + elif outputFormat == "json-ld" or outputFormat == "json" : + retval = 'Content-Type: application/json; charset=utf-8\n' + else : + retval = 'Content-Type: application/rdf+xml; charset=utf-8\n' + retval += '\n' + + retval += graph + return retval + except HTTPError : + import cgi + h = sys.exc_info()[1] + retval = 'Content-type: text/html; charset=utf-8\nStatus: %s \n\n' % h.http_code + retval += "<html>\n" + retval += "<head>\n" + retval += "<title>HTTP Error in Microdata processing</title>\n" + retval += "</head><body>\n" + retval += "<h1>HTTP Error in distilling Microdata</h1>\n" + retval += "<p>HTTP Error: %s (%s)</p>\n" % (h.http_code,h.msg) + retval += "<p>On URI: <code>'%s'</code></p>\n" % cgi.escape(uri) + retval +="</body>\n" + retval +="</html>\n" + return retval + except : + # This branch should occur only if an exception is really raised, ie, if it is not turned + # into a graph value. + (type,value,traceback) = sys.exc_info() + + import traceback, cgi + + retval = 'Content-type: text/html; charset=utf-8\nStatus: %s\n\n' % processor.http_status + retval += "<html>\n" + retval += "<head>\n" + retval += "<title>Exception in Microdata processing</title>\n" + retval += "</head><body>\n" + retval += "<h1>Exception in distilling Microdata</h1>\n" + retval += "<pre>\n" + strio = StringIO() + traceback.print_exc(file=strio) + retval += strio.getvalue() + retval +="</pre>\n" + retval +="<pre>%s</pre>\n" % value + retval +="<h1>Distiller request details</h1>\n" + retval +="<dl>\n" + if uri == "text:" and "text" in form and form["text"].value != None and len(form["text"].value.strip()) != 0 : + retval +="<dt>Text input:</dt><dd>%s</dd>\n" % cgi.escape(form["text"].value).replace('\n','<br/>') + elif uri == "uploaded:" : + retval +="<dt>Uploaded file</dt>\n" + else : + retval +="<dt>URI received:</dt><dd><code>'%s'</code></dd>\n" % cgi.escape(uri) + retval +="<dt>Output serialization format:</dt><dd> %s</dd>\n" % outputFormat + retval +="</dl>\n" + retval +="</body>\n" + retval +="</html>\n" + return retval + +################################################################################################### + diff --git a/rdflib/plugins/parsers/pyMicrodata/microdata.py b/rdflib/plugins/parsers/pyMicrodata/microdata.py new file mode 100644 index 00000000..59ad88cb --- /dev/null +++ b/rdflib/plugins/parsers/pyMicrodata/microdata.py @@ -0,0 +1,555 @@ +# -*- coding: utf-8 -*- +""" + +The core of the Microdata->RDF conversion, a more or less verbatim implementation of the +U{W3C IG Note<http://www.w3.org/TR/microdata-rdf/>}. 
Because the implementation was also used to check +the note itself, it tries to be fairly close to the text. + + +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: microdata.py,v 1.3 2012/03/26 13:18:31 ivan Exp $ +$Date: 2012/03/26 13:18:31 $ + +Added a reaction on the RDFaStopParsing exception: if raised while setting up the local execution context, parsing +is stopped (on the whole subtree) +""" + +import sys +if sys.version_info[0] >= 3 : + from urllib.parse import urlsplit, urlunsplit +else : + from urlparse import urlsplit, urlunsplit + +import rdflib +from rdflib import URIRef +from rdflib import Literal +from rdflib import BNode +from rdflib import Namespace +if rdflib.__version__ >= "3.0.0" : + from rdflib import Graph + from rdflib import RDF as ns_rdf + from rdflib import RDFS as ns_rdfs +else : + from rdflib.Graph import Graph + from rdflib.RDFS import RDFSNS as ns_rdfs + from rdflib.RDF import RDFNS as ns_rdf + +from pyMicrodata.registry import registry, vocab_names +from pyMicrodata.utils import generate_RDF_collection, get_Literal, get_time_type +from pyMicrodata.utils import get_lang_from_hierarchy, is_absolute_URI, generate_URI, fragment_escape + +MD_VOCAB = "http://www.w3.org/ns/md#" + +from pyMicrodata import debug + +# Existing predicate schemes +class PropertySchemes : + vocabulary = "vocabulary" + contextual = "contextual" + +class ValueMethod : + unordered = "unordered" + list = "list" + +# ---------------------------------------------------------------------------- + +class Evaluation_Context : + """ + Evaluation context structure. See Section 4.1 of the U{W3C IG Note<http://www.w3.org/TR/microdata-rdf/>}for the details. + + @ivar current_type : an absolute URL for the current type, used when an item does not contain an item type + @ivar memory: mapping from items to RDF subjects + @type memory: dictionary + @ivar current_name: an absolute URL for the in-scope name, used for generating URIs for properties of items without an item type + @ivar current_vocabulary: an absolute URL for the current vocabulary, from the registry + """ + def __init__( self ) : + self.current_type = None + self.memory = {} + self.current_name = None + self.current_vocabulary = None + + def get_memory( self, item ) : + """ + Get the memory content (ie, RDF subject) for 'item', or None if not stored yet + @param item: an 'item', in microdata terminology + @type item: DOM Element Node + @return: None, or an RDF Subject (URIRef or BNode) + """ + if item in self.memory : + return self.memory[item] + else : + return None + + def set_memory( self, item, subject ) : + """ + Set the memory content, ie, the subject, for 'item'. + @param item: an 'item', in microdata terminology + @type item: DOM Element Node + @param subject: RDF Subject + @type subject: URIRef or Blank Node + """ + self.memory[item] = subject + + def new_copy(self, itype) : + """ + During the generation algorithm a new copy of the current context has to be done with a new current type. + + At the moment, the content of memory is copied, ie, a fresh dictionary is created and the content copied over. + Not clear whether that is necessary, though, maybe a simple reference is enough... 
+ @param itype : an absolute URL for the current type + @return: a new evaluation context instance + """ + retval = Evaluation_Context() + for k in self.memory : + retval.memory[k] = self.memory[k] + + retval.current_type = itype + retval.current_name = self.current_name + retval.current_vocabulary = self.current_vocabulary + return retval + + def __str__(self) : + retval = "Evaluation context:\n" + retval += " current type: %s\n" % self.current_type + retval += " current name: %s\n" % self.current_name + retval += " current vocabulary: %s\n" % self.current_vocabulary + retval += " memory: %s\n" % self.memory + retval += "----\n" + return retval + +class Microdata : + """ + This class encapsulates methods that are defined by the U{microdata spec<http://dev.w3.org/html5/md/Overview.html>}, as opposed to + the RDF conversion note. + + @ivar document: top of the DOM tree, as returned by the HTML5 parser + @ivar base: the base URI of the Dom tree, either set from the outside or via a @base element + """ + def __init__( self, document, base = None) : + """ + @param document: top of the DOM tree, as returned by the HTML5 parser + @param base: the base URI of the Dom tree, either set from the outside or via a @base element + """ + self.document = document + + #----------------------------------------------------------------- + # set the document base, will be used to generate top level URIs + self.base = None + # handle the base element case for HTML + for set_base in document.getElementsByTagName("base") : + if set_base.hasAttribute("href") : + # Yep, there is a local setting for base + self.base = set_base.getAttribute("href") + return + # If got here, ie, if no local setting for base occurs, the input argument has it + self.base = base + + def get_top_level_items( self ) : + """ + A top level item is an element that has the @itemscope attribute set, but no @itemprop. They have to + be collected in pre-order and depth-first fashion. + + @return: list of items (ie, DOM Nodes) + """ + def collect_items( node ) : + items = [] + for child in node.childNodes : + if child.nodeType == node.ELEMENT_NODE : + items += collect_items( child ) + + if node.hasAttribute("itemscope") and not node.hasAttribute("itemprop") : + # This is also a top level item + items.append(node) + + return items + + return collect_items( self.document ) + + def get_item_properties( self, item ) : + """ + Collect the item's properties, ie, all DOM descendant nodes with @itemprop until the subtree hits another @itemscope. @itemrefs are also added at this point. + + @param item: current item + @type item: DOM Node + @return: array of items, ie, DOM Nodes + + + """ + # go down the tree until another itemprop is hit, take care of the itemrefs, too; see the microdata doc + # probably the ugliest stuff + # returns a series of element nodes. + # Is it worth filtering the ones with itemprop at that level??? + results = [] + memory = [ item ] + pending = [ child for child in item.childNodes if child.nodeType == item.ELEMENT_NODE ] + + if item.hasAttribute("itemref") : + for id in item.getAttribute("itemref").strip().split() : + obj = self.getElementById(id) + if obj != None : pending.append(obj) + + while len(pending) > 0 : + current = pending.pop(0) + if current in memory : + # in general this raises an error; the same item cannot be there twice. In this case this is + # simply ignored + continue + else : + # this is for the check above + memory.append(current) + + # @itemscope is the barrier...
+ if not current.hasAttribute("itemscope") : + pending = [ child for child in current.childNodes if child.nodeType == child.ELEMENT_NODE ] + pending + + if current.hasAttribute("itemprop") and current.getAttribute("itemprop").strip() != "" : + results.append(current) + + return results + + def getElementById(self, id) : + """This is a method defined for DOM 2 HTML, but the HTML5 parser does not seem to define it. Oh well... + @param id: value of an @id attribute to look for + @return: array of nodes whose @id attribute matches C{id} (formally, there should be only one...) + """ + def collect_ids( node ) : + ids = [] + for child in node.childNodes : + if child.nodeType == node.ELEMENT_NODE : + ids += collect_ids( child ) + + if node.hasAttribute("id") and node.getAttribute("id") == id : + # This is also a top level item + ids.append(node) + + return ids + + ids = collect_ids(self.document) + if len(ids) > 0 : + return ids[0] + else : + return None + + +class MicrodataConversion(Microdata) : + """ + Top level class encapsulating the conversion algorithms as described in the W3C note. + + @ivar graph: an RDF graph; an RDFLib Graph + @type graph: RDFLib Graph + @ivar document: top of the DOM tree, as returned by the HTML5 parser + @ivar ns_md: the Namespace for the microdata vocabulary + @ivar base: the base of the Dom tree, either set from the outside or via a @base element + """ + def __init__( self, document, graph, base = None ) : + """ + @param graph: an RDF graph; an RDFLib Graph + @type graph: RDFLib Graph + @param document: top of the DOM tree, as returned by the HTML5 parser + @param base: the base of the Dom tree, either set from the outside or via a @base element + """ + Microdata.__init__(self, document, base) + self.graph = graph + self.ns_md = Namespace( MD_VOCAB ) + self.graph.bind( "md",MD_VOCAB ) + + # Get the vocabularies defined in the registry bound to proper names, if any... + + def _use_rdfa_context () : + from pyRdfa.initialcontext import initial_context + retval = {} + vocabs = initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns + for prefix in list(vocabs.keys()) : + uri = vocabs[prefix] + if uri not in vocab_names and uri not in registry : retval[uri] = prefix + return retval + + for vocab in registry : + if vocab in vocab_names : + self.graph.bind( vocab_names[vocab],vocab ) + else : + hvocab = vocab + '#' + if hvocab in vocab_names : + self.graph.bind( vocab_names[hvocab],hvocab ) + + # Add the prefixes defined in the RDFa initial context to improve the outlook of the output + from pyRdfa.initialcontext import initial_context + vocabs = initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns + for prefix in list(vocabs.keys()) : + uri = vocabs[prefix] + if uri not in registry : + # if it is in the registry, then it may have needed some special microdata massage... + self.graph.bind( prefix,uri ) + + def convert( self ) : + """ + Top level entry to convert and generate all the triples. It finds the top level items, + and generates triples for each of them; additionally, it generates a top level entry point + to the items from base in the form of an RDF list. + """ + item_list = [] + for top_level_item in self.get_top_level_items() : + item_list.append( self.generate_triples(top_level_item, Evaluation_Context()) ) + list = generate_RDF_collection( self.graph, item_list ) + self.graph.add( (URIRef(self.base),self.ns_md["item"],list) ) + + def generate_triples( self, item, context ) : + """ + Generate the triples for a specific item. 
See the W3C Note for the details. + + @param item: the DOM Node for the specific item + @type item: DOM Node + @param context: an instance of an evaluation context + @type context: L{Evaluation_Context} + @return: a URIRef or a BNode for the (RDF) subject + """ + # Step 1,2: if the subject has to be set, store it in memory + subject = context.get_memory( item ) + if subject == None : + # nop, there is no subject set. If there is a valid @itemid, that carries it + if item.hasAttribute("itemid") and is_absolute_URI( item.getAttribute("itemid") ): + subject = URIRef( item.getAttribute("itemid").strip() ) + else : + subject = BNode() + context.set_memory( item, subject ) + + # Step 3: set the type triples if any + types = [] + if item.hasAttribute("itemtype") : + types = item.getAttribute("itemtype").strip().split() + for t in types : + if is_absolute_URI( t ) : + self.graph.add( (subject, ns_rdf["type"], URIRef(t)) ) + + # Step 4, 5 and 6 to set the typing variable + if len(types) == 0 : + itype = None + else : + if is_absolute_URI(types[0]) : + itype = types[0] + context.current_name = None + elif context.current_type != None : + itype = context.current_type + else : + itype = None + + # Step 7, 8: Check the registry for possible keys and set the vocab + vocab = None + if itype != None : + for key in list(registry.keys()) : + if itype.startswith(key) : + # There is a predefined vocabulary for this type... + vocab = key + break + # The registry has not set the vocabulary; has to be extracted from the type + if vocab == None : + parsed = urlsplit(itype) + if parsed.fragment != "" : + vocab = urlunsplit( (parsed.scheme,parsed.netloc,parsed.path,parsed.query,"") ) + '#' + elif parsed.path == "" and parsed.query == "" : + vocab = itype + if vocab[-1] != '/' : vocab += '/' + else : + vocab = itype.rsplit('/',1)[0] + '/' + + # Step 9: update vocab in the context + if vocab != None : + context.current_vocabulary = vocab + elif item.hasAttribute("itemtype") : + context.current_vocabulary = None + + # context.current_vocabulary = vocab + + # Step 10: set up a property list; this will be used to generate triples later. + # each entry in the dictionary is an array of RDF objects + property_list = {} + + # Step 11: Get the item properties and run a cycle on those + for prop in self.get_item_properties(item) : + for name in prop.getAttribute("itemprop").strip().split() : + # 11.1.1. set a new context + new_context = context.new_copy(itype) + # 11.1.2, generate the URI for the property name, that will be the predicate + # Also update the context + new_context.current_name = predicate = self.generate_predicate_URI( name,new_context ) + # 11.1.3, generate the property value. The extra flag signals that the value is a new item + # Note that 10.1.4 step is done in the method itself, ie, a recursion may occur there + # if a new item is hit (in which case the return value is a RDF resource chaining to a subject) + value = self.get_property_value( prop, new_context ) + # 11.1.5, store all the values + if predicate in property_list : + property_list[predicate].append(value) + else : + property_list[predicate] = [ value ] + + # step 12: generate the triples + for property in list(property_list.keys()) : + self.generate_property_values( subject, URIRef(property), property_list[property], context ) + + # Step 13: return the subject to the caller + return subject + + def generate_predicate_URI( self, name, context ) : + """ + Generate a full URI for a predicate, using the type, the vocabulary, etc. 
+ + For details of this entry, see Section 4.4 + @param name: name of the property, ie, what appears in @itemprop + @param context: an instance of an evaluation context + @type context: L{Evaluation_Context} + """ + if debug: print( "name: %s, %s" % (name,context) ) + + # Step 1: absolute URI-s are fine, take them as they are + if is_absolute_URI(name) : return name + + # Step 2: if type is none, that this is just used as a fragment + # if not context.current_type : + if context.current_type == None and context.current_vocabulary == None : + return generate_URI( self.base, name ) + + #if context.current_type == None : + # return generate_URI( self.base, name ) + + # Step 3: set the scheme + try : + if context.current_vocabulary in registry and "propertyURI" in registry[context.current_vocabulary] : + scheme = registry[context.current_vocabulary]["propertyURI"] + else : + scheme = PropertySchemes.vocabulary + except : + # This is when the structure of the registry is broken + scheme = PropertySchemes.vocabulary + + name = fragment_escape( name ) + if scheme == PropertySchemes.contextual : + # Step 5.1 + s = context.current_name + # s = context.current_type + if s != None and s.startswith("http://www.w3.org/ns/md?type=") : + # Step 5.2 + return s + '.' + name + else : + # Step 5.3 + return "http://www.w3.org/ns/md?type=" + fragment_escape(context.current_type) + "&prop=" + name + else : + # Step 4 + if context.current_vocabulary[-1] == '#' or context.current_vocabulary[-1] == '/' : + return context.current_vocabulary + name + else : + return context.current_vocabulary + '#' + name + + def get_property_value(self, node, context) : + """ + Generate an RDF object, ie, the value of a property. Note that if this element contains + an @itemscope, then a recursive call to L{MicrodataConversion.generate_triples} is done and the + return value of that method (ie, the subject for the corresponding item) is return as an + object. + + Otherwise, either URIRefs are created for <a>, <img>, etc, elements, or a Literal; the latter + gets a time-related type for the <time> element. + + @param node: the DOM Node for which the property values should be generated + @type node: DOM Node + @param context: an instance of an evaluation context + @type context: L{Evaluation_Context} + @return: an RDF resource (URIRef, BNode, or Literal) + """ + URI_attrs = { + "audio" : "src", + "embed" : "src", + "iframe" : "src", + "img" : "src", + "source" : "src", + "track" : "src", + "video" : "src", + "data" : "src", + "a" : "href", + "area" : "href", + "link" : "href", + "object" : "data" + } + lang = get_lang_from_hierarchy( self.document, node ) + + if node.hasAttribute("itemscope") : + # THIS IS A RECURSION ENTRY POINT! 
+ return self.generate_triples( node, context ) + + elif node.tagName in URI_attrs and node.hasAttribute(URI_attrs[node.tagName]) : + return URIRef( generate_URI( self.base, node.getAttribute(URI_attrs[node.tagName]).strip() ) ) + + elif node.tagName == "meta" and node.hasAttribute("content") : + if lang : + return Literal( node.getAttribute("content"), lang = lang ) + else : + return Literal( node.getAttribute("content") ) + + elif node.tagName == "time" and node.hasAttribute("datetime") : + litval = node.getAttribute("datetime") + dtype = get_time_type(litval) + if dtype : + return Literal( litval, datatype = dtype ) + else : + return Literal( litval ) + + else : + if lang : + return Literal( get_Literal(node), lang = lang ) + else : + return Literal( get_Literal(node) ) + + def generate_property_values( self, subject, predicate, objects, context) : + """ + Generate the property values for a specific subject and predicate. The context should specify whether + the objects should be added in an RDF list or as individual triples. + + @param subject: RDF subject + @type subject: RDFLib Node (URIRef or blank node) + @param predicate: RDF predicate + @type predicate: RDFLib URIRef + @param objects: RDF objects + @type objects: list of RDFLib nodes (URIRefs, Blank Nodes, or literals) + @param context: evaluation context + @type context: L{Evaluation_Context} + """ + # generate triples with a list, or a bunch of triples, depending on the context + # The biggest complication is to find the method... + method = ValueMethod.unordered + + # This is necessary because predicate is a URIRef, and I am not sure the comparisons would work well + # to be tested, in fact... + pred_key = "%s" % predicate + for key in registry : + if predicate.startswith(key) : + registry_object = registry[key] + name = pred_key[len(key):] + try : + if "multipleValues" in registry_object : method = registry_object["multipleValues"] + # The generic definition can be overwritten for a specific property. The simplest is to rely on a 'try' + # with the right structure... + try : + method = registry_object["properties"][pred_key[len(key):]]["multipleValues"] + except : + pass + except : + pass + break + + if method == ValueMethod.unordered : + for object in objects : + self.graph.add( (subject, predicate, object) ) + else : + self.graph.add( (subject,predicate,generate_RDF_collection( self.graph, objects )) ) + + + + + + diff --git a/rdflib/plugins/parsers/pyMicrodata/registry.py b/rdflib/plugins/parsers/pyMicrodata/registry.py new file mode 100644 index 00000000..d279fef6 --- /dev/null +++ b/rdflib/plugins/parsers/pyMicrodata/registry.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +""" + +Hardcoded version of the current microdata->RDF registry. There is also a local registry to include some test cases. +Finally, there is a local dictionary for prefix mapping for the registry items; these are the preferred prefixes +for those vocabularies, and are used to make the output nicer.
+ +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: registry.py,v 1.4 2012/03/26 13:18:31 ivan Exp $ +$Date: 2012/03/26 13:18:31 $ +""" + +import sys +(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info + +_registry = """ +{ + "http://schema.org/": { + "propertyURI": "vocabulary", + "multipleValues": "unordered", + "properties": { + "blogPosts": {"multipleValues": "list"}, + "breadcrumb": {"multipleValues": "list"}, + "byArtist": {"multipleValues": "list"}, + "creator": {"multipleValues": "list"}, + "episodes": {"multipleValues": "list"}, + "events": {"multipleValues": "list"}, + "founders": {"multipleValues": "list"}, + "itemListElement": {"multipleValues": "list"}, + "musicGroupMember": {"multipleValues": "list"}, + "performerIn": {"multipleValues": "list"}, + "performers": {"multipleValues": "list"}, + "producer": {"multipleValues": "list"}, + "recipeInstructions": {"multipleValues": "list"}, + "seasons": {"multipleValues": "list"}, + "subEvents": {"multipleValues": "list"}, + "tracks": {"multipleValues": "list"} + } + }, + "http://xmlns.com/foaf/0.1/": { + "propertyURI": "vocabulary", + "multipleValues": "unordered" + }, + "http://microformats.org/profile/hcard": { + "propertyURI": "vocabulary", + "multipleValues": "unordered" + }, + "http://microformats.org/profile/hcalendar#": { + "propertyURI": "vocabulary", + "multipleValues": "unordered", + "properties": { + "categories": {"multipleValues": "list"} + } + } +} +""" + +vocab_names = { + "http://schema.org/" : "schema", + "http://xmlns.com/foaf/0.1/" : "foaf", + "http://microformats.org/profile/hcard#" : "hcard", + "http://microformats.org/profile/hcalendar#" : "hcalendar" +} + +# This is the local version, added mainly for testing +_myRegistry = """ +{ + "http://n.whatwg.org/work": { + "propertyURI": "contextual", + "multipleValues": "list" + } +} +""" + +registry = [] +myRegistry = [] +if py_v_major >= 3 or (py_v_major == 2 and py_v_minor >= 6) : + import json + registry = json.loads(_registry) + myRegistry = json.loads(_myRegistry) +else : + import simplejson + registry = simplejson.loads(_registry) + myRegistry = simplejson.loads(_myRegistry) + +for (k,v) in list(myRegistry.items()) : registry[k] = v + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/rdflib/plugins/parsers/pyMicrodata/utils.py b/rdflib/plugins/parsers/pyMicrodata/utils.py new file mode 100644 index 00000000..4ec9c870 --- /dev/null +++ b/rdflib/plugins/parsers/pyMicrodata/utils.py @@ -0,0 +1,307 @@ +# -*- coding: utf-8 -*- +""" +Various utilities for pyMicrodata + +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: utils.py,v 1.6 2012/08/22 12:08:52 ivan Exp $ +$Date: 2012/08/22 12:08:52 $ +""" +import os, os.path, sys +(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info + +if py_v_major >= 3 : + from urllib.request import Request, urlopen + from urllib.parse import urljoin, quote, urlparse + from http.server import BaseHTTPRequestHandler + from urllib.error 
import HTTPError as urllib_HTTPError +else : + from urllib2 import Request, urlopen + from urllib2 import HTTPError as urllib_HTTPError + from urlparse import urljoin, urlparse + from urllib import quote + from BaseHTTPServer import BaseHTTPRequestHandler + +import re +from datetime import datetime + +from rdflib import BNode +import rdflib +if rdflib.__version__ >= "3.0.0" : + from rdflib import RDF as ns_rdf +else : + from rdflib.RDF import RDFNS as ns_rdf + +################################################################################# +def is_absolute_URI( uri ) : + return urlparse(uri)[0] != "" + +################################################################################# + +def fragment_escape( name ) : + return quote(name, '/~:-.') + +################################################################################# + +def generate_URI(base, v) : + """ + Generate an (absolute) URI; if val is a fragment, then using it with base, + otherwise just return the value + @param base: Absolute URI for base + @param v: relative or absolute URI + """ + if is_absolute_URI( v ) : + return v + else : + # UGLY!!! There is a bug for a corner case in python version <= 2.5.X + if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5) : + return base+val + #### + + # Trust the python library... + # Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it + # swallows the '#' or '?' character at the end. This is clearly a problem with + # Semantic Web URI-s + v = fragment_escape(v.strip()) + joined = urljoin(base, v) + try : + if v[-1] != joined[-1] and (v[-1] == "#" or v[-1] == "?") : + return joined + v[-1] + else : + return joined + except : + return joined + +################################################################################# +def generate_RDF_collection( graph, vals ) : + """ + Generate an RDF List from vals, returns the head of the list + @param graph: RDF graph + @type graph: RDFLib Graph + @param vals: array of RDF Resources + @return: head of the List (an RDF Resource) + """ + # generate an RDF List, returns the head + # list has all the elements in RDF format already + heads = [ BNode() for r in vals ] + [ ns_rdf["nil"] ] + for i in range(0, len(vals)) : + graph.add( (heads[i], ns_rdf["first"], vals[i]) ) + graph.add( (heads[i], ns_rdf["rest"], heads[i+1]) ) + return heads[0] + +################################################################################# +def get_Literal(Pnode): + """ + Get (recursively) the full text from a DOM Node. + + @param Pnode: DOM Node + @return: string + """ + rc = "" + for node in Pnode.childNodes: + if node.nodeType == node.TEXT_NODE: + rc = rc + node.data + elif node.nodeType == node.ELEMENT_NODE : + rc = rc + get_Literal(node) + + # This presupposes that all spaces and such should be stripped. I am not sure it is true in the spec, + # but this is what the examples show + return re.sub(r'(\r| |\n|\t)+'," ",rc).strip() + +################################################################################# +def get_lang(node) : + # we may have lang and xml:lang + retval = None + if node.hasAttribute("lang") : + retval = node.getAttribute("lang") + if retval and node.hasAttribute("xml:lang") : + xmllang = node.getAttribute("xml:lang").lower() + if not( xmllang != None and xmllang == retval.lower() ) : + # This is an error, in which case retval must be invalidated... 
+ retval = None + return retval + +def get_lang_from_hierarchy(document, node) : + lang = get_lang(node) + if lang == None : + parent = node.parentNode + if parent != None and parent != document : + return get_lang_from_hierarchy(document, parent) + else : + return get_lang(document) + else : + return lang + +################################################################################# +datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime" +time_type = "http://www.w3.org/2001/XMLSchema#time" +date_type = "http://www.w3.org/2001/XMLSchema#date" +date_gYear = "http://www.w3.org/2001/XMLSchema#gYear" +date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth" +date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay" +duration_type = "http://www.w3.org/2001/XMLSchema#duration" + +_formats = { + date_gMonthDay : [ "%m-%d" ], + date_gYearMonth : [ "%Y-%m"], + date_gYear : [ "%Y" ], + date_type : [ "%Y-%m-%d", "%Y-%m-%dZ" ], + time_type : [ "%H:%M", + "%H:%M:%S", + "%H:%M:%SZ", + "%H:%M:%S.%f" ], + datetime_type : [ "%Y-%m-%dT%H:%M", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%MZ", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S.%fZ" ], + duration_type : [ "P%dD", + "P%YY%mM%dD", + "P%YY%mM", + "P%YY%dD", + "P%YY", + "P%mM", + "P%mM%dD", + ], +} + +_dur_times = [ "%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS" ] + +def get_time_type(string) : + """ + Check whether the string abides to one of the accepted time related datatypes, and returns that one if yes + @param string: the attribute value to be checked + @return : a datatype URI or None + """ + for key in _formats : + for format in _formats[key] : + try : + # try to check if the syntax is fine + d = datetime.strptime(string, format) + # bingo! + return key + except ValueError : + pass + + # Now come the special cases:-( + # Check first for the duration stuff, that is the nastiest. + if len(string) > 2 and string[0] == 'P' or (string [0] == '-' and string[1] == 'P') : + # this is meant to be a duration type + # first of all, get rid of the leading '-' and check again + if string[0] == '-' : + for format in _formats[duration_type] : + try : + # try to check if the syntax is fine + d = datetime.strptime(string, format) + # bingo! + return duration_type + except ValueError : + pass + # Let us see if the value contains a separate time portion, and cut that one + durs = string.split('T') + if len(durs) == 2 : + # yep, so we should check again + dur = durs[0] + tm = durs[1] + # Check the duration part + td = False + for format in _formats[duration_type] : + try : + # try to check if the syntax is fine + d = datetime.strptime(dur, format) + # bingo! + td = True + break + except ValueError : + pass + if td == True : + # Getting there... + for format in _dur_times : + try : + # try to check if the syntax is fine + d = datetime.strptime(tm, format) + # bingo! + return duration_type + except ValueError : + pass + # something went wrong... 
+ return None + else : + # Well, no more tricks, this is a plain type + return None + + # If we got here, we should check the time zone + # there is a discrepancy between the python and the HTML5/XSD lexical string, + which means that this has to be handled separately for the date and the timezone portion + try : + # The time-zone-less portion of the string + str = string[0:-6] + # The time-zone portion + tz = string[-5:] + try : + t = datetime.strptime(tz,"%H:%M") + except ValueError : + # Bummer, this is not a correct time + return None + # The time-zone is fine, the datetime portion has to be checked + for format in _formats[datetime_type] : + try : + # try to check if it is fine + d = datetime.strptime(str, format) + # Bingo! + return datetime_type + except ValueError : + pass + except : + pass + return None + + +######################################################################################################### +# Handling URIs +class URIOpener : + """A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself, the class + sets the content location. + The class also adds an accept header to the outgoing request, namely + text/html and application/xhtml+xml (unless set explicitly by the caller). + + @ivar data: the real data, ie, a file-like object + @ivar headers: the return headers as sent back by the server + @ivar location: the real location of the data (ie, after possible redirection and content negotiation) + """ + CONTENT_LOCATION = 'Content-Location' + def __init__(self, name) : + """ + @param name: URL to be opened + @keyword additional_headers: additional HTTP request headers to be added to the call + """ + try : + # Note the removal of the fragment ID. This is necessary, per the HTTP spec + req = Request(url=name.split('#')[0]) + + req.add_header('Accept', 'text/html, application/xhtml+xml') + + self.data = urlopen(req) + self.headers = self.data.info() + + if URIOpener.CONTENT_LOCATION in self.headers : + self.location = urlparse.urljoin(self.data.geturl(),self.headers[URIOpener.CONTENT_LOCATION]) + else : + self.location = name + + except urllib_HTTPError : + e = sys.exc_info()[1] + from pyMicrodata import HTTPError + msg = BaseHTTPRequestHandler.responses[e.code] + raise HTTPError('%s' % msg[1], e.code) + except Exception : + e = sys.exc_info()[1] + from pyMicrodata import MicrodataError + raise MicrodataError('%s' % e) + diff --git a/rdflib/plugins/parsers/pyRdfa/__init__.py b/rdflib/plugins/parsers/pyRdfa/__init__.py new file mode 100644 index 00000000..9835b7ac --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/__init__.py @@ -0,0 +1,914 @@ +# -*- coding: utf-8 -*- +""" +RDFa 1.1 parser, also referred to as an “RDFa Distiller”. It is +deployed, via a CGI front-end, on the U{W3C RDFa 1.1 Distiller page<http://www.w3.org/2012/pyRdfa/>}. + +For details on RDFa, the reader should consult the U{RDFa Core 1.1<http://www.w3.org/TR/rdfa-core/>}, U{XHTML+RDFa1.1<http://www.w3.org/TR/2010/xhtml-rdfa>}, and the U{RDFa 1.1 Lite<http://www.w3.org/TR/rdfa-lite/>} documents. +The U{RDFa 1.1 Primer<http://www.w3.org/TR/rdfa-primer/>} may also prove helpful. + +This package can also be downloaded U{from GitHub<https://github.com/RDFLib/pyrdfa3>}. The +distribution also includes the CGI front-end and a separate utility script to be run locally. + +Note that this package is an updated version of a U{previous RDFa distiller<http://www.w3.org/2007/08/pyRdfa>} that was developed +for RDFa 1.0.
Although it reuses large portions of that code, it has been quite thoroughly rewritten, hence put in a completely +different project. (The version numbering has been continued, though, to avoid any kind of misunderstandings. This version has version numbers "3.0.0" or higher.) + +(Simple) Usage +============== +From a Python file, expecting a Turtle output:: + from pyRdfa import pyRdfa + print pyRdfa().rdf_from_source('filename') +Other output formats are also possible. E.g., to produce RDF/XML output, one could use:: + from pyRdfa import pyRdfa + print pyRdfa().rdf_from_source('filename', outputFormat='pretty-xml') +It is also possible to embed an RDFa processing. Eg, using:: + from pyRdfa import pyRdfa + graph = pyRdfa().graph_from_source('filename') +returns an RDFLib.Graph object instead of a serialization thereof. See the the description of the +L{pyRdfa class<pyRdfa.pyRdfa>} for further possible entry points details. + +There is also, as part of this module, a L{separate entry for CGI calls<processURI>}. + +Return (serialization) formats +------------------------------ + +The package relies on RDFLib. By default, it relies therefore on the serializers coming with the local RDFLib distribution. However, there has been some issues with serializers of older RDFLib releases; also, some output formats, like JSON-LD, are not (yet) part of the standard RDFLib distribution. A companion package, called pyRdfaExtras, is part of the download, and it includes some of those extra serializers. The extra format (not part of the RDFLib core) is U{JSON-LD<http://json-ld.org/spec/latest/json-ld-syntax/>}, whose 'key' is 'json', when used in the 'parse' method of an RDFLib graph. + +Options +======= + +The package also implements some optional features that are not part of the RDFa recommendations. At the moment these are: + + - possibility for plain literals to be normalized in terms of white spaces. Default: false. (The RDFa specification requires keeping the white spaces and leave applications to normalize them, if needed) + - inclusion of embedded RDF: Turtle content may be enclosed in a C{script} element and typed as C{text/turtle}, U{defined by the RDF Working Group<http://www.w3.org/TR/turtle/>}. Alternatively, some XML dialects (e.g., SVG) allows the usage of RDF/XML as part of their core content to define metadata in RDF. For both of these cases pyRdfa parses these serialized RDF content and adds the resulting triples to the output Graph. Default: true. + - extra, built-in transformers are executed on the DOM tree prior to RDFa processing (see below). These transformers can be provided by the end user. + +Options are collected in an instance of the L{Options} class and may be passed to the processing functions as an extra argument. E.g., to allow the inclusion of embedded content:: + from pyRdfa.options import Options + options = Options(embedded_rdf=True) + print pyRdfa(options=options).rdf_from_source('filename') + +See the description of the L{Options} class for the details. + + +Host Languages +============== + +RDFa 1.1. Core is defined for generic XML; there are specific documents to describe how the generic specification is applied to +XHTML and HTML5. + +pyRdfa makes an automatic switch among these based on the content type of the source as returned by an HTTP request. 
The following are the +possible host languages: + - if the content type is C{text/html}, the content is HTML5 + - if the content type is C{application/xhtml+xml} I{and} the right DTD is used, the content is XHTML1 + - if the content type is C{application/xhtml+xml} and no or an unknown DTD is used, the content is XHTML5 + - if the content type is C{application/svg+xml}, the content is SVG + - if the content type is C{application/atom+xml}, the content is Atom + - if the content type is C{application/xml} or C{application/xxx+xml} (but 'xxx' is not 'atom' or 'svg'), the content is XML + +If local files are used, pyRdfa makes a guess on the content type based on the file name suffix: C{.html} is for HTML5, C{.xhtml} for XHTML1, C{.svg} for SVG, anything else is considered to be general XML. Finally, the content type may be set by the caller when initializing the L{pyRdfa class<pyRdfa.pyRdfa>}. + +Beyond the differences described in the RDFa specification, the main difference is the parser used to parse the source. In the case of HTML5, pyRdfa uses an U{HTML5 parser<http://code.google.com/p/html5lib/>}; for all other cases the simple XML parser, part of the core Python environment, is used. This may be significant in the case of erroneous sources: indeed, the HTML5 parser may do adjustments on +the DOM tree before handing it over to the distiller. Furthermore, SVG is also recognized as a type that allows embedded RDF in the form of RDF/XML. + +See the variables in the L{host} module if a new host language is added to the system. The current host language information is available for transformers via the option argument, too, and can be used to control the effect of the transformer. + +Vocabularies +============ + +RDFa 1.1 has the notion of vocabulary files (using the C{@vocab} attribute) that may be used to expand the generated RDF graph. Expansion is based on some very simple RDF Schema and OWL statements on sub-properties and sub-classes, and equivalences. + +pyRdfa implements this feature, although it does not do this by default. The extra C{vocab_expansion} parameter should be used for this extra step, for example:: + from pyRdfa.options import Options + options = Options(vocab_expansion=True) + print pyRdfa(options=options).rdf_from_source('filename') + +The triples in the vocabulary files themselves (i.e., the small ontology in RDF Schema and OWL) are removed from the result, leaving the inferred property and type relationships only (additionally to the “core” RDF content). + +Vocabulary caching +------------------ + +By default, pyRdfa uses a caching mechanism instead of fetching the vocabulary files each time their URI is met as a C{@vocab} attribute value. (This behavior can be switched off by setting the C{vocab_cache} option to false.) + +Caching happens in a file system directory. The directory itself is determined by the platform the tool is used on, namely: + - On Windows, it is the C{pyRdfa-cache} subdirectory of the C{%APPDATA%} environment variable + - On MacOS, it is the C{~/Library/Application Support/pyRdfa-cache} + - Otherwise, it is the C{~/.pyRdfa-cache} + +This automatic choice can be overridden by the C{PyRdfaCacheDir} environment variable. + +Caching can be set to be read-only, i.e., the setup might generate the cache files off-line instead of letting the tool write its own cache when operating, e.g., as a service on the Web. This can be achieved by making the cache directory read only.
+
+If the directories are neither readable nor writable, the vocabulary files are retrieved via HTTP every time they are hit. This may slow down processing considerably; such a setup is best avoided.
+
+The cache consists of a separate index file and one file per vocabulary file. Cache control is based on the C{EXPIRES} field of a vocabulary file's HTTP response header: when first seen, this date is stored in the index file and controls whether the cached entry has to be renewed or not. If the HTTP response header does not have this field, the date is artificially set to the current date plus one day.
+
+(The cache files themselves are dumped and loaded using U{Python's built-in cPickle package<http://docs.python.org/release/2.7/library/pickle.html#module-cPickle>}. These are binary files. Care should be taken if they are managed by CVS: they must be declared as binary files when adding them to the repository.)
+
+RDFa 1.1 vs. RDFa 1.0
+=====================
+
+Unfortunately, RDFa 1.1 is I{not} fully backward compatible with RDFa 1.0, meaning that, in a few cases, the triples generated from an RDFa 1.1 source are not the same as for RDFa 1.0. (See the separate U{section in the RDFa 1.1 specification<http://www.w3.org/TR/rdfa-core/#major-differences-with-rdfa-syntax-1.0>} for some further details.)
+
+This distiller's default behavior is RDFa 1.1. However, if the source includes, in the top element of the file (e.g., the C{html} element), a C{@version} attribute whose value contains the C{RDFa 1.0} string, then the distiller switches to RDFa 1.0 mode. (Although the C{@version} attribute is not required in RDFa 1.0, it is fairly commonly used.) Similarly, if the RDFa 1.0 DTD is used in the XHTML source, it will be taken into account (a very frequent setup is that an XHTML file is defined with that DTD and is served as text/html; pyRdfa will consider that file to be XHTML5, i.e., parse it with the HTML5 parser, but interpret the RDFa attributes under the RDFa 1.0 rules).
+
+Transformers
+============
+
+The package uses the concept of 'transformers': the parsed DOM tree is possibly
+transformed I{before} performing the real RDFa processing. This transformer structure makes it possible to
+add extra 'services' without distorting the core RDFa processing code.
+
+A transformer is a function with three arguments:
+
+ - C{node}: a DOM node for the top level element of the DOM tree
+ - C{options}: the current L{Options} instance
+ - C{state}: the current L{ExecutionContext} instance, corresponding to the top level DOM tree element
+
+The function may perform any type of change on the DOM tree; the typical behaviour is to add or remove attributes on specific elements. Some transformations are included in the package and can be used as examples; see the L{transform} module of the distribution. These are:
+
+ - copying the C{@name} attribute of the C{meta} element into a C{@property} attribute of the same element
+ - interpreting the 'openid' references in the header; see L{transform.OpenID} for further details
+ - implementing the Dublin Core dialect to include DC statements from the header; see L{transform.DublinCore} for further details
+
+The user of the package may add these transformers to the L{Options} instance.
Here is a possible usage with the “openid” transformer added to the call::
+	from pyRdfa.options import Options
+	from pyRdfa.transform.OpenID import OpenID_transform
+	options = Options(transformers=[OpenID_transform])
+	print pyRdfa(options=options).rdf_from_source('filename')
+
+
+@summary: RDFa parser (distiller)
+@requires: Python version 2.5 or up; 2.7 is preferred
+@requires: U{RDFLib<http://rdflib.net>}; version 3.X is preferred.
+@requires: U{html5lib<http://code.google.com/p/html5lib/>} for the HTML5 parsing.
+@requires: U{httpheader<http://deron.meranda.us/python/httpheader/>}; however, a small modification had to be made to the original file, so for this reason, and to make distribution easier, this (single file) module is added to the package.
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+
+@var builtInTransformers: List of built-in transformers that are to be run regardless, because they are part of the RDFa spec
+@var CACHE_DIR_VAR: Environment variable used to define cache directories for RDFa vocabularies in case the default setting does not work or is not appropriate.
+@var rdfa_current_version: Current "official" version of RDFa that this package implements by default. This can be changed at the invocation of the package.
+@var uri_schemes: List of registered (or widely used) URI schemes; used for warnings...
+"""
+
+"""
+ $Id: __init__.py,v 1.82 2012/08/21 10:28:50 ivan Exp $
+"""
+
+__version__ = "3.4.3"
+__author__ = 'Ivan Herman'
+__contact__ = 'Ivan Herman, ivan@w3.org'
+__license__ = 'W3C® SOFTWARE NOTICE AND LICENSE, http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231'
+
+import sys
+PY3 = (sys.version_info[0] >= 3)
+
+if PY3 :
+	from io import StringIO
+else :
+	from StringIO import StringIO
+
+import os
+
+import rdflib
+from rdflib import URIRef
+from rdflib import Literal
+from rdflib import BNode
+from rdflib import Namespace
+if rdflib.__version__ >= "3.0.0" :
+	from rdflib import RDF as ns_rdf
+	from rdflib import RDFS as ns_rdfs
+	from rdflib import Graph
+else :
+	from rdflib.RDFS import RDFSNS as ns_rdfs
+	from rdflib.RDF import RDFNS as ns_rdf
+	from rdflib.Graph import Graph
+
+from pyRdfa.extras.httpheader import acceptable_content_type, content_type
+
+import xml.dom.minidom
+
+if PY3 :
+	from urllib.parse import urlparse
+else :
+	from urlparse import urlparse
+
+# Namespace, in the RDFLib sense, for the rdfa vocabulary
+ns_rdfa = Namespace("http://www.w3.org/ns/rdfa#")
+
+# Vocabulary terms for vocab reporting
+RDFA_VOCAB = ns_rdfa["usesVocabulary"]
+
+# Namespace, in the RDFLib sense, for the XSD Datatypes
+ns_xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
+
+# Namespace, in the RDFLib sense, for the distiller vocabulary, used as part of the processor graph
+ns_distill = Namespace("http://www.w3.org/2007/08/pyRdfa/vocab#")
+
+debug = False
+
+#########################################################################################################
+
+# Exception/error handling. Essentially, all the different exceptions are re-packaged into
+# separate exception classes, to allow for easier management on the user level
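+
+# A hedged usage sketch (the file name is hypothetical): since the public entry
+# points re-raise these exceptions when rdfOutput is False, a caller may catch
+# the common superclass defined below, e.g.:
+#
+#	try :
+#		print pyRdfa().rdf_from_source('filename')
+#	except RDFaError :
+#		...		# handle the RDFa-specific error condition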
+
+class RDFaError(Exception) :
+	"""Superclass of exceptions representing error conditions defined by the RDFa 1.1 specification.
+	It does not add any new functionality to the
+	Exception class."""
+	def __init__(self, msg) :
+		self.msg = msg
+		Exception.__init__(self)
+
+class FailedSource(RDFaError) :
+	"""Raised when the original source cannot be accessed. It does not add any new functionality to the
+	Exception class."""
+	def __init__(self, msg, http_code = None) :
+		self.msg = msg
+		self.http_code = http_code
+		RDFaError.__init__(self, msg)
+
+class HTTPError(RDFaError) :
+	"""Raised when HTTP problems are detected. It does not add any new functionality to the
+	Exception class."""
+	def __init__(self, http_msg, http_code) :
+		self.msg = http_msg
+		self.http_code = http_code
+		RDFaError.__init__(self,http_msg)
+
+class ProcessingError(RDFaError) :
+	"""Error found during processing. It does not add any new functionality to the
+	Exception class."""
+	pass
+
+class pyRdfaError(Exception) :
+	"""Superclass of exceptions representing error conditions outside the RDFa 1.1 specification."""
+	pass
+
+# Error and Warning RDFS classes
+RDFA_Error = ns_rdfa["Error"]
+RDFA_Warning = ns_rdfa["Warning"]
+RDFA_Info = ns_rdfa["Information"]
+NonConformantMarkup = ns_rdfa["DocumentError"]
+UnresolvablePrefix = ns_rdfa["UnresolvedCURIEPrefix"]
+UnresolvableReference = ns_rdfa["UnresolvedCURIEReference"]
+UnresolvableTerm = ns_rdfa["UnresolvedTerm"]
+VocabReferenceError = ns_rdfa["VocabReferenceError"]
+
+FileReferenceError = ns_distill["FileReferenceError"]
+HTError = ns_distill["HTTPError"]
+IncorrectPrefixDefinition = ns_distill["IncorrectPrefixDefinition"]
+IncorrectBlankNodeUsage = ns_distill["IncorrectBlankNodeUsage"]
+IncorrectLiteral = ns_distill["IncorrectLiteral"]
+
+# Error message texts
+err_no_blank_node = "Blank node in %s position is not allowed; ignored"
+
+err_redefining_URI_as_prefix = "'%s' is a registered or an otherwise used URI scheme, but is defined as a prefix here; is this a mistake? (See, e.g., http://en.wikipedia.org/wiki/URI_scheme or http://www.iana.org/assignments/uri-schemes.html for further information on most of the URI schemes)"
+err_xmlns_deprecated = "The usage of 'xmlns' for prefix definition is deprecated; please use the 'prefix' attribute instead (definition for '%s')"
+err_bnode_local_prefix = "The '_' local CURIE prefix is reserved for blank nodes, and cannot be defined as a prefix"
+err_col_local_prefix = "The character ':' is not valid in a CURIE Prefix, and cannot be used in a prefix definition (definition for '%s')"
+err_missing_URI_prefix = "Missing URI in prefix declaration for '%s' (in '%s')"
+err_invalid_prefix = "Invalid prefix declaration '%s' (in '%s')"
+err_no_default_prefix = "Default prefix cannot be changed (in '%s')"
+err_prefix_and_xmlns = "@prefix setting for '%s' overrides the 'xmlns:%s' setting; may be a source of problems if the same file is run through RDFa 1.0"
+err_non_ncname_prefix = "Non-NCNAME '%s' in prefix definition (in '%s'); ignored"
+err_absolute_reference = "CURIE Reference part contains an authority part: %s (in '%s'); ignored"
+err_query_reference = "CURIE Reference query part contains an unauthorized character: %s (in '%s'); ignored"
+err_fragment_reference = "CURIE Reference fragment part contains an unauthorized character: %s (in '%s'); ignored"
+err_lang = "There is a problem with the language setting: either both xml:lang and lang are used on an element with different values, or, for (X)HTML5, only xml:lang is used."
+err_URI_scheme = "Unusual URI scheme used in <%s>; may that be a mistake, e.g., resulting from using an undefined CURIE prefix or an incorrect CURIE?" +err_illegal_safe_CURIE = "Illegal safe CURIE: %s; ignored" +err_no_CURIE_in_safe_CURIE = "Safe CURIE is used, but the value does not correspond to a defined CURIE: [%s]; ignored" +err_undefined_terms = "'%s' is used as a term, but has not been defined as such; ignored" +err_non_legal_CURIE_ref = "Relative URI is not allowed in this position (or not a legal CURIE reference) '%s'; ignored" +err_undefined_CURIE = "Undefined CURIE: '%s'; ignored" + +err_unusual_char_in_URI = "Unusual character in uri: %s; possible error?" + +############################################################################################# + +from pyRdfa.state import ExecutionContext +from pyRdfa.parse import parse_one_node +from pyRdfa.options import Options +from pyRdfa.transform import top_about, empty_safe_curie, vocab_for_role +from pyRdfa.utils import URIOpener +from pyRdfa.host import HostLanguage, MediaTypes, preferred_suffixes, content_to_host_language + +# Environment variable used to characterize cache directories for RDFa vocabulary files. +CACHE_DIR_VAR = "PyRdfaCacheDir" + +# current "official" version of RDFa that this package implements. This can be changed at the invocation of the package +rdfa_current_version = "1.1" + +# I removed schemes that would not appear as a prefix anyway, like iris.beep +# http://en.wikipedia.org/wiki/URI_scheme seems to be a good source of information +# as well as http://www.iana.org/assignments/uri-schemes.html +# There are some overlaps here, but better more than not enough... + +# This comes from wikipedia +registered_iana_schemes = [ + "aaa","aaas","acap","cap","cid","crid","data","dav","dict","dns","fax","file", "ftp","geo","go", + "gopher","h323","http","https","iax","icap","im","imap","info","ipp","iris","ldap", "lsid", + "mailto","mid","modem","msrp","msrps", "mtqp", "mupdate","news","nfs","nntp","opaquelocktoken", + "pop","pres", "prospero","rstp","rsync", "service","shttp","sieve","sip","sips", "sms", "snmp", "soap", "tag", + "tel","telnet", "tftp", "thismessage","tn3270","tip","tv","urn","vemmi","wais","ws", "wss", "xmpp" +] + +# This comes from wikipedia, too +unofficial_common = [ + "about", "adiumxtra", "aim", "apt", "afp", "aw", "bitcoin", "bolo", "callto", "chrome", "coap", + "content", "cvs", "doi", "ed2k", "facetime", "feed", "finger", "fish", "git", "gg", + "gizmoproject", "gtalk", "irc", "ircs", "irc6", "itms", "jar", "javascript", + "keyparc", "lastfm", "ldaps", "magnet", "maps", "market", "message", "mms", + "msnim", "mumble", "mvn", "notes", "palm", "paparazzi", "psync", "rmi", + "secondlife", "sgn", "skype", "spotify", "ssh", "sftp", "smb", "soldat", + "steam", "svn", "teamspeak", "things", "udb", "unreal", "ut2004", + "ventrillo", "view-source", "webcal", "wtai", "wyciwyg", "xfire", "xri", "ymsgr" +] + +# These come from the IANA page +historical_iana_schemes = [ + "fax", "mailserver", "modem", "pack", "prospero", "snews", "videotex", "wais" +] + +provisional_iana_schemes = [ + "afs", "dtn", "dvb", "icon", "ipn", "jms", "oid", "rsync", "ni" +] + +other_used_schemes = [ + "hdl", "isbn", "issn", "mstp", "rtmp", "rtspu", "stp" +] + +uri_schemes = registered_iana_schemes + unofficial_common + historical_iana_schemes + provisional_iana_schemes + other_used_schemes + +# List of built-in transformers that are to be run regardless, because they are part of the RDFa spec +builtInTransformers = [ + 
	empty_safe_curie, top_about, vocab_for_role
+]
+
+#########################################################################################################
+class pyRdfa :
+	"""Main processing class for the distiller.
+
+	@ivar options: an instance of the L{Options} class
+	@ivar media_type: the preferred default media type, possibly set at initialization
+	@ivar base: the base value, possibly set at initialization
+	@ivar http_status: HTTP status, to be returned when the package is used via a CGI entry. Initially set to 200, may be modified by exception handlers
+	"""
+	def __init__(self, options = None, base = "", media_type = "", rdfa_version = None) :
+		"""
+		@keyword options: Options for the distiller
+		@type options: L{Options}
+		@keyword base: URI for the default "base" value (usually the URI of the file to be processed)
+		@keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source
+		@keyword rdfa_version: the RDFa version that should be used. If not set, the value of the global L{rdfa_current_version} variable is used
+		"""
+		self.http_status = 200
+
+		self.base = base
+		if base == "" :
+			self.required_base = None
+		else :
+			self.required_base = base
+		self.charset = None
+
+		# predefined content type
+		self.media_type = media_type
+
+		if options == None :
+			self.options = Options()
+		else :
+			self.options = options
+
+		if media_type != "" :
+			self.options.set_host_language(self.media_type)
+
+		if rdfa_version is not None :
+			self.rdfa_version = rdfa_version
+		else :
+			self.rdfa_version = None
+
+	def _get_input(self, name) :
+		"""
+		Tries to guess whether "name" is a URI or a string (for a file); it then tries to open the source accordingly,
+		returning a file-like object. If "name" is neither of these, the input argument is returned unchanged (in that case it is
+		supposed to be a file-like object already).
+
+		If the media type has not been set explicitly at initialization of this instance,
+		the method also sets the media_type, based on the HTTP GET response or the suffix of the file. See
+		L{host.preferred_suffixes} for the suffix-to-media-type mapping.
+ + @param name: identifier of the input source + @type name: string or a file-like object + @return: a file like object if opening "name" is possible and successful, "name" otherwise + """ + try : + # Python 2 branch + isstring = isinstance(name, basestring) + except : + # Python 3 branch + isstring = isinstance(name, str) + + try : + if isstring : + # check if this is a URI, ie, if there is a valid 'scheme' part + # otherwise it is considered to be a simple file + if urlparse(name)[0] != "" : + url_request = URIOpener(name) + self.base = url_request.location + if self.media_type == "" : + if url_request.content_type in content_to_host_language : + self.media_type = url_request.content_type + else : + self.media_type = MediaTypes.xml + self.options.set_host_language(self.media_type) + self.charset = url_request.charset + if self.required_base == None : + self.required_base = name + return url_request.data + else : + self.base = name + # Creating a File URI for this thing + if self.required_base == None : + self.required_base = "file://" + os.path.join(os.getcwd(),name) + if self.media_type == "" : + self.media_type = MediaTypes.xml + # see if the default should be overwritten + for suffix in preferred_suffixes : + if name.endswith(suffix) : + self.media_type = preferred_suffixes[suffix] + self.charset = 'utf-8' + break + self.options.set_host_language(self.media_type) + return file(name) + else : + return name + except HTTPError : + raise sys.exc_info()[1] + except : + (type, value, traceback) = sys.exc_info() + raise FailedSource(value) + + #################################################################################################################### + # Externally used methods + # + def graph_from_DOM(self, dom, graph = None, pgraph = None) : + """ + Extract the RDF Graph from a DOM tree. This is where the real processing happens. All other methods get down to this + one, eventually (e.g., after opening a URI and parsing it into a DOM). + @param dom: a DOM Node element, the top level entry node for the whole tree (i.e., the C{dom.documentElement} is used to initiate processing down the node hierarchy) + @keyword graph: an RDF Graph (if None, than a new one is created) + @type graph: rdflib Graph instance. + @keyword pgraph: an RDF Graph to hold (possibly) the processor graph content. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph. + @type pgraph: rdflib Graph instance + @return: an RDF Graph + @rtype: rdflib Graph instance + """ + def copyGraph(tog, fromg) : + for t in fromg : + tog.add(t) + for k,ns in fromg.namespaces() : + tog.bind(k,ns) + + if graph == None : + # Create the RDF Graph, that will contain the return triples... + graph = Graph() + + # this will collect the content, the 'default graph', as called in the RDFa spec + default_graph = Graph() + + # get the DOM tree + topElement = dom.documentElement + + # Create the initial state. This takes care of things + # like base, top level namespace settings, etc. + state = ExecutionContext(topElement, default_graph, base=self.base, options=self.options, rdfa_version=self.rdfa_version) + + # Perform the built-in and external transformations on the HTML tree. 
+ for trans in self.options.transformers + builtInTransformers : + trans(topElement, self.options, state) + + # This may have changed if the state setting detected an explicit version information: + self.rdfa_version = state.rdfa_version + + # The top level subject starts with the current document; this + # is used by the recursion + # this function is the real workhorse + parse_one_node(topElement, default_graph, None, state, []) + + # If the RDFS expansion has to be made, here is the place... + if self.options.vocab_expansion : + from pyRdfa.rdfs.process import process_rdfa_sem + process_rdfa_sem(default_graph, self.options) + + # What should be returned depends on the way the options have been set up + if self.options.output_default_graph : + copyGraph(graph, default_graph) + if self.options.output_processor_graph : + if pgraph != None : + copyGraph(pgraph, self.options.processor_graph.graph) + else : + copyGraph(graph, self.options.processor_graph.graph) + elif self.options.output_processor_graph : + if pgraph != None : + copyGraph(pgraph, self.options.processor_graph.graph) + else : + copyGraph(graph, self.options.processor_graph.graph) + + # this is necessary if several DOM trees are handled in a row... + self.options.reset_processor_graph() + + return graph + + def graph_from_source(self, name, graph = None, rdfOutput = False, pgraph = None) : + """ + Extract an RDF graph from an RDFa source. The source is parsed, the RDF extracted, and the RDFa Graph is + returned. This is a front-end to the L{pyRdfa.graph_from_DOM} method. + + @param name: a URI, a file name, or a file-like object + @param graph: rdflib Graph instance. If None, a new one is created. + @param pgraph: rdflib Graph instance for the processor graph. If None, and the error/warning triples are to be generated, they will be added to the returned graph. Otherwise they are stored in this graph. + @param rdfOutput: whether runtime exceptions should be turned into RDF and returned as part of the processor graph + @return: an RDF Graph + @rtype: rdflib Graph instance + """ + def copyErrors(tog, options) : + if tog == None : + tog = Graph() + if options.output_processor_graph : + for t in options.processor_graph.graph : + tog.add(t) + for k,ns in options.processor_graph.graph.namespaces() : + tog.bind(k,ns) + options.reset_processor_graph() + return tog + + # Separating this for a forward Python 3 compatibility + try : + # Python 2 branch + isstring = isinstance(name, basestring) + except : + # Python 3 branch + isstring = isinstance(name, str) + + try : + # First, open the source... 
Possible HTTP errors are returned as error triples + input = None + try : + input = self._get_input(name) + except FailedSource : + f = sys.exc_info()[1] + self.http_status = 400 + if not rdfOutput : raise f + err = self.options.add_error(f.msg, FileReferenceError, name) + self.options.processor_graph.add_http_context(err, 400) + return copyErrors(graph, self.options) + except HTTPError : + h = sys.exc_info()[1] + self.http_status = h.http_code + if not rdfOutput : raise h + err = self.options.add_error("HTTP Error: %s (%s)" % (h.http_code,h.msg), HTError, name) + self.options.processor_graph.add_http_context(err, h.http_code) + return copyErrors(graph, self.options) + except Exception : + e = sys.exc_info()[1] + self.http_status = 500 + # Something nasty happened:-( + if not rdfOutput : raise e + err = self.options.add_error(str(e), context = name) + self.options.processor_graph.add_http_context(err, 500) + return copyErrors(graph, self.options) + + dom = None + try : + msg = "" + parser = None + if self.options.host_language == HostLanguage.html5 : + import warnings + warnings.filterwarnings("ignore", category=DeprecationWarning) + import html5lib + parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom")) + if self.charset : + # This means the HTTP header has provided a charset, or the + # file is a local file when we suppose it to be a utf-8 + dom = parser.parse(input, encoding=self.charset) + else : + # No charset set. The HTMLLib parser tries to sniff into the + # the file to find a meta header for the charset; if that + # works, fine, otherwise it falls back on window-... + dom = parser.parse(input) + + try : + if isstring : + input.close() + input = self._get_input(name) + else : + input.seek(0) + from pyRdfa.host import adjust_html_version + self.rdfa_version = adjust_html_version(input, self.rdfa_version) + except : + # if anyting goes wrong, it is not really important; rdfa version stays what it was... + pass + + else : + # in other cases an XML parser has to be used + from pyRdfa.host import adjust_xhtml_and_version + parse = xml.dom.minidom.parse + dom = parse(input) + (adjusted_host_language, version) = adjust_xhtml_and_version(dom, self.options.host_language, self.rdfa_version) + self.options.host_language = adjusted_host_language + self.rdfa_version = version + except ImportError : + msg = "HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>" + raise ImportError(msg) + except Exception : + e = sys.exc_info()[1] + # These are various parsing exception. Per spec, this is a case when + # error triples MUST be returned, ie, the usage of rdfOutput (which switches between an HTML formatted + # return page or a graph with error triples) does not apply + err = self.options.add_error(str(e), context = name) + self.http_status = 400 + self.options.processor_graph.add_http_context(err, 400) + return copyErrors(graph, self.options) + + # If we got here, we have a DOM tree to operate on... + return self.graph_from_DOM(dom, graph, pgraph) + except Exception : + # Something nasty happened during the generation of the graph... 
+
+			(a,b,c) = sys.exc_info()
+			sys.excepthook(a,b,c)
+			if isinstance(b, ImportError) :
+				self.http_status = None
+			else :
+				self.http_status = 500
+			if not rdfOutput : raise b
+			err = self.options.add_error(str(b), context = name)
+			self.options.processor_graph.add_http_context(err, 500)
+			return copyErrors(graph, self.options)
+
+	def rdf_from_sources(self, names, outputFormat = "turtle", rdfOutput = False) :
+		"""
+		Extract an RDF graph from a list of RDFa sources and serialize the result in one graph. The sources are parsed, the RDF
+		extracted, and serialization is done in the specified format.
+		@param names: list of sources; each can be a URI, a file name, or a file-like object
+		@keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json", or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
+		@keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible for handling it; otherwise a graph is returned with an error message included in the processor graph
+		@type rdfOutput: boolean
+		@return: a serialized RDF Graph
+		@rtype: string
+		"""
+		# This is better because it gives access to the various, non-standard serializations
+		# If it does not work because the extras are not installed, fall back to the standard
+		# RDFLib distribution...
+		try :
+			from pyRdfaExtras import MyGraph
+			graph = MyGraph()
+		except :
+			graph = Graph()
+
+		graph.bind("xsd", Namespace('http://www.w3.org/2001/XMLSchema#'))
+		# the value of rdfOutput determines the reaction on exceptions...
+		for name in names :
+			self.graph_from_source(name, graph, rdfOutput)
+		retval = graph.serialize(format=outputFormat)
+		return retval
+
+	def rdf_from_source(self, name, outputFormat = "turtle", rdfOutput = False) :
+		"""
+		Extract an RDF graph from an RDFa source and serialize it. The source is parsed, the RDF
+		extracted, and serialization is done in the specified format.
+		@param name: a URI, a file name, or a file-like object
+		@keyword outputFormat: serialization format. Can be one of "turtle", "n3", "xml", "pretty-xml", "nt", "json", or "json-ld". "turtle" and "n3", "xml" and "pretty-xml", and "json" and "json-ld" are synonyms, respectively. Note that the JSON-LD serialization works with RDFLib 3.* only.
+		@keyword rdfOutput: controls what happens in case an exception is raised. If the value is False, the caller is responsible for handling it; otherwise a graph is returned with an error message included in the processor graph
+		@type rdfOutput: boolean
+		@return: a serialized RDF Graph
+		@rtype: string
+		"""
+		return self.rdf_from_sources([name], outputFormat, rdfOutput)
+
+################################################# CGI Entry point
+def processURI(uri, outputFormat, form={}) :
+	"""The standard processing of an RDFa URI, with options in a form; used as an entry point from a CGI call.
+
+	The call accepts extra form options (i.e., HTTP GET options) as follows:
+
+	 - C{graph=[output|processor|output,processor|processor,output]} specifying which graphs are returned. Default: C{output}
+	 - C{space_preserve=[true|false]} means that plain literals are normalized in terms of white spaces. Default: C{false}
+	 - C{rdfa_version} provides the RDFa version that should be used for distilling. The string should be of the form "1.0" or "1.1".
Default is the highest version the current package implements, currently "1.1" + - C{host_language=[xhtml,html,xml]} : the host language. Used when files are uploaded or text is added verbatim, otherwise the HTTP return header should be used. Default C{xml} + - C{embedded_rdf=[true|false]} : whether embedded turtle or RDF/XML content should be added to the output graph. Default: C{false} + - C{vocab_expansion=[true|false]} : whether the vocabularies should be expanded through the restricted RDFS entailment. Default: C{false} + - C{vocab_cache=[true|false]} : whether vocab caching should be performed or whether it should be ignored and vocabulary files should be picked up every time. Default: C{false} + - C{vocab_cache_report=[true|false]} : whether vocab caching details should be reported. Default: C{false} + - C{vocab_cache_bypass=[true|false]} : whether vocab caches have to be regenerated every time. Default: C{false} + - C{rdfa_lite=[true|false]} : whether warnings should be generated for non RDFa Lite attribute usage. Default: C{false} + + @param uri: URI to access. Note that the C{text:} and C{uploaded:} fake URI values are treated separately; the former is for textual intput (in which case a StringIO is used to get the data) and the latter is for uploaded file, where the form gives access to the file directly. + @param outputFormat: serialization format, as defined by the package. Currently "xml", "turtle", "nt", or "json". Default is "turtle", also used if any other string is given. + @param form: extra call options (from the CGI call) to set up the local options + @type form: cgi FieldStorage instance + @return: serialized graph + @rtype: string + """ + def _get_option(param, compare_value, default) : + param_old = param.replace('_','-') + if param in list(form.keys()) : + val = form.getfirst(param).lower() + return val == compare_value + elif param_old in list(form.keys()) : + # this is to ensure the old style parameters are still valid... + # in the old days I used '-' in the parameters, the standard favours '_' + val = form.getfirst(param_old).lower() + return val == compare_value + else : + return default + + if uri == "uploaded:" : + input = form["uploaded"].file + base = "" + elif uri == "text:" : + input = StringIO(form.getfirst("text")) + base = "" + else : + input = uri + base = uri + + if "rdfa_version" in list(form.keys()) : + rdfa_version = form.getfirst("rdfa_version") + else : + rdfa_version = None + + # working through the possible options + # Host language: HTML, XHTML, or XML + # Note that these options should be used for the upload and inline version only in case of a form + # for real uris the returned content type should be used + if "host_language" in list(form.keys()) : + if form.getfirst("host_language").lower() == "xhtml" : + media_type = MediaTypes.xhtml + elif form.getfirst("host_language").lower() == "html" : + media_type = MediaTypes.html + elif form.getfirst("host_language").lower() == "svg" : + media_type = MediaTypes.svg + elif form.getfirst("host_language").lower() == "atom" : + media_type = MediaTypes.atom + else : + media_type = MediaTypes.xml + else : + media_type = "" + + transformers = [] + + if "rdfa_lite" in list(form.keys()) and form.getfirst("rdfa_lite").lower() == "true" : + from pyRdfa.transform.lite import lite_prune + transformers.append(lite_prune) + + # The code below is left for backward compatibility only. 
In fact, these options are not exposed any more, + # they are not really in use + if "extras" in list(form.keys()) and form.getfirst("extras").lower() == "true" : + from pyRdfa.transform.metaname import meta_transform + from pyRdfa.transform.OpenID import OpenID_transform + from pyRdfa.transform.DublinCore import DC_transform + for t in [OpenID_transform, DC_transform, meta_transform] : + transformers.append(t) + else : + if "extra-meta" in list(form.keys()) and form.getfirst("extra-meta").lower() == "true" : + from pyRdfa.transform.metaname import meta_transform + transformers.append(meta_transform) + if "extra-openid" in list(form.keys()) and form.getfirst("extra-openid").lower() == "true" : + from pyRdfa.transform.OpenID import OpenID_transform + transformers.append(OpenID_transform) + if "extra-dc" in list(form.keys()) and form.getfirst("extra-dc").lower() == "true" : + from pyRdfa.transform.DublinCore import DC_transform + transformers.append(DC_transform) + + output_default_graph = True + output_processor_graph = False + # Note that I use the 'graph' and the 'rdfagraph' form keys here. Reason is that + # I used 'graph' in the previous versions, including the RDFa 1.0 processor, + # so if I removed that altogether that would create backward incompatibilities + # On the other hand, the RDFa 1.1 doc clearly refers to 'rdfagraph' as the standard + # key. + a = None + if "graph" in list(form.keys()) : + a = form.getfirst("graph").lower() + elif "rdfagraph" in list(form.keys()) : + a = form.getfirst("rdfagraph").lower() + if a != None : + if a == "processor" : + output_default_graph = False + output_processor_graph = True + elif a == "processor,output" or a == "output,processor" : + output_processor_graph = True + + embedded_rdf = _get_option( "embedded_rdf", "true", False) + space_preserve = _get_option( "space_preserve", "true", True) + vocab_cache = _get_option( "vocab_cache", "true", True) + vocab_cache_report = _get_option( "vocab_cache_report", "true", False) + refresh_vocab_cache = _get_option( "vocab_cache_refresh", "true", False) + vocab_expansion = _get_option( "vocab_expansion", "true", False) + if vocab_cache_report : output_processor_graph = True + + options = Options(output_default_graph = output_default_graph, + output_processor_graph = output_processor_graph, + space_preserve = space_preserve, + transformers = transformers, + vocab_cache = vocab_cache, + vocab_cache_report = vocab_cache_report, + refresh_vocab_cache = refresh_vocab_cache, + vocab_expansion = vocab_expansion, + embedded_rdf = embedded_rdf + ) + processor = pyRdfa(options = options, base = base, media_type = media_type, rdfa_version = rdfa_version) + + # Decide the output format; the issue is what should happen in case of a top level error like an inaccessibility of + # the html source: should a graph be returned or an HTML page with an error message? + + # decide whether HTML or RDF should be sent. 
+ htmlOutput = False + #if 'HTTP_ACCEPT' in os.environ : + # acc = os.environ['HTTP_ACCEPT'] + # possibilities = ['text/html', + # 'application/rdf+xml', + # 'text/turtle; charset=utf-8', + # 'application/json', + # 'application/ld+json', + # 'text/rdf+n3'] + # + # # this nice module does content negotiation and returns the preferred format + # sg = acceptable_content_type(acc, possibilities) + # htmlOutput = (sg != None and sg[0] == content_type('text/html')) + # os.environ['rdfaerror'] = 'true' + + # This is really for testing purposes only, it is an unpublished flag to force RDF output no + # matter what + try : + graph = processor.rdf_from_source(input, outputFormat, rdfOutput = ("forceRDFOutput" in list(form.keys())) or not htmlOutput) + if outputFormat == "n3" : + retval = 'Content-Type: text/rdf+n3; charset=utf-8\n' + elif outputFormat == "nt" or outputFormat == "turtle" : + retval = 'Content-Type: text/turtle; charset=utf-8\n' + elif outputFormat == "json-ld" or outputFormat == "json" : + retval = 'Content-Type: application/json; charset=utf-8\n' + else : + retval = 'Content-Type: application/rdf+xml; charset=utf-8\n' + retval += '\n' + retval += graph + return retval + except HTTPError : + (type,h,traceback) = sys.exc_info() + import cgi + + retval = 'Content-type: text/html; charset=utf-8\nStatus: %s \n\n' % h.http_code + retval += "<html>\n" + retval += "<head>\n" + retval += "<title>HTTP Error in distilling RDFa content</title>\n" + retval += "</head><body>\n" + retval += "<h1>HTTP Error in distilling RDFa content</h1>\n" + retval += "<p>HTTP Error: %s (%s)</p>\n" % (h.http_code,h.msg) + retval += "<p>On URI: <code>'%s'</code></p>\n" % cgi.escape(uri) + retval +="</body>\n" + retval +="</html>\n" + return retval + except : + # This branch should occur only if an exception is really raised, ie, if it is not turned + # into a graph value. 
+ (type,value,traceback) = sys.exc_info() + + import traceback, cgi + + retval = 'Content-type: text/html; charset=utf-8\nStatus: %s\n\n' % processor.http_status + retval += "<html>\n" + retval += "<head>\n" + retval += "<title>Exception in RDFa processing</title>\n" + retval += "</head><body>\n" + retval += "<h1>Exception in distilling RDFa</h1>\n" + retval += "<pre>\n" + strio = StringIO() + traceback.print_exc(file=strio) + retval += strio.getvalue() + retval +="</pre>\n" + retval +="<pre>%s</pre>\n" % value + retval +="<h1>Distiller request details</h1>\n" + retval +="<dl>\n" + if uri == "text:" and "text" in form and form["text"].value != None and len(form["text"].value.strip()) != 0 : + retval +="<dt>Text input:</dt><dd>%s</dd>\n" % cgi.escape(form["text"].value).replace('\n','<br/>') + elif uri == "uploaded:" : + retval +="<dt>Uploaded file</dt>\n" + else : + retval +="<dt>URI received:</dt><dd><code>'%s'</code></dd>\n" % cgi.escape(uri) + if "host_language" in list(form.keys()) : + retval +="<dt>Media Type:</dt><dd>%s</dd>\n" % media_type + if "graph" in list(form.keys()) : + retval +="<dt>Requested graphs:</dt><dd>%s</dd>\n" % form.getfirst("graph").lower() + else : + retval +="<dt>Requested graphs:</dt><dd>default</dd>\n" + retval +="<dt>Output serialization format:</dt><dd> %s</dd>\n" % outputFormat + if "space_preserve" in form : retval +="<dt>Space preserve:</dt><dd> %s</dd>\n" % form["space_preserve"].value + retval +="</dl>\n" + retval +="</body>\n" + retval +="</html>\n" + return retval diff --git a/rdflib/plugins/parsers/pyRdfa/embeddedRDF.py b/rdflib/plugins/parsers/pyRdfa/embeddedRDF.py new file mode 100644 index 00000000..d1300770 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/embeddedRDF.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +""" +Extracting possible embedded RDF/XML content from the file and parse it separately into the Graph. This is used, for example +by U{SVG 1.2 Tiny<http://www.w3.org/TR/SVGMobile12/>}. + +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +@contact: Ivan Herman, ivan@w3.org +@version: $Id: embeddedRDF.py,v 1.14 2012/05/18 15:31:13 ivan Exp $ +""" + +# Python 3 foolproof way... +try : + from io import StringIO +except : + from StringIO import StringIO + +from pyRdfa.host import HostLanguage, accept_embedded_rdf_xml, accept_embedded_turtle +from pyRdfa.utils import return_XML +import re, sys + +def handle_embeddedRDF(node, graph, state) : + """ + Handles embedded RDF. There are two possibilities: + + - the file is one of the XML dialects that allows for an embedded RDF/XML portion. See the L{host.accept_embedded_rdf_xml} for those (a typical example is SVG). + - the file is HTML and there is a turtle portion in the C{<script>} element with type text/turtle. + + @param node: a DOM node for the top level element + @param graph: target rdf graph + @type graph: RDFLib's Graph object instance + @param state: the inherited state (namespaces, lang, etc) + @type state: L{state.ExecutionContext} + @return: whether an RDF/XML or turtle content has been detected or not. If TRUE, the RDFa processing should not occur on the node and its descendents. 
+ @rtype: Boolean + """ + #def _get_prefixes_in_turtle() : + # retval = "" + # for key in state.term_or_curie.ns : + # retval += "@prefix %s: <%s> .\n" % (key, state.term_or_curie.ns[key]) + # retval += '\n' + # return retval + + # This feature is optional! + def _get_literal(Pnode): + """ + Get the full text + @param Pnode: DOM Node + @return: string + """ + rc = "" + for node in Pnode.childNodes: + if node.nodeType in [node.TEXT_NODE, node.CDATA_SECTION_NODE] : + rc = rc + node.data + # Sigh... the HTML5 parser does not recognize the CDATA escapes, ie, it just passes on the <![CDATA[ and ]]> strings:-( + return rc.replace("<![CDATA[","").replace("]]>","") + + if state.options.embedded_rdf : + # Embedded turtle, per the latest Turtle draft + if state.options.host_language in accept_embedded_turtle and node.nodeName.lower() == "script" : + if node.hasAttribute("type") and node.getAttribute("type") == "text/turtle" : + #prefixes = _get_prefixes_in_turtle() + #content = _get_literal(node) + #rdf = StringIO(prefixes + content) + content = _get_literal(node) + rdf = StringIO(content) + try : + graph.parse(rdf, format="n3", publicID = state.base) + state.options.add_info("The output graph includes triples coming from an embedded Turtle script") + except : + (type,value,traceback) = sys.exc_info() + state.options.add_error("Embedded Turtle content could not be parsed (problems with %s?); ignored" % value) + return True + elif state.options.host_language in accept_embedded_rdf_xml and node.localName == "RDF" and node.namespaceURI == "http://www.w3.org/1999/02/22-rdf-syntax-ns#" : + rdf = StringIO(return_XML(state, node)) + try : + graph.parse(rdf) + state.options.add_info("The output graph includes triples coming from an embedded RDF/XML subtree") + except : + (type,value,traceback) = sys.exc_info() + state.options.add_error("Embedded RDF/XML content could not parsed (problems with %s?); ignored" % value) + return True + else : + return False + else : + return False + diff --git a/rdflib/plugins/parsers/pyRdfa/extras/__init__.py b/rdflib/plugins/parsers/pyRdfa/extras/__init__.py new file mode 100644 index 00000000..51124777 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/extras/__init__.py @@ -0,0 +1,4 @@ +""" +Collection of external modules that are used by pyRdfa and are added for an easier +distribution +""" diff --git a/rdflib/plugins/parsers/pyRdfa/extras/httpheader.py b/rdflib/plugins/parsers/pyRdfa/extras/httpheader.py new file mode 100644 index 00000000..b923b470 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/extras/httpheader.py @@ -0,0 +1,2016 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +""" Utility functions to work with HTTP headers. + + This module provides some utility functions useful for parsing + and dealing with some of the HTTP 1.1 protocol headers which + are not adequately covered by the standard Python libraries. + + Requires Python 2.2 or later. + + The functionality includes the correct interpretation of the various + Accept-* style headers, content negotiation, byte range requests, + HTTP-style date/times, and more. + + There are a few classes defined by this module: + + * class content_type -- media types such as 'text/plain' + * class language_tag -- language tags such as 'en-US' + * class range_set -- a collection of (byte) range specifiers + * class range_spec -- a single (byte) range specifier + + The primary functions in this module may be categorized as follows: + + * Content negotiation functions... 
+ * acceptable_content_type() + * acceptable_language() + * acceptable_charset() + * acceptable_encoding() + + * Mid-level header parsing functions... + * parse_accept_header() + * parse_accept_language_header() + * parse_range_header() + + * Date and time... + * http_datetime() + * parse_http_datetime() + + * Utility functions... + * quote_string() + * remove_comments() + * canonical_charset() + + * Low level string parsing functions... + * parse_comma_list() + * parse_comment() + * parse_qvalue_accept_list() + * parse_media_type() + * parse_number() + * parse_parameter_list() + * parse_quoted_string() + * parse_range_set() + * parse_range_spec() + * parse_token() + * parse_token_or_quoted_string() + + And there are some specialized exception classes: + + * RangeUnsatisfiableError + * RangeUnmergableError + * ParseError + + See also: + + * RFC 2616, "Hypertext Transfer Protocol -- HTTP/1.1", June 1999. + <http://www.ietf.org/rfc/rfc2616.txt> + Errata at <http://purl.org/NET/http-errata> + * RFC 2046, "(MIME) Part Two: Media Types", November 1996. + <http://www.ietf.org/rfc/rfc2046.txt> + * RFC 3066, "Tags for the Identification of Languages", January 2001. + <http://www.ietf.org/rfc/rfc3066.txt> + + + Note: I have made a small modification on the regexp for internet date, + to make it more liberal (ie, accept a time zone string of the form +0000) + Ivan Herman <http://www.ivan-herman.net>, March 2011. + + Have added statements to make it (hopefully) Python 3 compatible. + Ivan Herman <http://www.ivan-herman.net>, August 2012. +""" + +__author__ = "Deron Meranda <http://deron.meranda.us/>" +__date__ = "2012-08-31" +__version__ = "1.02" +__credits__ = """Copyright (c) 2005 Deron E. Meranda <http://deron.meranda.us/> +Licensed under GNU LGPL 2.1 or later. See <http://www.fsf.org/>. + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +""" + +# Character classes from RFC 2616 section 2.2 +SEPARATORS = '()<>@,;:\\"/[]?={} \t' +LWS = ' \t\n\r' # linear white space +CRLF = '\r\n' +DIGIT = '0123456789' +HEX = '0123456789ABCDEFabcdef' + +import sys +PY3 = (sys.version_info[0] >= 3) + +# Try to get a set/frozenset implementation if possible +try: + type(frozenset) +except NameError: + try: + # The demset.py module is available at http://deron.meranda.us/ + from demset import set, frozenset + __emulating_set = True # So we can clean up global namespace later + except ImportError: + pass + +try: + # Turn character classes into set types (for Python 2.4 or greater) + SEPARATORS = frozenset([c for c in SEPARATORS]) + LWS = frozenset([c for c in LWS]) + CRLF = frozenset([c for c in CRLF]) + DIGIT = frozenset([c for c in DIGIT]) + HEX = frozenset([c for c in HEX]) + del c +except NameError: + # Python 2.3 or earlier, leave as simple strings + pass + + +def _is_string( obj ): + """Returns True if the object is a string or unicode type.""" + if PY3 : + return isinstance(obj,str) + else : + return isinstance(obj,str) or isinstance(obj,unicode) + + +def http_datetime( dt=None ): + """Formats a datetime as an HTTP 1.1 Date/Time string. + + Takes a standard Python datetime object and returns a string + formatted according to the HTTP 1.1 date/time format. + + If no datetime is provided (or None) then the current + time is used. + + ABOUT TIMEZONES: If the passed in datetime object is naive it is + assumed to be in UTC already. But if it has a tzinfo component, + the returned timestamp string will have been converted to UTC + automatically. So if you use timezone-aware datetimes, you need + not worry about conversion to UTC. + + """ + if not dt: + import datetime + dt = datetime.datetime.utcnow() + else: + try: + dt = dt - dt.utcoffset() + except: + pass # no timezone offset, just assume already in UTC + + s = dt.strftime('%a, %d %b %Y %H:%M:%S GMT') + return s + + +def parse_http_datetime( datestring, utc_tzinfo=None, strict=False ): + """Returns a datetime object from an HTTP 1.1 Date/Time string. + + Note that HTTP dates are always in UTC, so the returned datetime + object will also be in UTC. + + You can optionally pass in a tzinfo object which should represent + the UTC timezone, and the returned datetime will then be + timezone-aware (allowing you to more easly translate it into + different timzeones later). + + If you set 'strict' to True, then only the RFC 1123 format + is recognized. Otherwise the backwards-compatible RFC 1036 + and Unix asctime(3) formats are also recognized. + + Please note that the day-of-the-week is not validated. + Also two-digit years, although not HTTP 1.1 compliant, are + treated according to recommended Y2K rules. + + """ + import re, datetime + m = re.match(r'(?P<DOW>[a-z]+), (?P<D>\d+) (?P<MON>[a-z]+) (?P<Y>\d+) (?P<H>\d+):(?P<M>\d+):(?P<S>\d+(\.\d+)?) (?P<TZ>[a-zA-Z0-9_+]+)$', + datestring, re.IGNORECASE) + if not m and not strict: + m = re.match(r'(?P<DOW>[a-z]+) (?P<MON>[a-z]+) (?P<D>\d+) (?P<H>\d+):(?P<M>\d+):(?P<S>\d+) (?P<Y>\d+)$', + datestring, re.IGNORECASE) + if not m: + m = re.match(r'(?P<DOW>[a-z]+), (?P<D>\d+)-(?P<MON>[a-z]+)-(?P<Y>\d+) (?P<H>\d+):(?P<M>\d+):(?P<S>\d+(\.\d+)?) 
(?P<TZ>\w+)$', + datestring, re.IGNORECASE) + if not m: + raise ValueError('HTTP date is not correctly formatted') + + try: + tz = m.group('TZ').upper() + except: + tz = 'GMT' + if tz not in ('GMT','UTC','0000','00:00'): + raise ValueError('HTTP date is not in GMT timezone') + + monname = m.group('MON').upper() + mdict = {'JAN':1, 'FEB':2, 'MAR':3, 'APR':4, 'MAY':5, 'JUN':6, + 'JUL':7, 'AUG':8, 'SEP':9, 'OCT':10, 'NOV':11, 'DEC':12} + month = mdict.get(monname) + if not month: + raise ValueError('HTTP date has an unrecognizable month') + y = int(m.group('Y')) + if y < 100: + century = datetime.datetime.utcnow().year / 100 + if y < 50: + y = century * 100 + y + else: + y = (century - 1) * 100 + y + d = int(m.group('D')) + hour = int(m.group('H')) + minute = int(m.group('M')) + try: + second = int(m.group('S')) + except: + second = float(m.group('S')) + dt = datetime.datetime( y, month, d, hour, minute, second, tzinfo=utc_tzinfo ) + return dt + + +class RangeUnsatisfiableError(ValueError): + """Exception class when a byte range lies outside the file size boundaries.""" + def __init__(self, reason=None): + if not reason: + reason = 'Range is unsatisfiable' + ValueError.__init__(self, reason) + + +class RangeUnmergableError(ValueError): + """Exception class when byte ranges are noncontiguous and can not be merged together.""" + def __init__(self, reason=None): + if not reason: + reason = 'Ranges can not be merged together' + ValueError.__init__(self, reason) + + +class ParseError(ValueError): + """Exception class representing a string parsing error.""" + def __init__(self, args, input_string, at_position): + ValueError.__init__(self, args) + self.input_string = input_string + self.at_position = at_position + def __str__(self): + if self.at_position >= len(self.input_string): + return '%s\n\tOccured at end of string' % self.args[0] + else: + return '%s\n\tOccured near %s' % (self.args[0], repr(self.input_string[self.at_position:self.at_position+16])) + + +def is_token(s): + """Determines if the string is a valid token.""" + for c in s: + if ord(c) < 32 or ord(c) > 128 or c in SEPARATORS: + return False + return True + + +def parse_comma_list(s, start=0, element_parser=None, min_count=0, max_count=0): + """Parses a comma-separated list with optional whitespace. + + Takes an optional callback function `element_parser`, which + is assumed to be able to parse an individual element. It + will be passed the string and a `start` argument, and + is expected to return a tuple (parsed_result, chars_consumed). + + If no element_parser is given, then either single tokens or + quoted strings will be parsed. + + If min_count > 0, then at least that many non-empty elements + must be in the list, or an error is raised. + + If max_count > 0, then no more than that many non-empty elements + may be in the list, or an error is raised. + + """ + if min_count > 0 and start == len(s): + raise ParseError('Comma-separated list must contain some elements',s,start) + elif start >= len(s): + raise ParseError('Starting position is beyond the end of the string',s,start) + + if not element_parser: + element_parser = parse_token_or_quoted_string + results = [] + pos = start + while pos < len(s): + e = element_parser( s, pos ) + if not e or e[1] == 0: + break # end of data? 
+ else: + results.append( e[0] ) + pos += e[1] + while pos < len(s) and s[pos] in LWS: + pos += 1 + if pos < len(s) and s[pos] != ',': + break + while pos < len(s) and s[pos] == ',': + # skip comma and any "empty" elements + pos += 1 # skip comma + while pos < len(s) and s[pos] in LWS: + pos += 1 + if len(results) < min_count: + raise ParseError('Comma-separated list does not have enough elements',s,pos) + elif max_count and len(results) > max_count: + raise ParseError('Comma-separated list has too many elements',s,pos) + return (results, pos-start) + + +def parse_token(s, start=0): + """Parses a token. + + A token is a string defined by RFC 2616 section 2.2 as: + token = 1*<any CHAR except CTLs or separators> + + Returns a tuple (token, chars_consumed), or ('',0) if no token + starts at the given string position. On a syntax error, a + ParseError exception will be raised. + + """ + return parse_token_or_quoted_string(s, start, allow_quoted=False, allow_token=True) + + +def quote_string(s, always_quote=True): + """Produces a quoted string according to HTTP 1.1 rules. + + If always_quote is False and if the string is also a valid token, + then this function may return a string without quotes. + + """ + need_quotes = False + q = '' + for c in s: + if ord(c) < 32 or ord(c) > 127 or c in SEPARATORS: + q += '\\' + c + need_quotes = True + else: + q += c + if need_quotes or always_quote: + return '"' + q + '"' + else: + return q + + +def parse_quoted_string(s, start=0): + """Parses a quoted string. + + Returns a tuple (string, chars_consumed). The quote marks will + have been removed and all \-escapes will have been replaced with + the characters they represent. + + """ + return parse_token_or_quoted_string(s, start, allow_quoted=True, allow_token=False) + + +def parse_token_or_quoted_string(s, start=0, allow_quoted=True, allow_token=True): + """Parses a token or a quoted-string. + + 's' is the string to parse, while start is the position within the + string where parsing should begin. It will returns a tuple + (token, chars_consumed), with all \-escapes and quotation already + processed. + + Syntax is according to BNF rules in RFC 2161 section 2.2, + specifically the 'token' and 'quoted-string' declarations. + Syntax errors in the input string will result in ParseError + being raised. + + If allow_quoted is False, then only tokens will be parsed instead + of either a token or quoted-string. + + If allow_token is False, then only quoted-strings will be parsed + instead of either a token or quoted-string. + """ + if not allow_quoted and not allow_token: + raise ValueError('Parsing can not continue with options provided') + + if start >= len(s): + raise ParseError('Starting position is beyond the end of the string',s,start) + has_quote = (s[start] == '"') + if has_quote and not allow_quoted: + raise ParseError('A quoted string was not expected', s, start) + if not has_quote and not allow_token: + raise ParseError('Expected a quotation mark', s, start) + + s2 = '' + pos = start + if has_quote: + pos += 1 + while pos < len(s): + c = s[pos] + if c == '\\' and has_quote: + # Note this is NOT C-style escaping; the character after the \ is + # taken literally. 
+ pos += 1 + if pos == len(s): + raise ParseError("End of string while expecting a character after '\\'",s,pos) + s2 += s[pos] + pos += 1 + elif c == '"' and has_quote: + break + elif not has_quote and (c in SEPARATORS or ord(c)<32 or ord(c)>127): + break + else: + s2 += c + pos += 1 + if has_quote: + # Make sure we have a closing quote mark + if pos >= len(s) or s[pos] != '"': + raise ParseError('Quoted string is missing closing quote mark',s,pos) + else: + pos += 1 + return s2, (pos - start) + + +def remove_comments(s, collapse_spaces=True): + """Removes any ()-style comments from a string. + + In HTTP, ()-comments can nest, and this function will correctly + deal with that. + + If 'collapse_spaces' is True, then if there is any whitespace + surrounding the comment, it will be replaced with a single space + character. Whitespace also collapses across multiple comment + sequences, so that "a (b) (c) d" becomes just "a d". + + Otherwise, if 'collapse_spaces' is False then all whitespace which + is outside any comments is left intact as-is. + + """ + if '(' not in s: + return s # simple case + A = [] + dostrip = False + added_comment_space = False + pos = 0 + if collapse_spaces: + # eat any leading spaces before a comment + i = s.find('(') + if i >= 0: + while pos < i and s[pos] in LWS: + pos += 1 + if pos != i: + pos = 0 + else: + dostrip = True + added_comment_space = True # lie + while pos < len(s): + if s[pos] == '(': + cmt, k = parse_comment( s, pos ) + pos += k + if collapse_spaces: + dostrip = True + if not added_comment_space: + if len(A) > 0 and A[-1] and A[-1][-1] in LWS: + # previous part ended with whitespace + A[-1] = A[-1].rstrip() + A.append(' ') # comment becomes one space + added_comment_space = True + else: + i = s.find( '(', pos ) + if i == -1: + if dostrip: + text = s[pos:].lstrip() + if s[pos] in LWS and not added_comment_space: + A.append(' ') + added_comment_space = True + else: + text = s[pos:] + if text: + A.append(text) + dostrip = False + added_comment_space = False + break # end of string + else: + if dostrip: + text = s[pos:i].lstrip() + if s[pos] in LWS and not added_comment_space: + A.append(' ') + added_comment_space = True + else: + text = s[pos:i] + if text: + A.append(text) + dostrip = False + added_comment_space = False + pos = i + if dostrip and len(A) > 0 and A[-1] and A[-1][-1] in LWS: + A[-1] = A[-1].rstrip() + return ''.join(A) + + +def _test_comments(): + """A self-test on comment processing. 
Returns number of test failures.""" + def _testrm( a, b, collapse ): + b2 = remove_comments( a, collapse ) + if b != b2: + print( 'Comment test failed:' ) + print( ' remove_comments( %s, collapse_spaces=%s ) -> %s' % (repr(a), repr(collapse), repr(b2)) ) + print( ' expected %s' % repr(b) ) + return 1 + return 0 + failures = 0 + failures += _testrm( r'', '', False ) + failures += _testrm( r'(hello)', '', False) + failures += _testrm( r'abc (hello) def', 'abc def', False) + failures += _testrm( r'abc (he(xyz)llo) def', 'abc def', False) + failures += _testrm( r'abc (he\(xyz)llo) def', 'abc llo) def', False) + failures += _testrm( r'abc(hello)def', 'abcdef', True) + failures += _testrm( r'abc (hello) def', 'abc def', True) + failures += _testrm( r'abc (hello)def', 'abc def', True) + failures += _testrm( r'abc(hello) def', 'abc def', True) + failures += _testrm( r'abc(hello) (world)def', 'abc def', True) + failures += _testrm( r'abc(hello)(world)def', 'abcdef', True) + failures += _testrm( r' (hello) (world) def', 'def', True) + failures += _testrm( r'abc (hello) (world) ', 'abc', True) + return failures + +def parse_comment(s, start=0): + """Parses a ()-style comment from a header value. + + Returns tuple (comment, chars_consumed), where the comment will + have had the outer-most parentheses and white space stripped. Any + nested comments will still have their parentheses and whitespace + left intact. + + All \-escaped quoted pairs will have been replaced with the actual + characters they represent, even within the inner nested comments. + + You should note that only a few HTTP headers, such as User-Agent + or Via, allow ()-style comments within the header value. + + A comment is defined by RFC 2616 section 2.2 as: + + comment = "(" *( ctext | quoted-pair | comment ) ")" + ctext = <any TEXT excluding "(" and ")"> + """ + if start >= len(s): + raise ParseError('Starting position is beyond the end of the string',s,start) + if s[start] != '(': + raise ParseError('Comment must begin with opening parenthesis',s,start) + + s2 = '' + nestlevel = 1 + pos = start + 1 + while pos < len(s) and s[pos] in LWS: + pos += 1 + + while pos < len(s): + c = s[pos] + if c == '\\': + # Note this is not C-style escaping; the character after the \ is + # taken literally. + pos += 1 + if pos == len(s): + raise ParseError("End of string while expecting a character after '\\'",s,pos) + s2 += s[pos] + pos += 1 + elif c == '(': + nestlevel += 1 + s2 += c + pos += 1 + elif c == ')': + nestlevel -= 1 + pos += 1 + if nestlevel >= 1: + s2 += c + else: + break + else: + s2 += c + pos += 1 + if nestlevel > 0: + raise ParseError('End of string reached before comment was closed',s,pos) + # Now rstrip s2 of all LWS chars. + while len(s2) and s2[-1] in LWS: + s2 = s2[:-1] + return s2, (pos - start) + + +class range_spec(object): + """A single contiguous (byte) range. + + A range_spec defines a range (of bytes) by specifying two offsets, + the 'first' and 'last', which are inclusive in the range. Offsets + are zero-based (the first byte is offset 0). The range can not be + empty or negative (has to satisfy first <= last). + + The range can be unbounded on either end, represented here by the + None value, with these semantics: + + * A 'last' of None always indicates the last possible byte + (although that offset may not be known). + + * A 'first' of None indicates this is a suffix range, where + the last value is actually interpreted to be the number + of bytes at the end of the file (regardless of file size). 
+
+    Note that it is not valid for both first and last to be None.
+
+    """
+
+    __slots__ = ['first','last']
+
+    def __init__(self, first=0, last=None):
+        self.set( first, last )
+
+    def set(self, first, last):
+        """Sets the value of this range given the first and last offsets.
+        """
+        if first is not None and last is not None and first > last:
+            raise ValueError("Byte range does not satisfy first <= last.")
+        elif first is None and last is None:
+            raise ValueError("Byte range can not omit both first and last offsets.")
+        self.first = first
+        self.last = last
+
+    def __repr__(self):
+        return '%s.%s(%s,%s)' % (self.__class__.__module__, self.__class__.__name__,
+                                 self.first, self.last)
+
+    def __str__(self):
+        """Returns a string form of the range as would appear in a Range: header."""
+        if self.first is None and self.last is None:
+            return ''
+        s = ''
+        if self.first is not None:
+            s += '%d' % self.first
+        s += '-'
+        if self.last is not None:
+            s += '%d' % self.last
+        return s
+
+    def __eq__(self, other):
+        """Compare ranges for equality.
+
+        Note that if non-specific ranges are involved (such as 34- and -5),
+        they could compare as not equal even though they may represent
+        the same set of bytes in some contexts.
+        """
+        return self.first == other.first and self.last == other.last
+
+    def __ne__(self, other):
+        """Compare ranges for inequality.
+
+        Note that if non-specific ranges are involved (such as 34- and -5),
+        they could compare as not equal even though they may represent
+        the same set of bytes in some contexts.
+        """
+        return not self.__eq__(other)
+
+    def __lt__(self, other):
+        """< operator is not defined"""
+        raise NotImplementedError('Ranges can not be relationally compared')
+    def __le__(self, other):
+        """<= operator is not defined"""
+        raise NotImplementedError('Ranges can not be relationally compared')
+    def __gt__(self, other):
+        """> operator is not defined"""
+        raise NotImplementedError('Ranges can not be relationally compared')
+    def __ge__(self, other):
+        """>= operator is not defined"""
+        raise NotImplementedError('Ranges can not be relationally compared')
+
+    def copy(self):
+        """Makes a copy of this range object."""
+        return self.__class__( self.first, self.last )
+
+    def is_suffix(self):
+        """Returns True if this is a suffix range.
+
+        A suffix range is one that specifies the last N bytes of a
+        file regardless of file size.
+
+        """
+        return self.first is None
+
+    def is_fixed(self):
+        """Returns True if this range is absolute and a fixed size.
+
+        This occurs only if neither first or last is None.  Converse
+        is the is_unbounded() method.
+
+        """
+        return self.first is not None and self.last is not None
+
+    def is_unbounded(self):
+        """Returns True if the number of bytes in the range is unspecified.
+
+        This can only occur if either the 'first' or the 'last' member
+        is None.  Converse is the is_fixed() method.
+
+        """
+        return self.first is None or self.last is None
+
+    def is_whole_file(self):
+        """Returns True if this range includes all possible bytes.
+
+        This can only occur if the 'last' member is None and the first
+        member is 0.
+
+        """
+        return self.first == 0 and self.last is None
+
+    def __contains__(self, offset):
+        """Does this byte range contain the given byte offset?
+
+        If the offset < 0, then it is taken as an offset from the end
+        of the file, where -1 is the last byte.  This type of offset
+        will only work with suffix ranges.
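+
+        For example (a sketch of the intended semantics):
+            5 in range_spec(0, 10)       # True
+            -3 in range_spec(None, 5)    # True: within the last 5 bytes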
+
+        """
+        if offset < 0:
+            if self.first is not None:
+                return False
+            else:
+                return self.last >= -offset
+        elif self.first is None:
+            return False
+        elif self.last is None:
+            return True
+        else:
+            return self.first <= offset <= self.last
+
+    def fix_to_size(self, size):
+        """Changes a length-relative range to an absolute range based upon given file size.
+
+        Ranges that are already absolute are left as is.
+
+        Note that zero-length files are handled as special cases,
+        since the only way possible to specify a zero-length range is
+        with the suffix range "-0".  Thus unless this range is a suffix
+        range, it can not satisfy a zero-length file.
+
+        If the resulting range (partly) lies outside the file size then an
+        error is raised.
+        """
+
+        if size == 0:
+            if self.first is None:
+                self.last = 0
+                return
+            else:
+                raise RangeUnsatisfiableError("Range can not satisfy a zero-length file.")
+
+        if self.first is None:
+            # A suffix range
+            self.first = size - self.last
+            if self.first < 0:
+                self.first = 0
+            self.last = size - 1
+        else:
+            if self.first > size - 1:
+                raise RangeUnsatisfiableError('Range begins beyond the file size.')
+            else:
+                if self.last is None:
+                    # An unbounded range
+                    self.last = size - 1
+        return
+
+    def merge_with(self, other):
+        """Tries to merge the given range into this one.
+
+        The size of this range may be enlarged as a result.
+
+        An error is raised if the two ranges do not overlap or are not
+        contiguous with each other.
+        """
+        if self.is_whole_file() or self == other:
+            return
+        elif other.is_whole_file():
+            self.first, self.last = 0, None
+            return
+
+        a1, z1 = self.first, self.last
+        a2, z2 = other.first, other.last
+
+        if self.is_suffix():
+            if z1 == 0: # self is zero-length, so merge becomes a copy
+                self.first, self.last = a2, z2
+                return
+            elif other.is_suffix():
+                self.last = max(z1, z2)
+                return # both are suffix ranges; nothing more to do
+            else:
+                raise RangeUnmergableError()
+        elif other.is_suffix():
+            if z2 == 0: # other is zero-length, so nothing to merge
+                return
+            else:
+                raise RangeUnmergableError()
+
+        assert a1 is not None and a2 is not None
+
+        if a2 < a1:
+            # swap ranges so a1 <= a2
+            a1, z1, a2, z2 = a2, z2, a1, z1
+
+        assert a1 <= a2
+
+        if z1 is None:
+            if z2 is not None and z2 + 1 < a1:
+                raise RangeUnmergableError()
+            else:
+                self.first = min(a1, a2)
+                self.last = None
+        elif z2 is None:
+            if z1 + 1 < a2:
+                raise RangeUnmergableError()
+            else:
+                self.first = min(a1, a2)
+                self.last = None
+        else:
+            if a2 > z1 + 1:
+                raise RangeUnmergableError()
+            else:
+                self.first = a1
+                self.last = max(z1, z2)
+        return
+
+
+class range_set(object):
+    """A collection of range_specs, with units (e.g., bytes).
+    """
+    __slots__ = ['units', 'range_specs']
+
+    def __init__(self):
+        self.units = 'bytes'
+        self.range_specs = [] # a list of range_spec objects
+
+    def __str__(self):
+        return self.units + '=' + ', '.join([str(s) for s in self.range_specs])
+
+    def __repr__(self):
+        return '%s.%s(%s)' % (self.__class__.__module__,
+                              self.__class__.__name__,
+                              repr(self.__str__()) )
+
+    def from_str(self, s, valid_units=('bytes','none')):
+        """Sets this range set based upon a string, such as the Range: header.
+
+        You can also use the parse_range_set() function for more control.
+
+        If a parsing error occurs, the pre-existing value of this range
+        set is left unchanged.
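+
+        Example (a sketch):
+            rs = range_set()
+            rs.from_str('bytes=0-499,1000-')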
+
+        """
+        r, k = parse_range_set( s, valid_units=valid_units )
+        if k < len(s):
+            raise ParseError("Extra unparsable characters in range set specifier",s,k)
+        self.units = r.units
+        self.range_specs = r.range_specs
+
+    def is_single_range(self):
+        """Does this range specifier consist of only a single range set?"""
+        return len(self.range_specs) == 1
+
+    def is_contiguous(self):
+        """Can the collection of range_specs be coalesced into a single contiguous range?"""
+        if len(self.range_specs) <= 1:
+            return True
+        merged = self.range_specs[0].copy()
+        for s in self.range_specs[1:]:
+            try:
+                merged.merge_with(s)
+            except RangeUnmergableError:
+                return False
+        return True
+
+    def fix_to_size(self, size):
+        """Changes all length-relative range_specs to absolute range_specs based upon given file size.
+
+        If none of the range_specs in this set can be satisfied, then the
+        entire set is considered unsatisfiable and an error is raised.
+        Otherwise any unsatisfiable range_specs will simply be removed
+        from this set.
+
+        """
+        for i in range(len(self.range_specs)):
+            try:
+                self.range_specs[i].fix_to_size( size )
+            except RangeUnsatisfiableError:
+                self.range_specs[i] = None
+        self.range_specs = [s for s in self.range_specs if s is not None]
+        if len(self.range_specs) == 0:
+            raise RangeUnsatisfiableError('No ranges can be satisfied')
+
+    def coalesce(self):
+        """Collapses all consecutive range_specs which together define a contiguous range.
+
+        Note though that this method will not re-sort the range_specs, so a
+        potentially contiguous range may not be collapsed if they are
+        not sorted.  For example the ranges:
+            10-20, 30-40, 20-30
+        will not be collapsed to just 10-40.  However if the ranges are
+        sorted first as with:
+            10-20, 20-30, 30-40
+        then they will collapse to 10-40.
+        """
+        if len(self.range_specs) <= 1:
+            return
+        for i in range(len(self.range_specs) - 1):
+            a = self.range_specs[i]
+            b = self.range_specs[i+1]
+            if a is not None:
+                try:
+                    a.merge_with( b )
+                    # keep the merged result in the later slot so that it can
+                    # pick up further contiguous ranges on the next iteration
+                    self.range_specs[i+1] = a
+                    self.range_specs[i] = None # to be deleted later
+                except RangeUnmergableError:
+                    pass
+        self.range_specs = [r for r in self.range_specs if r is not None]
+
+
+def parse_number( s, start=0 ):
+    """Parses a positive decimal integer number from the string.
+
+    A tuple is returned (number, chars_consumed).  If the
+    string is not a valid decimal number, then (None,0) is returned.
+    """
+    if start >= len(s):
+        raise ParseError('Starting position is beyond the end of the string',s,start)
+    if s[start] not in DIGIT:
+        return (None,0) # not a number
+    pos = start
+    n = 0
+    while pos < len(s):
+        c = s[pos]
+        if c in DIGIT:
+            n *= 10
+            n += ord(c) - ord('0')
+            pos += 1
+        else:
+            break
+    return n, pos-start
+
+
+def parse_range_spec( s, start=0 ):
+    """Parses a (byte) range_spec.
+
+    Returns a tuple (range_spec, chars_consumed).
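+
+    For example (a sketch): "500-999" would yield (range_spec(500,999), 7),
+    and the suffix form "-500" would yield (range_spec(None,500), 4).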
+    """
+    if start >= len(s):
+        raise ParseError('Starting position is beyond the end of the string',s,start)
+    if s[start] not in DIGIT and s[start] != '-':
+        raise ParseError("Invalid range, expected a digit or '-'",s,start)
+    first, last = None, None
+    pos = start
+    first, k = parse_number( s, pos )
+    pos += k
+    if pos < len(s) and s[pos] == '-':
+        pos += 1
+        if pos < len(s):
+            last, k = parse_number( s, pos )
+            pos += k
+    else:
+        raise ParseError("Byte range must include a '-'",s,pos)
+    if first is None and last is None:
+        raise ParseError('Byte range can not omit both first and last indices.',s,start)
+    R = range_spec( first, last )
+    return R, pos-start
+
+
+def parse_range_header( header_value, valid_units=('bytes','none') ):
+    """Parses the value of an HTTP Range: header.
+
+    The value of the header as a string should be passed in; without
+    the header name itself.
+
+    Returns a range_set object.
+    """
+    ranges, k = parse_range_set( header_value, valid_units=valid_units )
+    if k < len(header_value):
+        raise ParseError('Range header has unexpected or unparsable characters',
+                         header_value, k)
+    return ranges
+
+
+def parse_range_set( s, start=0, valid_units=('bytes','none') ):
+    """Parses a (byte) range set specifier.
+
+    Returns a tuple (range_set, chars_consumed).
+    """
+    if start >= len(s):
+        raise ParseError('Starting position is beyond the end of the string',s,start)
+    pos = start
+    units, k = parse_token( s, pos )
+    pos += k
+    if valid_units and units not in valid_units:
+        raise ParseError('Unsupported units type in range specifier',s,start)
+    while pos < len(s) and s[pos] in LWS:
+        pos += 1
+    if pos < len(s) and s[pos] == '=':
+        pos += 1
+    else:
+        raise ParseError("Invalid range specifier, expected '='",s,pos)
+    while pos < len(s) and s[pos] in LWS:
+        pos += 1
+    range_specs, k = parse_comma_list( s, pos, parse_range_spec, min_count=1 )
+    pos += k
+    # Make sure no trash is at the end of the string
+    while pos < len(s) and s[pos] in LWS:
+        pos += 1
+    if pos < len(s):
+        raise ParseError('Unparsable characters in range set specifier',s,pos)
+
+    ranges = range_set()
+    ranges.units = units
+    ranges.range_specs = range_specs
+    return ranges, pos-start
+
+
+def _split_at_qfactor( s ):
+    """Splits a string at the quality factor (;q=) parameter.
+
+    Returns the left and right substrings as a two-member tuple.
+
+    """
+    # It may be faster, but incorrect, to use s.split(';q=',1), since
+    # HTTP allows any amount of linear white space (LWS) to appear
+    # between the parts, so it could also be "; q = ".
+
+    # We do this parsing 'manually' for speed rather than using a
+    # regex, which would be r';[ \t\r\n]*q[ \t\r\n]*=[ \t\r\n]*'
+
+    pos = 0
+    while 0 <= pos < len(s):
+        pos = s.find(';', pos)
+        if pos < 0:
+            break # no more parameters
+        startpos = pos
+        pos = pos + 1
+        while pos < len(s) and s[pos] in LWS:
+            pos = pos + 1
+        if pos < len(s) and s[pos] == 'q':
+            pos = pos + 1
+            while pos < len(s) and s[pos] in LWS:
+                pos = pos + 1
+            if pos < len(s) and s[pos] == '=':
+                pos = pos + 1
+                while pos < len(s) and s[pos] in LWS:
+                    pos = pos + 1
+                return ( s[:startpos], s[pos:] )
+    return (s, '')
+
+
+def parse_qvalue_accept_list( s, start=0, item_parser=parse_token ):
+    """Parses any of the Accept-* style headers with quality factors.
+
+    This is a low-level function.  It returns a list of tuples, each like:
+        (item, item_parms, qvalue, accept_parms)
+
+    You can pass in a function which parses each of the item strings, or
+    accept the default where the items must be simple tokens.
+    Note that your parser should not consume any parameters (past the
+    special "q" parameter anyway).
+
+    The item_parms and accept_parms are each lists of (name,value) tuples.
+
+    The qvalue is the quality factor, a number from 0 to 1 inclusive.
+
+    """
+    itemlist = []
+    pos = start
+    if pos >= len(s):
+        raise ParseError('Starting position is beyond the end of the string',s,pos)
+    item = None
+    while pos < len(s):
+        item, k = item_parser(s, pos)
+        pos += k
+        while pos < len(s) and s[pos] in LWS:
+            pos += 1
+        if pos >= len(s) or s[pos] in ',;':
+            itemparms, qvalue, acptparms = [], None, []
+            if pos < len(s) and s[pos] == ';':
+                pos += 1
+                while pos < len(s) and s[pos] in LWS:
+                    pos += 1
+                parmlist, k = parse_parameter_list(s, pos)
+                for p, v in parmlist:
+                    if p == 'q' and qvalue is None:
+                        try:
+                            qvalue = float(v)
+                        except ValueError:
+                            raise ParseError('qvalue must be a floating point number',s,pos)
+                        if qvalue < 0 or qvalue > 1:
+                            raise ParseError('qvalue must be between 0 and 1, inclusive',s,pos)
+                    elif qvalue is None:
+                        itemparms.append( (p,v) )
+                    else:
+                        acptparms.append( (p,v) )
+                pos += k
+            if item:
+                # Add the item to the list
+                if qvalue is None:
+                    qvalue = 1
+                itemlist.append( (item, itemparms, qvalue, acptparms) )
+                item = None
+            # skip commas
+            while pos < len(s) and s[pos] == ',':
+                pos += 1
+                while pos < len(s) and s[pos] in LWS:
+                    pos += 1
+        else:
+            break
+    return itemlist, pos - start
+
+
+def parse_accept_header( header_value ):
+    """Parses the Accept: header.
+
+    The value of the header as a string should be passed in; without
+    the header name itself.
+
+    This will parse the value of any of the HTTP headers "Accept",
+    "Accept-Charset", "Accept-Encoding", or "Accept-Language".  These
+    headers are similarly formatted, in that they are a list of items
+    with associated quality factors.  The quality factor, or qvalue,
+    is a number in the range [0.0..1.0] which indicates the relative
+    preference of each item.
+
+    This function returns a list of those items.  Note that the list is
+    kept in the same order as the header; it is not sorted by
+    preference, so callers should rank by qvalue as needed.  Each item
+    in the returned list is a tuple consisting of:
+
+        ( content_type, qvalue, accept_parms )
+
+    As an example, the following string,
+        text/plain; charset="utf-8"; q=.5; columns=80
+    would be parsed into this resulting tuple,
+        ( content_type('text/plain; charset=utf-8'), 0.5, [('columns','80')] )
+
+    The value of the returned item depends upon which header is
+    being parsed, but for example it may be a MIME content or media
+    type (without parameters), a language tag, or so on.  Any optional
+    parameters (delimited by semicolons) occurring before the "q="
+    attribute will have been folded into the returned content_type
+    object, in the same order as they appear in the header.  Any quoted
+    values will have been unquoted and unescaped.
+
+    The qvalue is a floating point number in the inclusive range 0.0
+    to 1.0, and roughly indicates the preference for this item.
+    Values outside this range will result in a ParseError.
+
+        (!) Note that a qvalue of 0 indicates that the item is
+        explicitly NOT acceptable to the user agent, and should be
+        handled differently by the caller.
+
+    The accept_parms is a list of any attributes
+    occurring after the "q=" attribute, and will be in the list as
+    (attribute,value) tuples in the same order as they occur.
+    Usually accept_parms will be an empty list, as the HTTP spec
+    allows these extra parameters in the syntax but does not
+    currently define any possible values.
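+
+    For example (a sketch; the returned items are content_type objects,
+    see that class below):
+        parse_accept_header('text/html, text/plain; q=0.5')
+    would yield an item for 'text/html' with qvalue 1 and an item for
+    'text/plain' with qvalue 0.5, in that order.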
+
+    All empty items will be removed from the list.  However, duplicate
+    or conflicting values are not detected or handled in any way by
+    this function.
+    """
+    def parse_mt_only(s, start):
+        mt, k = parse_media_type(s, start, with_parameters=False)
+        ct = content_type()
+        ct.major = mt[0]
+        ct.minor = mt[1]
+        return ct, k
+
+    alist, k = parse_qvalue_accept_list( header_value, item_parser=parse_mt_only )
+    if k < len(header_value):
+        raise ParseError('Accept header is invalid',header_value,k)
+
+    ctlist = []
+    for ct, ctparms, q, acptparms in alist:
+        if ctparms:
+            ct.set_parameters( dict(ctparms) )
+        ctlist.append( (ct, q, acptparms) )
+    return ctlist
+
+
+def parse_media_type(media_type, start=0, with_parameters=True):
+    """Parses a media type (MIME type) designator into its parts.
+
+    Given a media type string, returns a nested tuple of its parts.
+
+        ((major,minor,parmlist), chars_consumed)
+
+    where parmlist is a list of tuples of (parm_name, parm_value).
+    Quoted-values are appropriately unquoted and unescaped.
+
+    If 'with_parameters' is False, then parsing will stop immediately
+    after the minor media type; and will not proceed to parse any
+    of the semicolon-separated parameters.
+
+    Examples:
+        image/png -> (('image','png',[]), 9)
+        text/plain; charset="utf-16be"
+                  -> (('text','plain',[('charset','utf-16be')]), 30)
+
+    """
+
+    s = media_type
+    pos = start
+    ctmaj, k = parse_token(s, pos)
+    if k == 0:
+        raise ParseError('Media type must be of the form "major/minor".', s, pos)
+    pos += k
+    if pos >= len(s) or s[pos] != '/':
+        raise ParseError('Media type must be of the form "major/minor".', s, pos)
+    pos += 1
+    ctmin, k = parse_token(s, pos)
+    if k == 0:
+        raise ParseError('Media type must be of the form "major/minor".', s, pos)
+    pos += k
+    if with_parameters:
+        parmlist, k = parse_parameter_list(s, pos)
+        pos += k
+    else:
+        parmlist = []
+    return ((ctmaj, ctmin, parmlist), pos - start)
+
+
+def parse_parameter_list(s, start=0):
+    """Parses a semicolon-separated 'parameter=value' list.
+
+    Returns a tuple (parmlist, chars_consumed), where parmlist
+    is a list of tuples (parm_name, parm_value).
+
+    The parameter values will be unquoted and unescaped as needed.
+
+    Empty parameters (as in ";;") are skipped, as is insignificant
+    white space.  The list returned is kept in the same order as the
+    parameters appear in the string.
+
+    """
+    pos = start
+    parmlist = []
+    while pos < len(s):
+        while pos < len(s) and s[pos] in LWS:
+            pos += 1 # skip whitespace
+        if pos < len(s) and s[pos] == ';':
+            pos += 1
+            while pos < len(s) and s[pos] in LWS:
+                pos += 1 # skip whitespace
+        if pos >= len(s):
+            break
+        parmname, k = parse_token(s, pos)
+        if parmname:
+            pos += k
+            while pos < len(s) and s[pos] in LWS:
+                pos += 1 # skip whitespace
+            if not (pos < len(s) and s[pos] == '='):
+                raise ParseError('Expected an "=" after parameter name', s, pos)
+            pos += 1
+            while pos < len(s) and s[pos] in LWS:
+                pos += 1 # skip whitespace
+            parmval, k = parse_token_or_quoted_string( s, pos )
+            pos += k
+            parmlist.append( (parmname, parmval) )
+        else:
+            break
+    return parmlist, pos - start
+
+
+class content_type(object):
+    """This class represents a media type (aka a MIME content type), including parameters.
+
+    You initialize these by passing in a content-type declaration
+    string, such as "text/plain; charset=ascii", to the constructor or
+    to the set() method.  If you provide no string value, the object
+    returned will represent the wildcard */* content type.
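+
+    For example (a sketch):
+        ct = content_type('text/html; charset=utf-8')
+        ct.major    -> 'text'
+        ct.minor    -> 'html'
+        ct.parmdict -> {'charset': 'utf-8'}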
+
+    Normally you will get the value back by using str(), or optionally
+    you can access the components via the 'major', 'minor', 'media_type',
+    or 'parmdict' members.
+
+    """
+    def __init__(self, content_type_string=None, with_parameters=True):
+        """Create a new content_type object.
+
+        See the set() method for a description of the arguments.
+        """
+        if content_type_string:
+            self.set( content_type_string, with_parameters=with_parameters )
+        else:
+            self.set( '*/*' )
+
+    def set_parameters(self, parameter_list_or_dict):
+        """Sets the optional parameters based upon the parameter list.
+
+        The parameter list should be a semicolon-separated name=value string.
+        Any parameters which already exist on this object will be deleted,
+        unless they appear in the given parameter_list_or_dict.
+
+        """
+        if isinstance(parameter_list_or_dict, dict):
+            # already a dictionary
+            pl = parameter_list_or_dict
+        else:
+            pl, k = parse_parameter_list(parameter_list_or_dict)
+            if k < len(parameter_list_or_dict):
+                raise ParseError('Invalid parameter list',parameter_list_or_dict,k)
+        self.parmdict = dict(pl)
+
+    def set(self, content_type_string, with_parameters=True):
+        """Parses the content type string and sets this object to its value.
+
+        For a more complete description of the arguments, see the
+        documentation for the parse_media_type() function in this module.
+        """
+        mt, k = parse_media_type( content_type_string, with_parameters=with_parameters )
+        if k < len(content_type_string):
+            raise ParseError('Not a valid content type',content_type_string, k)
+        major, minor, pdict = mt
+        self._set_major( major )
+        self._set_minor( minor )
+        self.parmdict = dict(pdict)
+
+    def _get_major(self):
+        return self._major
+    def _set_major(self, s):
+        s = s.lower() # case-insensitive
+        if not is_token(s):
+            raise ValueError('Major media type contains an invalid character')
+        self._major = s
+
+    def _get_minor(self):
+        return self._minor
+    def _set_minor(self, s):
+        s = s.lower() # case-insensitive
+        if not is_token(s):
+            raise ValueError('Minor media type contains an invalid character')
+        self._minor = s
+
+    major = property(_get_major,_set_major,doc="Major media classification")
+    minor = property(_get_minor,_set_minor,doc="Minor media sub-classification")
+
+    def __str__(self):
+        """String value."""
+        s = '%s/%s' % (self.major, self.minor)
+        if self.parmdict:
+            extra = '; '.join([ '%s=%s' % (a[0],quote_string(a[1],False)) for a in self.parmdict.items()])
+            s += '; ' + extra
+        return s
+
+    def __unicode__(self):
+        """Unicode string value."""
+        # In Python 3 this is probably unnecessary in general, this is just to avoid possible syntax issues. I.H.
+        if PY3 :
+            return str(self.__str__())
+        else :
+            return unicode(self.__str__())
+
+    def __repr__(self):
+        """Python representation of this object."""
+        s = '%s(%s)' % (self.__class__.__name__, repr(self.__str__()))
+        return s
+
+
+    def __hash__(self):
+        """Hash this object; the hash is dependent only upon the value."""
+        return hash(str(self))
+
+    def __getstate__(self):
+        """Pickler"""
+        return str(self)
+
+    def __setstate__(self, state):
+        """Unpickler"""
+        self.set(state)
+
+    def __len__(self):
+        """Logical length of this media type.
+
+        For example:
+            len('*/*') -> 0
+            len('image/*') -> 1
+            len('image/png') -> 2
+            len('text/plain; charset=utf-8') -> 3
+            len('text/plain; charset=utf-8; filename=xyz.txt') -> 4
+
+        """
+        if self.major == '*':
+            return 0
+        elif self.minor == '*':
+            return 1
+        else:
+            return 2 + len(self.parmdict)
+
+    def __eq__(self, other):
+        """Equality test.
+
+        Note that this is an exact match, including any parameters if any.
+        """
+        return self.major == other.major and \
+               self.minor == other.minor and \
+               self.parmdict == other.parmdict
+
+    def __ne__(self, other):
+        """Inequality test."""
+        return not self.__eq__(other)
+
+    def _get_media_type(self):
+        """Returns the media 'type/subtype' string, without parameters."""
+        return '%s/%s' % (self.major, self.minor)
+
+    media_type = property(_get_media_type, doc="Returns just the media type 'type/subtype' without any parameters (read-only).")
+
+    def is_wildcard(self):
+        """Returns True if this is a 'something/*' media type.
+        """
+        return self.minor == '*'
+
+    def is_universal_wildcard(self):
+        """Returns True if this is the unspecified '*/*' media type.
+        """
+        return self.major == '*' and self.minor == '*'
+
+    def is_composite(self):
+        """Is this media type composed of multiple parts.
+        """
+        return self.major == 'multipart' or self.major == 'message'
+
+    def is_xml(self):
+        """Returns True if this media type is XML-based.
+
+        Note this does not consider text/html to be XML, but
+        application/xhtml+xml is.
+        """
+        return self.minor == 'xml' or self.minor.endswith('+xml')
+
+# Some common media types
+content_formdata   = content_type('multipart/form-data')
+content_urlencoded = content_type('application/x-www-form-urlencoded')
+content_byteranges = content_type('multipart/byteranges') # RFC 2616 sect 14.16
+content_opaque     = content_type('application/octet-stream')
+content_html       = content_type('text/html')
+content_xhtml      = content_type('application/xhtml+xml')
+
+
+def acceptable_content_type( accept_header, content_types, ignore_wildcard=True ):
+    """Determines if the given content type is acceptable to the user agent.
+
+    The accept_header should be the value present in the HTTP
+    "Accept:" header.  In mod_python this is typically obtained from
+    the req.http_headers_in table; in WSGI it is environ["Accept"];
+    other web frameworks may provide other methods of obtaining it.
+
+    Optionally the accept_header parameter can be pre-parsed, as
+    returned from the parse_accept_header() function in this module.
+
+    The content_types argument should either be a single MIME media
+    type string, or a sequence of them.  It represents the set of
+    content types that the caller (server) is willing to send.
+    Generally, the server content_types should not contain any
+    wildcarded values.
+
+    This function determines which content type is the most
+    preferred and is acceptable to both the user agent and the server.
+    If one is negotiated it will return a four-valued tuple like:
+
+        (server_content_type, ua_content_range, qvalue, accept_parms)
+
+    The first tuple value is one of the server's content_types, while
+    the remaining tuple values describe which of the client's
+    acceptable content_types was matched.  In most cases accept_parms
+    will be an empty list (see the description of parse_accept_header()
+    for more details).
+
+    If no content type could be negotiated, then this function will
+    return None (and the caller should typically cause an HTTP 406 Not
+    Acceptable as a response).
+
+    Note that the wildcarded content type "*/*" sent by the client
+    will be ignored, since it is often incorrectly sent by web
+    browsers that don't really mean it.  To override this, call with
+    ignore_wildcard=False.  Partial wildcards such as "image/*" will
+    always be processed, but be at a lower priority than a complete
+    matching type.
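+
+    For example (a sketch):
+        acceptable_content_type('text/*; q=0.5, text/html',
+                                ['text/html', 'text/plain'])
+    would negotiate 'text/html', since the exact match outranks the
+    'text/*' partial wildcard.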
+
+    See also: RFC 2616 section 14.1, and
+    <http://www.iana.org/assignments/media-types/>
+
+    """
+    if _is_string(accept_header):
+        accept_list = parse_accept_header(accept_header)
+    else:
+        accept_list = accept_header
+
+    if _is_string(content_types):
+        content_types = [content_types]
+
+    server_ctlist = [content_type(ct) for ct in content_types]
+
+    #print 'AC', repr(accept_list)
+    #print 'SV', repr(server_ctlist)
+
+    best = None   # (content_type, qvalue, accept_parms, matchlen)
+
+    for server_ct in server_ctlist:
+        best_for_this = None
+        for client_ct, qvalue, aargs in accept_list:
+            if ignore_wildcard and client_ct.is_universal_wildcard():
+                continue  # */* being ignored
+
+            matchlen = 0 # how specifically this one matches (0 is a non-match)
+            if client_ct.is_universal_wildcard():
+                matchlen = 1  # */* is a 1
+            elif client_ct.major == server_ct.major:
+                if client_ct.minor == '*':  # something/* is a 2
+                    matchlen = 2
+                elif client_ct.minor == server_ct.minor: # something/something is a 3
+                    matchlen = 3
+                    # must make sure all the parms match too
+                    for pname, pval in client_ct.parmdict.items():
+                        sval = server_ct.parmdict.get(pname)
+                        if pname == 'charset':
+                            # special case for charset to match aliases
+                            pval = canonical_charset(pval)
+                            sval = canonical_charset(sval)
+                        if sval == pval:
+                            matchlen = matchlen + 1
+                        else:
+                            matchlen = 0
+                            break
+                else:
+                    matchlen = 0
+
+            #print 'S',server_ct,' C',client_ct,' M',matchlen,'Q',qvalue
+            if matchlen > 0:
+                if not best_for_this \
+                        or matchlen > best_for_this[-1] \
+                        or (matchlen == best_for_this[-1] and qvalue > best_for_this[2]):
+                    # This match is better
+                    best_for_this = (server_ct, client_ct, qvalue, aargs, matchlen)
+                    #print 'BEST2 NOW', repr(best_for_this)
+        if not best or \
+               (best_for_this and best_for_this[2] > best[2]):
+            best = best_for_this
+            #print 'BEST NOW', repr(best)
+    if not best or best[2] <= 0:
+        return None
+    return best[:-1]
+
+
+# Aliases of common charsets, see <http://www.iana.org/assignments/character-sets>.
+character_set_aliases = {
+    'ASCII' : 'US-ASCII',
+    'ISO646-US' : 'US-ASCII',
+    'IBM367' : 'US-ASCII',
+    'CP367' : 'US-ASCII',
+    'CSASCII' : 'US-ASCII',
+    'ANSI_X3.4-1968' : 'US-ASCII',
+    'ISO_646.IRV:1991' : 'US-ASCII',
+
+    'UTF7' : 'UTF-7',
+
+    'UTF8' : 'UTF-8',
+
+    'UTF16' : 'UTF-16',
+    'UTF16LE' : 'UTF-16LE',
+    'UTF16BE' : 'UTF-16BE',
+
+    'UTF32' : 'UTF-32',
+    'UTF32LE' : 'UTF-32LE',
+    'UTF32BE' : 'UTF-32BE',
+
+    'UCS2' : 'ISO-10646-UCS-2',
+    'UCS_2' : 'ISO-10646-UCS-2',
+    'UCS-2' : 'ISO-10646-UCS-2',
+    'CSUNICODE' : 'ISO-10646-UCS-2',
+
+    'UCS4' : 'ISO-10646-UCS-4',
+    'UCS_4' : 'ISO-10646-UCS-4',
+    'UCS-4' : 'ISO-10646-UCS-4',
+    'CSUCS4' : 'ISO-10646-UCS-4',
+
+    'ISO_8859-1' : 'ISO-8859-1',
+    'LATIN1' : 'ISO-8859-1',
+    'CP819' : 'ISO-8859-1',
+    'IBM819' : 'ISO-8859-1',
+
+    'ISO_8859-2' : 'ISO-8859-2',
+    'LATIN2' : 'ISO-8859-2',
+
+    'ISO_8859-3' : 'ISO-8859-3',
+    'LATIN3' : 'ISO-8859-3',
+
+    'ISO_8859-4' : 'ISO-8859-4',
+    'LATIN4' : 'ISO-8859-4',
+
+    'ISO_8859-5' : 'ISO-8859-5',
+    'CYRILLIC' : 'ISO-8859-5',
+
+    'ISO_8859-6' : 'ISO-8859-6',
+    'ARABIC' : 'ISO-8859-6',
+    'ECMA-114' : 'ISO-8859-6',
+
+    'ISO_8859-6-E' : 'ISO-8859-6-E',
+    'ISO_8859-6-I' : 'ISO-8859-6-I',
+
+    'ISO_8859-7' : 'ISO-8859-7',
+    'GREEK' : 'ISO-8859-7',
+    'GREEK8' : 'ISO-8859-7',
+    'ECMA-118' : 'ISO-8859-7',
+
+    'ISO_8859-8' : 'ISO-8859-8',
+    'HEBREW' : 'ISO-8859-8',
+
+    'ISO_8859-8-E' : 'ISO-8859-8-E',
+    'ISO_8859-8-I' : 'ISO-8859-8-I',
+
+    'ISO_8859-9' : 'ISO-8859-9',
+    'LATIN5' : 'ISO-8859-9',
+
+    'ISO_8859-10' : 'ISO-8859-10',
+    'LATIN6' : 'ISO-8859-10',
+
+    'ISO_8859-13' : 'ISO-8859-13',
+
+    'ISO_8859-14' : 'ISO-8859-14',
+    'LATIN8' : 'ISO-8859-14',
+
+    'ISO_8859-15' : 'ISO-8859-15',
+    'LATIN9' : 'ISO-8859-15',
+
+    'ISO_8859-16' : 'ISO-8859-16',
+    'LATIN10' : 'ISO-8859-16',
+    }
+
+def canonical_charset( charset ):
+    """Returns the canonical or preferred name of a charset.
+
+    Additional character sets can be recognized by this function by
+    altering the character_set_aliases dictionary in this module.
+    Charsets which are not recognized are simply converted to
+    upper-case (as charset names are always case-insensitive).
+
+    See <http://www.iana.org/assignments/character-sets>.
+
+    """
+    # It would be nice to use Python's codecs modules for this, but
+    # there is no fixed public interface to its alias mappings.
+    if not charset:
+        return charset
+    uc = charset.upper()
+    uccon = character_set_aliases.get( uc, uc )
+    return uccon
+
+
+def acceptable_charset( accept_charset_header, charsets, ignore_wildcard=True, default='ISO-8859-1' ):
+    """
+    Determines if the given charset is acceptable to the user agent.
+
+    The accept_charset_header should be the value present in the HTTP
+    "Accept-Charset:" header.  In mod_python this is typically
+    obtained from the req.http_headers_in table; in WSGI it is
+    environ["Accept-Charset"]; other web frameworks may provide other
+    methods of obtaining it.
+
+    Optionally the accept_charset_header parameter can instead be the
+    list returned from the parse_accept_header() function in this
+    module.
+
+    The charsets argument should either be a charset identifier string,
+    or a sequence of them.
+
+    This function returns the charset which is the most preferred and is
+    acceptable to both the user agent and the caller, as a
+    (charset, qvalue) tuple.  It falls back to the default value if no
+    charset is otherwise negotiable, and returns None if there is no
+    match at all.
+
+    Note that the wildcarded charset "*" will be ignored.  To override
+    this, call with ignore_wildcard=False.
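+
+    For example (a sketch): given an Accept-Charset of "utf-8, *;q=0.1"
+    and available charsets ['US-ASCII', 'UTF-8'], 'UTF-8' would be chosen.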
+
+    See also: RFC 2616 section 14.2, and
+    <http://www.iana.org/assignments/character-sets>
+
+    """
+    if default:
+        default = canonical_charset(default)
+
+    if _is_string(accept_charset_header):
+        accept_list = parse_accept_header(accept_charset_header)
+    else:
+        accept_list = accept_charset_header
+
+    if _is_string(charsets):
+        charsets = [canonical_charset(charsets)]
+    else:
+        charsets = [canonical_charset(c) for c in charsets]
+
+    # Note per RFC that 'ISO-8859-1' is special, and is implicitly in the
+    # accept list with q=1; unless it is already in the list, or '*' is in the list.
+
+    best = None
+    for c, qvalue, junk in accept_list:
+        if c == '*':
+            default = None
+            if ignore_wildcard:
+                continue
+            if not best or qvalue > best[1]:
+                best = (c, qvalue)
+        else:
+            c = canonical_charset(c)
+            for test_c in charsets:
+                if c == default:
+                    default = None
+                if c == test_c and (not best or best[0]=='*' or qvalue > best[1]):
+                    best = (c, qvalue)
+    if default and default in [test_c.upper() for test_c in charsets]:
+        best = (default, 1)
+    if best and best[0] == '*':
+        best = (charsets[0], best[1])
+    return best
+
+
+
+class language_tag(object):
+    """This class represents an RFC 3066 language tag.
+
+    Initialize objects of this class with a single string representing
+    the language tag, such as "en-US".
+
+    Case is insensitive.  Wildcarded subtags are ignored or stripped as
+    they have no significance, so that "en-*" is the same as "en".
+    However the universal wildcard "*" language tag is kept as-is.
+
+    Note that although relational operators such as < are defined,
+    they only form a partial order based upon specialization.
+
+    Thus for example,
+        "en" <= "en-US"
+    but,
+        not "en" <= "de", and
+        not "de" <= "en".
+
+    """
+
+    def __init__(self, tagname):
+        """Initialize objects of this class with a single string representing
+        the language tag, such as "en-US".  Case is insensitive.
+
+        """
+
+        self.parts = tagname.lower().split('-')
+        while len(self.parts) > 1 and self.parts[-1] == '*':
+            del self.parts[-1]
+
+    def __len__(self):
+        """Number of subtags in this tag."""
+        if len(self.parts) == 1 and self.parts[0] == '*':
+            return 0
+        return len(self.parts)
+
+    def __str__(self):
+        """The standard string form of this language tag."""
+        a = []
+        if len(self.parts) >= 1:
+            a.append(self.parts[0])
+        if len(self.parts) >= 2:
+            if len(self.parts[1]) == 2:
+                a.append( self.parts[1].upper() )
+            else:
+                a.append( self.parts[1] )
+        a.extend( self.parts[2:] )
+        return '-'.join(a)
+
+    def __unicode__(self):
+        """The unicode string form of this language tag."""
+        # Probably unnecessary in Python 3
+        if PY3 :
+            return str(self.__str__())
+        else :
+            return unicode(self.__str__())
+
+    def __repr__(self):
+        """The python representation of this language tag."""
+        s = '%s("%s")' % (self.__class__.__name__, self.__str__())
+        return s
+
+    def superior(self):
+        """Returns another instance of language_tag which is the superior.
+
+        Thus en-US gives en, and en gives *.
+
+        """
+        if len(self) <= 1:
+            return self.__class__('*')
+        return self.__class__( '-'.join(self.parts[:-1]) )
+
+    def all_superiors(self, include_wildcard=False):
+        """Returns a list of this language and all its superiors.
+
+        If include_wildcard is False, then "*" will not be among the
+        output list, unless this language is itself "*".
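+
+        For example, language_tag('en-US').all_superiors() would give
+        the tags for 'en-US' and 'en' (a sketch; '*' is included only
+        when include_wildcard is True).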
+
+        """
+        langlist = [ self ]
+        l = self
+        while not l.is_universal_wildcard():
+            l = l.superior()
+            if l.is_universal_wildcard() and not include_wildcard:
+                continue
+            langlist.append(l)
+        return langlist
+
+    def is_universal_wildcard(self):
+        """Returns True if this language tag represents all possible
+        languages, by using the reserved tag of "*".
+
+        """
+        return len(self.parts) == 1 and self.parts[0] == '*'
+
+    def dialect_of(self, other, ignore_wildcard=True):
+        """Is this language a dialect (or subset/specialization) of another.
+
+        This method returns True if this language is the same as or a
+        specialization (dialect) of the other language_tag.
+
+        If ignore_wildcard is False, then all languages will be
+        considered to be a dialect of the special language tag of "*".
+
+        """
+        if not ignore_wildcard and self.is_universal_wildcard():
+            return True
+        for i in range( min(len(self), len(other)) ):
+            if self.parts[i] != other.parts[i]:
+                return False
+        if len(self) >= len(other):
+            return True
+        return False
+
+    def __eq__(self, other):
+        """== operator. Are the two languages the same?"""
+
+        return self.parts == other.parts
+
+    def __ne__(self, other):
+        """!= operator. Are the two languages different?"""
+
+        return not self.__eq__(other)
+
+    def __lt__(self, other):
+        """< operator. Returns True if the other language is a more
+        specialized dialect of this one."""
+
+        return other.dialect_of(self) and self != other
+
+    def __le__(self, other):
+        """<= operator. Returns True if the other language is the same
+        as or a more specialized dialect of this one."""
+        return other.dialect_of(self)
+
+    def __gt__(self, other):
+        """> operator. Returns True if this language is a more
+        specialized dialect of the other one."""
+
+        return self.dialect_of(other) and self != other
+
+    def __ge__(self, other):
+        """>= operator. Returns True if this language is the same as
+        or a more specialized dialect of the other one."""
+
+        return self.dialect_of(other)
+
+
+def parse_accept_language_header( header_value ):
+    """Parses the Accept-Language header.
+
+    Returns a list of tuples, each like:
+
+        (language_tag, qvalue, accept_parameters)
+
+    """
+    alist, k = parse_qvalue_accept_list( header_value)
+    if k < len(header_value):
+        raise ParseError('Accept-Language header is invalid',header_value,k)
+
+    langlist = []
+    for token, langparms, q, acptparms in alist:
+        if langparms:
+            raise ParseError('Language tag may not have any parameters',header_value,0)
+        lang = language_tag( token )
+        langlist.append( (lang, q, acptparms) )
+
+    return langlist
+
+
+def acceptable_language( accept_header, server_languages, ignore_wildcard=True, assume_superiors=True ):
+    """Determines if the given language is acceptable to the user agent.
+
+    The accept_header should be the value present in the HTTP
+    "Accept-Language:" header.  In mod_python this is typically
+    obtained from the req.http_headers_in table; in WSGI it is
+    environ["Accept-Language"]; other web frameworks may provide other
+    methods of obtaining it.
+
+    Optionally the accept_header parameter can be pre-parsed, as
+    returned by the parse_accept_language_header() function defined in
+    this module.
+
+    The server_languages argument should either be a single language
+    string, a language_tag object, or a sequence of them.  It
+    represents the set of languages that the server is willing to
+    send to the user agent.
+
+    Note that the wildcarded language tag "*" will be ignored.  To
+    override this, call with ignore_wildcard=False, and even then
+    it will be the lowest-priority choice regardless of its
+    quality factor (as per the HTTP spec).
+
+    If assume_superiors is True then the languages that the
+    browser accepts will automatically include all superior languages.
+    Any superior languages which must be added are done so with one
+    half the qvalue of the language which is present.  For example, if
+    the accept string is "en-US", then it will be treated as if it
+    were "en-US, en;q=0.5".  Note that although the HTTP 1.1 spec says
+    that browsers are supposed to encourage users to configure all
+    acceptable languages, sometimes they don't, thus the ability
+    for this function to assume this.  But setting assume_superiors
+    to False will ensure strict adherence to the HTTP 1.1 spec; which
+    means that if the browser accepts "en-US", then it will not
+    be acceptable to send just "en" to it.
+
+    This function returns the language which is the most preferred and
+    is acceptable to both the user agent and the caller.  It will
+    return None if no language is negotiable, otherwise the return
+    value is always an instance of language_tag.
+
+    See also: RFC 3066 <http://www.ietf.org/rfc/rfc3066.txt>, and
+    ISO 639, links at <http://en.wikipedia.org/wiki/ISO_639>, and
+    <http://www.iana.org/assignments/language-tags>.
+
+    """
+    # Note special instructions from RFC 2616 sect. 14.1:
+    #   "The language quality factor assigned to a language-tag by the
+    #   Accept-Language field is the quality value of the longest
+    #   language-range in the field that matches the language-tag."
+
+    if _is_string(accept_header):
+        accept_list = parse_accept_language_header(accept_header)
+    else:
+        accept_list = accept_header
+
+    # Possibly add in any "missing" languages that the browser may
+    # have forgotten to include in the list.  Ensure the list is sorted so
+    # more general languages come before more specific ones.
+
+    accept_list.sort()
+    all_tags = [a[0] for a in accept_list]
+    if assume_superiors:
+        to_add = []
+        for langtag, qvalue, aargs in accept_list:
+            if len(langtag) >= 2:
+                for suptag in langtag.all_superiors( include_wildcard=False ):
+                    if suptag not in all_tags:
+                        # Add in superior at half the qvalue
+                        to_add.append( (suptag, qvalue / 2, '') )
+                        all_tags.append( suptag )
+        accept_list.extend( to_add )
+
+    # Convert server_languages to a list of language_tags
+    if _is_string(server_languages):
+        server_languages = [language_tag(server_languages)]
+    elif isinstance(server_languages, language_tag):
+        server_languages = [server_languages]
+    else:
+        server_languages = [language_tag(lang) for lang in server_languages]
+
+    # Select the best one
+    best = None  # tuple (langtag, qvalue, matchlen)
+
+    for langtag, qvalue, aargs in accept_list:
+        # aargs is ignored for Accept-Language
+        if qvalue <= 0:
+            continue # UA doesn't accept this language
+
+        if ignore_wildcard and langtag.is_universal_wildcard():
+            continue  # "*" being ignored
+
+        for svrlang in server_languages:
+            # The best match is determined first by the quality factor,
+            # and then by the most specific match.
+
+            matchlen = -1 # how specifically this one matches (-1 is a non-match)
+            if svrlang.dialect_of( langtag, ignore_wildcard=ignore_wildcard ):
+                matchlen = len(langtag)
+                if not best \
+                        or matchlen > best[2] \
+                        or (matchlen == best[2] and qvalue > best[1]):
+                    # This match is better
+                    best = (langtag, qvalue, matchlen)
+    if not best:
+        return None
+    return best[0]
+
+
+# Clean up global namespace
+try:
+    if __emulating_set:
+        del set
+        del frozenset
+except NameError:
+    pass
+
+# end of file
diff --git a/rdflib/plugins/parsers/pyRdfa/graph.py b/rdflib/plugins/parsers/pyRdfa/graph.py
new file mode 100644
index 00000000..a03fb725
--- /dev/null
+++ b/rdflib/plugins/parsers/pyRdfa/graph.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+Wrapper around RDFLib's Graph object. The issue is that, in RDFLib 2.X, the Turtle and the RDF/XML serializations both have some issues (bugs and ugly output). As a result, the package's own serializers should be registered and used. On the other hand, in RDFLib 3.X this becomes unnecessary; it is better to keep to the library's own version. This wrapper provides a subclass of RDFLib's Graph overriding the serialize method to register, if necessary, a different serializer and use that one.
+
+Also, some bindings (in the RDFLib sense) are done automatically, to ensure a nicer output for widely used schemas…
+
+@summary: Shell around RDFLib's Graph
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+
+@var _bindings: Default bindings. This is just for the beauty of things: bindings are added to the graph to make the output nicer. If this is not done, RDFLib defines prefixes like "_1:", "_2:" which is, though correct, ugly…
+"""
+
+"""
+$Id: graph.py,v 1.6 2012/03/23 14:06:25 ivan Exp $ $Date: 2012/03/23 14:06:25 $
+
+"""
+
+import rdflib
+if rdflib.__version__ >= "3.0.0" :
+    from rdflib import Graph
+else :
+    from rdflib.Graph import Graph
+from rdflib import Namespace
+
+_xml_serializer_name    = "my-rdfxml"
+_turtle_serializer_name = "my-turtle"
+_json_serializer_name   = "my-json-ld"
+
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
+# Default bindings. This is just for the beauty of things: bindings are added to the graph to make the output nicer. If this is not done, RDFLib defines prefixes like "_1:", "_2:" which is, though correct, ugly...
+_bindings = [
+]
+
+
+#########################################################################################################
+class MyGraph(Graph) :
+    """
+    Wrapper around RDFLib's Graph object. The issue is that the serializers in RDFLib are buggy:-(
+
+    In RDFLib 2.X both the Turtle and the RDF/XML serializations have issues (bugs and ugly output). In RDFLib 3.X
+    the Turtle serialization seems to be fine, but the RDF/XML has problems:-(
+
+    This wrapper provides a subclass of RDFLib's Graph overriding the serialize method to register,
+    if necessary, a different serializer and use that one.
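+
+    A minimal usage sketch (assuming the usual RDFLib terms, e.g., URIRef):
+        g = MyGraph()
+        g.add( (URIRef('http://example.org/a'), URIRef('http://example.org/b'), URIRef('http://example.org/c')) )
+        print( g.serialize(format="turtle") )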
+
+    @cvar xml_serializer_registered_2: flag to avoid duplicate registration for RDF/XML for rdflib 2.*
+    @type xml_serializer_registered_2: boolean
+    @cvar xml_serializer_registered_3: flag to avoid duplicate registration for RDF/XML for rdflib 3.*
+    @type xml_serializer_registered_3: boolean
+    @cvar json_serializer_registered: flag to avoid duplicate registration for JSON-LD for rdflib 3.*
+    @type json_serializer_registered: boolean
+    @cvar turtle_serializer_registered_2: flag to avoid duplicate registration for Turtle for rdflib 2.*
+    @type turtle_serializer_registered_2: boolean
+    """
+    xml_serializer_registered_2    = False
+    xml_serializer_registered_3    = False
+    turtle_serializer_registered_2 = False
+    json_serializer_registered     = False
+
+    def __init__(self) :
+        Graph.__init__(self)
+        for (prefix,uri) in _bindings :
+            self.bind(prefix,Namespace(uri))
+
+    def _register_XML_serializer_3(self) :
+        """The default XML serializer of RDFLib 3.X is buggy, mainly when handling lists. An L{own version<serializers.prettyXMLserializer_3>} is
+        registered in RDFLib and used in the rest of the package.
+        """
+        if not MyGraph.xml_serializer_registered_3 :
+            from rdflib.plugin import register
+            from rdflib.serializer import Serializer
+            if rdflib.__version__ > "3.1.0" :
+                register(_xml_serializer_name, Serializer,
+                         "pyRdfa.serializers.prettyXMLserializer_3_2", "PrettyXMLSerializer")
+            else :
+                register(_xml_serializer_name, Serializer,
+                         "pyRdfa.serializers.prettyXMLserializer_3", "PrettyXMLSerializer")
+            MyGraph.xml_serializer_registered_3 = True
+
+    def _register_JSON_serializer_3(self) :
+        """JSON-LD serializer
+        """
+        if not MyGraph.json_serializer_registered :
+            from rdflib.plugin import register
+            from rdflib.serializer import Serializer
+            register(_json_serializer_name, Serializer,
+                     "pyRdfa.serializers.jsonserializer", "JsonSerializer")
+            MyGraph.json_serializer_registered = True
+
+    def _register_XML_serializer_2(self) :
+        """The default XML serializer of RDFLib 2.X is buggy, mainly when handling lists.
+        An L{own version<serializers.prettyXMLserializer>} is
+        registered in RDFLib and used in the rest of the package. This is not used for RDFLib 3.X.
+        """
+        if not MyGraph.xml_serializer_registered_2 :
+            from rdflib.plugin import register
+            from rdflib.syntax import serializer, serializers
+            register(_xml_serializer_name, serializers.Serializer,
+                     "pyRdfa.serializers.prettyXMLserializer", "PrettyXMLSerializer")
+            MyGraph.xml_serializer_registered_2 = True
+
+    def _register_Turtle_serializer_2(self) :
+        """The default Turtle serializer of RDFLib 2.X is buggy and not very nice as far as the output is concerned.
+        An L{own version<serializers.TurtleSerializer>} is registered in RDFLib and used in the rest of the package.
+        This is not used for RDFLib 3.X.
+        """
+        if not MyGraph.turtle_serializer_registered_2 :
+            from rdflib.plugin import register
+            from rdflib.syntax import serializer, serializers
+            register(_turtle_serializer_name, serializers.Serializer,
+                     "pyRdfa.serializers.turtleserializer", "TurtleSerializer")
+            MyGraph.turtle_serializer_registered_2 = True
+
+    def add(self, triple) :
+        """Overriding the Graph's add method to filter out triples with possible None values.
 It may happen
+        in case, for example, a host language is not properly set up for the distiller"""
+        # tuple parameter unpacking in the signature is not valid Python 3,
+        # so the triple is unpacked explicitly
+        (s,p,o) = triple
+        if s is None or p is None or o is None :
+            return
+        else :
+            Graph.add(self, (s,p,o))
+
+    def serialize(self, format = "xml") :
+        """Overriding the Graph's serialize method to adjust the output format"""
+        if rdflib.__version__ >= "3.0.0" :
+            # this is the easy case
+            if format == "xml" or format == "pretty-xml" :
+                self._register_XML_serializer_3()
+                return Graph.serialize(self, format=_xml_serializer_name)
+            elif format == "json-ld" or format == "json" :
+                # The new version of the serializers in RDFLib 3.2.X requires this extra round...
+                # I do not have the patience of working out why that is so.
+                self._register_JSON_serializer_3()
+                stream = StringIO()
+                Graph.serialize(self, format=_json_serializer_name, destination = stream)
+                return stream.getvalue()
+            elif format == "nt" :
+                return Graph.serialize(self, format="nt")
+            elif format == "n3" or format == "turtle" :
+                return Graph.serialize(self, format="turtle")
+        else :
+            if format == "xml" or format == "pretty-xml" :
+                self._register_XML_serializer_2()
+                return Graph.serialize(self, format=_xml_serializer_name)
+            elif format == "nt" :
+                return Graph.serialize(self, format="nt")
+            elif format == "n3" or format == "turtle" :
+                self._register_Turtle_serializer_2()
+                return Graph.serialize(self, format=_turtle_serializer_name)
+
+
diff --git a/rdflib/plugins/parsers/pyRdfa/host/__init__.py b/rdflib/plugins/parsers/pyRdfa/host/__init__.py
new file mode 100755
index 00000000..ffe740fb
--- /dev/null
+++ b/rdflib/plugins/parsers/pyRdfa/host/__init__.py
@@ -0,0 +1,207 @@
+# -*- coding: utf-8 -*-
+"""
+Host language sub-package for the pyRdfa package. It contains variables and possible modules necessary to manage various RDFa
+host languages.
+
+This module may have to be modified if a new host language is added to the system. In many cases rdfa_core as a host language is enough, because there is no need for special processing. However, some host languages may require an initial context, or their value may control some transformations, in which case additional data have to be added to this module. This module header contains all tables and arrays to be adapted, and the module content may contain specific transformation methods.
+
+
+@summary: RDFa Host package
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+
+@var content_to_host_language: a dictionary mapping a media type to a host language
+@var preferred_suffixes: mapping from preferred suffixes for media types; used if the file is local, i.e., there is no HTTP return value for the media type. It corresponds to the preferred suffix in the media type registration
+@var initial_contexts: mapping from host languages to list of initial contexts
+@var accept_xml_base: list of host languages that accept the xml:base attribute for base setting
+@var accept_xml_lang: list of host languages that accept the xml:lang attribute for language setting. Note that XHTML and HTML have some special rules, and those are hard coded...
+@var warn_xmlns_usage: list of host languages that should generate a warning for the usage of @xmlns (for RDFa 1.1) +@var accept_embedded_rdf_xml: list of host languages that might also include RDF data using an embedded RDF/XML (e.g., SVG). That RDF data may be merged with the output +@var accept_embedded_turtle: list of host languages that might also include RDF data using a C{script} element. That RDF data may be merged with the output +@var require_embedded_rdf: list of languages that must accept embedded RDF, ie, the corresponding option is irrelevant +@var host_dom_transforms: dictionary mapping a host language to an array of methods that are invoked at the beginning of the parsing process for a specific node. That function can do a last minute change on that DOM node, eg, adding or modifying an attribute. The method's signature is (node, state), where node is the DOM node, and state is the L{Execution context<pyRdfa.state.ExecutionContext>}. +@var predefined_1_0_rel: terms that are hardcoded for HTML+RDF1.0 and replace the initial context for that version +@var beautifying_prefixes: this is really just to make the output more attractive: for each media type a dictionary of prefix-URI pairs that can be used to make the terms look better... +@var default_vocabulary: as its name suggests, default @vocab value for a specific host language + +""" + +""" +$Id: __init__.py,v 1.19 2012/06/28 11:58:14 ivan Exp $ +$Date: 2012/06/28 11:58:14 $ +""" +__version__ = "3.0" + +from pyRdfa.host.atom import atom_add_entry_type +from pyRdfa.host.html5 import html5_extra_attributes, remove_rel + +class HostLanguage : + """An enumeration style class: recognized host language types for this processor of RDFa. Some processing details may depend on these host languages. 
"rdfa_core" is the default Host Language is nothing else is defined.""" + rdfa_core = "RDFa Core" + xhtml = "XHTML+RDFa" + xhtml5 = "XHTML5+RDFa" + html5 = "HTML5+RDFa" + atom = "Atom+RDFa" + svg = "SVG+RDFa" + +# initial contexts for host languages +initial_contexts = { + HostLanguage.xhtml : ["http://www.w3.org/2011/rdfa-context/rdfa-1.1", + "http://www.w3.org/2011/rdfa-context/xhtml-rdfa-1.1"], + HostLanguage.xhtml5 : ["http://www.w3.org/2011/rdfa-context/rdfa-1.1"], + HostLanguage.html5 : ["http://www.w3.org/2011/rdfa-context/rdfa-1.1"], + HostLanguage.rdfa_core : ["http://www.w3.org/2011/rdfa-context/rdfa-1.1"], + HostLanguage.atom : ["http://www.w3.org/2011/rdfa-context/rdfa-1.1"], + HostLanguage.svg : ["http://www.w3.org/2011/rdfa-context/rdfa-1.1"], +} + +beautifying_prefixes = { + HostLanguage.xhtml : { + "xhv" : "http://www.w3.org/1999/xhtml/vocab#" + }, + HostLanguage.html5 : { + "xhv" : "http://www.w3.org/1999/xhtml/vocab#" + }, + HostLanguage.xhtml5 : { + "xhv" : "http://www.w3.org/1999/xhtml/vocab#" + }, + HostLanguage.atom : { + "atomrel" : "http://www.iana.org/assignments/relation/" + } +} + + +accept_xml_base = [ HostLanguage.rdfa_core, HostLanguage.atom, HostLanguage.svg, HostLanguage.xhtml5 ] +accept_xml_lang = [ HostLanguage.rdfa_core, HostLanguage.atom, HostLanguage.svg ] + +accept_embedded_rdf_xml = [ HostLanguage.svg, HostLanguage.rdfa_core ] +accept_embedded_turtle = [ HostLanguage.svg, HostLanguage.html5, HostLanguage.xhtml5, HostLanguage.xhtml ] + +# some languages, eg, SVG, require that embedded content should be combined with the default graph, +# ie, it cannot be turned down by an option +require_embedded_rdf = [ HostLanguage.svg ] + +warn_xmlns_usage = [ HostLanguage.html5, HostLanguage.xhtml5, HostLanguage.xhtml] + +host_dom_transforms = { + HostLanguage.atom : [atom_add_entry_type], + HostLanguage.html5 : [html5_extra_attributes, remove_rel], + HostLanguage.xhtml5 : [html5_extra_attributes, remove_rel] +} + +default_vocabulary = { + HostLanguage.atom : "http://www.iana.org/assignments/relation/" +} + +predefined_1_0_rel = ['alternate', 'appendix', 'cite', 'bookmark', 'chapter', 'contents', +'copyright', 'glossary', 'help', 'icon', 'index', 'meta', 'next', 'p3pv1', 'prev', 'previous', +'role', 'section', 'subsection', 'start', 'license', 'up', 'last', 'stylesheet', 'first', 'top'] + +# ---------------------------------------------------------------------------------------------------------- + +class MediaTypes : + """An enumeration style class: some common media types (better have them at one place to avoid misstyping...)""" + rdfxml = 'application/rdf+xml' + turtle = 'text/turtle' + html = 'text/html' + xhtml = 'application/xhtml+xml' + svg = 'application/svg+xml' + svgi = 'image/svg+xml' + smil = 'application/smil+xml' + atom = 'application/atom+xml' + xml = 'application/xml' + xmlt = 'text/xml' + nt = 'text/plain' + +# mapping from (some) content types to RDFa host languages. This may control the exact processing or at least the initial context (see below)... +content_to_host_language = { + MediaTypes.html : HostLanguage.html5, + MediaTypes.xhtml : HostLanguage.xhtml, + MediaTypes.xml : HostLanguage.rdfa_core, + MediaTypes.xmlt : HostLanguage.rdfa_core, + MediaTypes.smil : HostLanguage.rdfa_core, + MediaTypes.svg : HostLanguage.svg, + MediaTypes.svgi : HostLanguage.svg, + MediaTypes.atom : HostLanguage.atom, +} + +# mapping preferred suffixes to media types... 
+
+preferred_suffixes = {
+    ".rdf"   : MediaTypes.rdfxml,
+    ".ttl"   : MediaTypes.turtle,
+    ".n3"    : MediaTypes.turtle,
+    ".owl"   : MediaTypes.rdfxml,
+    ".html"  : MediaTypes.html,
+    ".shtml" : MediaTypes.html,
+    ".xhtml" : MediaTypes.xhtml,
+    ".svg"   : MediaTypes.svg,
+    ".smil"  : MediaTypes.smil,
+    ".xml"   : MediaTypes.xml,
+    ".nt"    : MediaTypes.nt,
+    ".atom"  : MediaTypes.atom
+}
+
+# DTD combinations that may determine the host language and the rdfa version
+_XHTML_1_0 = [
+    ("-//W3C//DTD XHTML+RDFa 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd")
+]
+
+_XHTML_1_1 = [
+    ("-//W3C//DTD XHTML+RDFa 1.1//EN", "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-2.dtd"),
+    ("-//W3C//DTD HTML 4.01+RDFa 1.1//EN", "http://www.w3.org/MarkUp/DTD/html401-rdfa11-1.dtd")
+]
+
+_XHTML = [
+    ("-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
+    ("-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"),
+    ("-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd")
+]
+
+def adjust_html_version(input, rdfa_version) :
+    """
+    Adjust the rdfa_version based on the (possible) DTD
+    @param input: the input stream that has to be parsed by an xml parser
+    @param rdfa_version: the current rdfa_version; will be returned if nothing else is found
+    @return: the rdfa_version, either "1.0" or "1.1", if the DTD says so, otherwise the input rdfa_version value
+    """
+    import xml.dom.minidom
+    parse = xml.dom.minidom.parse
+    dom = parse(input)
+
+    (hl,version) = adjust_xhtml_and_version(dom, HostLanguage.xhtml, rdfa_version)
+    return version
+
+def adjust_xhtml_and_version(dom, incoming_language, rdfa_version) :
+    """
+    Check whether the XHTML+RDFa content is really XHTML 1.0 or 1.1, or whether it should be considered to be XHTML5. This is done
+    by looking at the DTD. Furthermore, it checks whether the system id signals RDFa 1.0, in which case the
+    version is also set.
+
+    @param dom: top level DOM node
+    @param incoming_language: host language to be checked; the whole check is relevant for xhtml only.
+    @param rdfa_version: rdfa_version as known by the caller
+    @return: a tuple of the possibly modified host language (ie, set to XHTML5) and the possibly modified rdfa version (ie, set to "1.0", "1.1", or the incoming rdfa_version if nothing is found)
+    """
+    if incoming_language == HostLanguage.xhtml :
+        try :
+            # There may not be any doctype set in the first place...
+            publicId = dom.doctype.publicId
+            systemId = dom.doctype.systemId
+
+            if (publicId, systemId) in _XHTML_1_0 :
+                return (HostLanguage.xhtml,"1.0")
+            elif (publicId, systemId) in _XHTML_1_1 :
+                return (HostLanguage.xhtml,"1.1")
+            elif (publicId, systemId) in _XHTML :
+                return (HostLanguage.xhtml, rdfa_version)
+            else :
+                return (HostLanguage.xhtml5, rdfa_version)
+        except :
+            # If any of those are missing, forget it...
+            return (HostLanguage.xhtml5, rdfa_version)
+    else :
+        return (incoming_language, rdfa_version)
+
diff --git a/rdflib/plugins/parsers/pyRdfa/host/atom.py b/rdflib/plugins/parsers/pyRdfa/host/atom.py new file mode 100644 index 00000000..149a4a4c --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/host/atom.py @@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+"""
+Simple transformer for Atom: the C{@typeof=""} is added to the C{<entry>} element (unless something is already there).
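+Illustration: C{<entry><title property="dc:title">My title</title></entry>} becomes
+C{<entry typeof=""><title property="dc:title">My title</title></entry>}, i.e., the entry gets
+a subject (a blank node) of its own; an C{<entry>} that already carries C{@resource}, C{@about},
+C{@href}, C{@src}, or C{@typeof} is left untouched.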
+
+@summary: Add a default @typeof to Atom <entry> elements
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+@contact: Ivan Herman, ivan@w3.org
+"""
+
+"""
+$Id: atom.py,v 1.2 2012/05/21 15:28:20 ivan Exp $
+$Date: 2012/05/21 15:28:20 $
+"""
+
+def atom_add_entry_type(node, state) :
+    """
+    @param node: the current node that could be modified
+    @param state: current state
+    @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
+    """
+    def res_set(node) :
+        return True in [ node.hasAttribute(a) for a in ["resource", "about", "href", "src"] ]
+
+    if node.tagName == "entry" and not res_set(node) and node.hasAttribute("typeof") == False :
+        node.setAttribute("typeof","")
diff --git a/rdflib/plugins/parsers/pyRdfa/host/html5.py b/rdflib/plugins/parsers/pyRdfa/host/html5.py new file mode 100644 index 00000000..3a346498 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/host/html5.py @@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+"""
+Simple transformer for HTML5: add a @src for any @data, add a @content for the @value attribute of the <data> element, and interpret the <time> element.
+
+@summary: HTML5-specific DOM transformations for RDFa processing
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+@contact: Ivan Herman, ivan@w3.org
+"""
+
+"""
+$Id: html5.py,v 1.10 2012/06/28 11:58:14 ivan Exp $
+$Date: 2012/06/28 11:58:14 $
+"""
+try :
+    from functools import reduce
+except :
+    # Not important. This import is necessary only in Python 3; in newer versions of Python 2.X it is there
+    # for forward compatibility with Python 3
+    pass
+
+# The handling of datetime is a little bit more complex...
better put this in a separate function for a better management +from datetime import datetime +import re +datetime_type = "http://www.w3.org/2001/XMLSchema#dateTime" +time_type = "http://www.w3.org/2001/XMLSchema#time" +date_type = "http://www.w3.org/2001/XMLSchema#date" +date_gYear = "http://www.w3.org/2001/XMLSchema#gYear" +date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth" +date_gMonthDay = "http://www.w3.org/2001/XMLSchema#gMonthDay" +duration_type = "http://www.w3.org/2001/XMLSchema#duration" +plain = "plain" + +handled_time_types = [ datetime_type, time_type, date_type, date_gYear, date_gYearMonth, date_gMonthDay, duration_type ] + +_formats = { + date_gMonthDay : [ "%m-%d" ], + date_gYearMonth : [ "%Y-%m"], + date_gYear : [ "%Y" ], + date_type : [ "%Y-%m-%d", "%Y-%m-%dZ" ], + time_type : [ "%H:%M", + "%H:%M:%S", + "%H:%M:%SZ", + "%H:%M:%S.%f" ], + datetime_type : [ "%Y-%m-%dT%H:%M", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%MZ", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S.%fZ" ], + duration_type : [ "P%dD", + "P%YY%mM%dD", + "P%YY%mM", + "P%YY%dD", + "P%YY", + "P%mM", + "P%mM%dD", + ], +} + +_dur_times = [ "%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS" ] + +def _format_test(string) : + """ + Tests the string format to see whether it fits one of the time datatypes + @param string: attribute value to test + @return: a URI for the xsd datatype or the string 'plain' + """ + # Try to get the easy cases: + for key in _formats : + for format in _formats[key] : + try : + # try to check if the syntax is fine + d = datetime.strptime(string, format) + # bingo! + return key + except ValueError : + pass + + # Now come the special cases:-( + # Check first for the duration stuff, that is the nastiest. + if len(string) > 2 and string[0] == 'P' or (string [0] == '-' and string[1] == 'P') : + # this is meant to be a duration type + # first of all, get rid of the leading '-' and check again + if string[0] == '-' : + for format in _formats[duration_type] : + try : + # try to check if the syntax is fine + d = datetime.strptime(string, format) + # bingo! + return duration_type + except ValueError : + pass + # Let us see if the value contains a separate time portion, and cut that one + durs = string.split('T') + if len(durs) == 2 : + # yep, so we should check again + dur = durs[0] + tm = durs[1] + # Check the duration part + td = False + for format in _formats[duration_type] : + try : + # try to check if the syntax is fine + d = datetime.strptime(dur, format) + # bingo! + td = True + break + except ValueError : + pass + if td == True : + # Getting there... + for format in _dur_times : + try : + # try to check if the syntax is fine + d = datetime.strptime(tm, format) + # bingo! + return duration_type + except ValueError : + pass + # something went wrong... 
+
+                    return plain
+        else :
+            # Well, no more tricks, this is a plain type
+            return plain
+
+    # If we got here, we should check the time zone
+    # there is a discrepancy between the python and the HTML5/XSD lexical string,
+    # which means that this has to be handled separately for the date and the timezone portion
+    try :
+        # The time-zone-less portion of the string
+        str = string[0:-6]
+        # The time-zone portion
+        tz = string[-5:]
+        try :
+            t = datetime.strptime(tz,"%H:%M")
+        except ValueError :
+            # Bummer, this is not a correct time
+            return plain
+        # The time-zone is fine, the datetime portion has to be checked
+        for format in _formats[datetime_type] :
+            try :
+                # try to check if it is fine
+                d = datetime.strptime(str, format)
+                # Bingo!
+                return datetime_type
+            except ValueError :
+                pass
+    except :
+        pass
+    return plain
+
+def html5_extra_attributes(node, state) :
+    """
+    @param node: the current node that could be modified
+    @param state: current state
+    @type state: L{Execution context<pyRdfa.state.ExecutionContext>}
+    """
+    def _get_literal(Pnode):
+        """
+        Get (recursively) the full text from a DOM Node.
+
+        @param Pnode: DOM Node
+        @return: string
+        """
+        rc = ""
+        for node in Pnode.childNodes:
+            if node.nodeType == node.TEXT_NODE:
+                rc = rc + node.data
+            elif node.nodeType == node.ELEMENT_NODE :
+                rc = rc + _get_literal(node)
+        if state.options.space_preserve :
+            return rc
+        else :
+            return re.sub(r'(\r| |\n|\t)+'," ",rc).strip()
+        #return re.sub(r'(\r| |\n|\t)+',"",rc).strip()
+    # end _get_literal
+
+    if node.hasAttribute("value") :
+        # state.supress_lang = True
+        node.setAttribute("content", node.getAttribute("value"))
+
+    elif node.tagName == "time":
+        # see if there is already a datatype attribute; if so, the author has made his/her own encoding
+        # The value can come from the attribute or the content:
+        if node.hasAttribute("datetime") :
+            value = node.getAttribute("datetime")
+        else :
+            # The value comes from the content of the XML
+            value = _get_literal(node)
+        # If the user has already set the datatype, then let that one win
+        if not node.hasAttribute("datatype") :
+            # Check the datatype:
+            dt = _format_test(value)
+            if dt != plain :
+                node.setAttribute("datatype",dt)
+        # Finally, set the value itself
+        node.setAttribute("content",value)
+
+    #elif node.hasAttribute("data") and not node.hasAttribute("src") :
+    #    node.setAttribute("src", node.getAttribute("data"))
+
+def remove_rel(node, state):
+    """
+    If @property and @rel/@rev are on the same element, then only CURIEs and URIs can appear as a rel/rev value.
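+    Illustration: in C{<a property="dc:source" rel="license" href="http://www.example.org/l">}
+    the term 'license' is removed from C{@rel} (only CURIEs and URIs survive next to C{@property}),
+    while a value such as C{rel="xhv:license"} or a full URI would be kept.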
+ + @param node: the current node that could be modified + @param state: current state + @type state: L{Execution context<pyRdfa.state.ExecutionContext>} + """ + from pyRdfa.termorcurie import termname + def _massage_node(node,attr) : + """The real work for remove_rel is done here, parametrized with @rel and @rev""" + if node.hasAttribute("property") and node.hasAttribute(attr) : + vals = node.getAttribute(attr).strip().split() + if len(vals) != 0 : + final_vals = [ v for v in vals if not termname.match(v) ] + if len(final_vals) == 0 : + node.removeAttribute(attr) + else : + node.setAttribute(attr, reduce(lambda x,y: x+' '+y,final_vals)) + + _massage_node(node, "rev") + _massage_node(node, "rel") + + + + + + + + + + + + + diff --git a/rdflib/plugins/parsers/pyRdfa/initialcontext.py b/rdflib/plugins/parsers/pyRdfa/initialcontext.py new file mode 100644 index 00000000..344d3b51 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/initialcontext.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +""" +Built-in version of the initial contexts for RDFa Core, and RDFa + HTML + +@summary: Management of vocabularies, terms, and their mapping to URI-s. +@requires: U{RDFLib package<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} + +@var initial_context: dictionary for all the initial context data, keyed through the context URI-s +""" + +""" +$Id: initialcontext.py,v 1.8 2012/04/23 09:54:27 ivan Exp $ +$Date: 2012/04/23 09:54:27 $ +""" + +class Wrapper : + pass + +initial_context = { + "http://www.w3.org/2011/rdfa-context/rdfa-1.1" : Wrapper(), + "http://www.w3.org/2011/rdfa-context/xhtml-rdfa-1.1" : Wrapper(), +} + +initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns = { + 'owl' : 'http://www.w3.org/2002/07/owl#', + 'gr' : 'http://purl.org/goodrelations/v1#', + 'ctag' : 'http://commontag.org/ns#', + 'cc' : 'http://creativecommons.org/ns#', + 'grddl' : 'http://www.w3.org/2003/g/data-view#', + 'rif' : 'http://www.w3.org/2007/rif#', + 'sioc' : 'http://rdfs.org/sioc/ns#', + 'skos' : 'http://www.w3.org/2004/02/skos/core#', + 'xml' : 'http://www.w3.org/XML/1998/namespace', + 'rdfs' : 'http://www.w3.org/2000/01/rdf-schema#', + 'rev' : 'http://purl.org/stuff/rev#', + 'rdfa' : 'http://www.w3.org/ns/rdfa#', + 'dc' : 'http://purl.org/dc/terms/', + 'dcterms' : 'http://purl.org/dc/terms/', + 'foaf' : 'http://xmlns.com/foaf/0.1/', + 'void' : 'http://rdfs.org/ns/void#', + 'ical' : 'http://www.w3.org/2002/12/cal/icaltzd#', + 'vcard' : 'http://www.w3.org/2006/vcard/ns#', + 'wdrs' : 'http://www.w3.org/2007/05/powder-s#', + 'og' : 'http://ogp.me/ns#', + 'wdr' : 'http://www.w3.org/2007/05/powder#', + 'rdf' : 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'xhv' : 'http://www.w3.org/1999/xhtml/vocab#', + 'xsd' : 'http://www.w3.org/2001/XMLSchema#', + 'v' : 'http://rdf.data-vocabulary.org/#', + 'skosxl' : 'http://www.w3.org/2008/05/skos-xl#', + 'schema' : 'http://schema.org/', + 'ma' : 'http://www.w3.org/ns/ma-ont#', +} + +initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].terms = { + 'describedby' : 'http://www.w3.org/2007/05/powder-s#describedby', + 'role' : 'http://www.w3.org/1999/xhtml/vocab#role', + 'license' : 'http://www.w3.org/1999/xhtml/vocab#license', +} + 
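(A quick sketch of what these wrapper objects give the rest of the parser: prefix and term resolution becomes a plain dictionary lookup; Python 2 style, to match the module::

    from pyRdfa.initialcontext import initial_context
    ctx = initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"]
    print ctx.ns['foaf']        # http://xmlns.com/foaf/0.1/
    print ctx.terms['license']  # http://www.w3.org/1999/xhtml/vocab#license

)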
+initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].vocabulary = "" + +initial_context["http://www.w3.org/2011/rdfa-context/xhtml-rdfa-1.1"].ns = { +} + +initial_context["http://www.w3.org/2011/rdfa-context/xhtml-rdfa-1.1"].vocabulary = "" + +initial_context["http://www.w3.org/2011/rdfa-context/xhtml-rdfa-1.1"].terms = { + 'alternate' : 'http://www.w3.org/1999/xhtml/vocab#alternate', + 'appendix' : 'http://www.w3.org/1999/xhtml/vocab#appendix', + 'cite' : 'http://www.w3.org/1999/xhtml/vocab#cite', + 'bookmark' : 'http://www.w3.org/1999/xhtml/vocab#bookmark', + 'chapter' : 'http://www.w3.org/1999/xhtml/vocab#chapter', + 'contents' : 'http://www.w3.org/1999/xhtml/vocab#contents', + 'copyright' : 'http://www.w3.org/1999/xhtml/vocab#copyright', + 'glossary' : 'http://www.w3.org/1999/xhtml/vocab#glossary', + 'help' : 'http://www.w3.org/1999/xhtml/vocab#help', + 'icon' : 'http://www.w3.org/1999/xhtml/vocab#icon', + 'index' : 'http://www.w3.org/1999/xhtml/vocab#index', + 'meta' : 'http://www.w3.org/1999/xhtml/vocab#meta', + 'next' : 'http://www.w3.org/1999/xhtml/vocab#next', + 'license' : 'http://www.w3.org/1999/xhtml/vocab#license', + 'p3pv1' : 'http://www.w3.org/1999/xhtml/vocab#p3pv1', + 'prev' : 'http://www.w3.org/1999/xhtml/vocab#prev', + 'previous' : 'http://www.w3.org/1999/xhtml/vocab#previous', + 'role' : 'http://www.w3.org/1999/xhtml/vocab#role', + 'section' : 'http://www.w3.org/1999/xhtml/vocab#section', + 'stylesheet' : 'http://www.w3.org/1999/xhtml/vocab#stylesheet', + 'subsection' : 'http://www.w3.org/1999/xhtml/vocab#subsection', + 'start' : 'http://www.w3.org/1999/xhtml/vocab#start', + 'up' : 'http://www.w3.org/1999/xhtml/vocab#up', + 'last' : 'http://www.w3.org/1999/xhtml/vocab#last', + 'first' : 'http://www.w3.org/1999/xhtml/vocab#first', + 'top' : 'http://www.w3.org/1999/xhtml/vocab#top', +} + diff --git a/rdflib/plugins/parsers/pyRdfa/options.py b/rdflib/plugins/parsers/pyRdfa/options.py new file mode 100644 index 00000000..1ff3f942 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/options.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- +""" +L{Options} class: collect the possible options that govern the parsing possibilities. The module also includes the L{ProcessorGraph} class that handles the processor graph, per RDFa 1.1 (i.e., the graph containing errors and warnings). 
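(Concretely, each item recorded in the processor graph is a blank node of roughly the following shape, following the processor graph vocabulary; Turtle, values illustrative::

    [] a rdfa:Warning ;
       dc:description "[In element 'span'] ..." ;
       dc:date "2012-08-31T14:00:00"^^xsd:dateTime ;
       rdfa:context [ a ht:Request ; ht:requestURI "http://www.example.org/doc.html" ] .

)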
+ +@summary: RDFa parser (distiller) +@requires: U{RDFLib<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: options.py,v 1.16 2012/08/20 14:14:14 ivan Exp $ $Date: 2012/08/20 14:14:14 $ +""" + +import sys, datetime + +import rdflib +from rdflib import URIRef +from rdflib import Literal +from rdflib import BNode +from rdflib import Namespace +if rdflib.__version__ >= "3.0.0" : + from rdflib import Graph + from rdflib import RDF as ns_rdf + from rdflib import RDFS as ns_rdfs +else : + from rdflib.Graph import Graph + from rdflib.RDFS import RDFSNS as ns_rdfs + from rdflib.RDF import RDFNS as ns_rdf + +from pyRdfa.host import HostLanguage, MediaTypes, content_to_host_language, predefined_1_0_rel, require_embedded_rdf +from pyRdfa import ns_xsd, ns_distill, ns_rdfa +from pyRdfa import RDFA_Error, RDFA_Warning, RDFA_Info + +ns_dc = Namespace("http://purl.org/dc/terms/") +ns_ht = Namespace("http://www.w3.org/2006/http#") + +class ProcessorGraph : + """Wrapper around the 'processor graph', ie, the (RDF) Graph containing the warnings, + error messages, and informational messages. + """ + def __init__(self) : + self.graph = Graph() + self.graph.bind("dcterms", ns_dc) + self.graph.bind("pyrdfa", ns_distill) + self.graph.bind("rdf", ns_rdf) + self.graph.bind("rdfa", ns_rdfa) + self.graph.bind("ht", ns_ht) + + def add_triples(self, msg, top_class, info_class, context, node) : + """ + Add an error structure to the processor graph: a bnode with a number of predicates. The structure + follows U{the processor graph vocabulary<http://www.w3.org/2010/02/rdfa/wiki/Processor_Graph_Vocabulary>} as described + on the RDFa WG Wiki page. + + @param msg: the core error message, added as an object to a dc:description + @param top_class: Error, Warning, or Info; an explicit rdf:type added to the bnode + @type top_class: URIRef + @param info_class: An additional error class, added as an rdf:type to the bnode in case it is not None + @type info_class: URIRef + @param context: An additional information added, if not None, as an object with rdfa:context as a predicate + @type context: either an URIRef or a URI String (an URIRef will be created in the second case) + @param node: The node's element name that contains the error + @type node: string + @return: the bnode that serves as a subject for the errors. 
The caller may add additional information + @rtype: BNode + """ + # Python 3 foolproof way + try : + is_context_string = isinstance(context, basestring) + except : + is_context_string = isinstance(context, str) + + bnode = BNode() + + if node != None: + try : + full_msg = "[In element '%s'] %s" % (node.nodeName, msg) + except : + full_msg = "[In element '%s'] %s" % (node, msg) + else : + full_msg = msg + + self.graph.add((bnode, ns_rdf["type"], top_class)) + if info_class : + self.graph.add((bnode, ns_rdf["type"], info_class)) + self.graph.add((bnode, ns_dc["description"], Literal(full_msg))) + self.graph.add((bnode, ns_dc["date"], Literal(datetime.datetime.utcnow().isoformat(),datatype=ns_xsd["dateTime"]))) + if context and (isinstance(context,URIRef) or is_context_string): + htbnode = BNode() + self.graph.add( (bnode, ns_rdfa["context"],htbnode) ) + self.graph.add( (htbnode, ns_rdf["type"], ns_ht["Request"]) ) + self.graph.add( (htbnode, ns_ht["requestURI"], Literal("%s" % context)) ) + return bnode + + def add_http_context(self, subj, http_code) : + """ + Add an additional HTTP context to a message with subject in C{subj}, using the U{<http://www.w3.org/2006/http#>} + vocabulary. Typically used to extend an error structure, as created by L{add_triples}. + + @param subj: an RDFLib resource, typically a blank node + @param http_code: HTTP status code + """ + bnode = BNode() + self.graph.add((subj, ns_rdfa["context"], bnode)) + self.graph.add((bnode, ns_rdf["type"], ns_ht["Response"])) + self.graph.add((bnode, ns_ht["responseCode"], URIRef("http://www.w3.org/2006/http#%s" % http_code))) + +class Options : + """Settable options. An instance of this class is stored in + the L{execution context<ExecutionContext>} of the parser. + + @ivar space_preserve: whether plain literals should preserve spaces at output or not + @type space_preserve: Boolean + + @ivar output_default_graph: whether the 'default' graph should be returned to the user + @type output_default_graph: Boolean + + @ivar output_processor_graph: whether the 'processor' graph should be returned to the user + @type output_processor_graph: Boolean + + @ivar processor_graph: the 'processor' Graph + @type processor_graph: L{ProcessorGraph} + + @ivar transformers: extra transformers + @type transformers: list + + @ivar vocab_cache_report: whether the details of vocabulary file caching process should be reported as information (mainly for debug) + @type vocab_cache_report: Boolean + + @ivar refresh_vocab_cache: whether the caching checks of vocabs should be by-passed, ie, if caches should be re-generated regardless of the stored date (important for vocab development) + @type refresh_vocab_cache: Boolean + + @ivar embedded_rdf: whether embedded RDF (ie, turtle in an HTML script element or an RDF/XML content in SVG) should be extracted and added to the final graph. This is a non-standard option... + @type embedded_rdf: Boolean + + @ivar vocab_expansion: whether the @vocab elements should be expanded and a mini-RDFS processing should be done on the merged graph + @type vocab_expansion: Boolean + + @ivar vocab_cache: whether the system should use the vocabulary caching mechanism when expanding via the mini-RDFS, or should just fetch the graphs every time + @type vocab_cache: Boolean + + @ivar host_language: the host language for the RDFa attributes. Default is HostLanguage.xhtml, but it can be HostLanguage.rdfa_core and HostLanguage.html5, or others... 
+
+    @type host_language: string (logically: an enumeration)
+
+    @ivar content_type: the content type of the host file. Default is None
+    @type content_type: string (logically: an enumeration)
+
+    @ivar add_informational_messages: whether informational messages should also be added to the processor graph, or only errors and warnings
+    """
+    def __init__(self, output_default_graph       = True,
+                       output_processor_graph     = False,
+                       space_preserve             = True,
+                       transformers               = [],
+                       embedded_rdf               = True,
+                       vocab_expansion            = False,
+                       vocab_cache                = True,
+                       vocab_cache_report         = False,
+                       refresh_vocab_cache        = False,
+                       add_informational_messages = False) :
+        self.space_preserve             = space_preserve
+        self.transformers               = transformers
+        self.processor_graph            = ProcessorGraph()
+        self.output_default_graph       = output_default_graph
+        self.output_processor_graph     = output_processor_graph
+        self.host_language              = HostLanguage.rdfa_core
+        self.vocab_cache_report         = vocab_cache_report
+        self.refresh_vocab_cache        = refresh_vocab_cache
+        self.embedded_rdf               = embedded_rdf
+        self.vocab_expansion            = vocab_expansion
+        self.vocab_cache                = vocab_cache
+        self.add_informational_messages = add_informational_messages
+
+    def set_host_language(self, content_type) :
+        """
+        Set the host language for processing, based on the recognized types. If this is not a recognized content type,
+        it falls back to RDFa core (i.e., XML)
+        @param content_type: content type
+        @type content_type: string
+        """
+        if content_type in content_to_host_language :
+            self.host_language = content_to_host_language[content_type]
+        else :
+            self.host_language = HostLanguage.rdfa_core
+
+        if self.host_language in require_embedded_rdf :
+            self.embedded_rdf = True
+
+    def __str__(self) :
+        retval = """Current options:
+        preserve space                  : %s
+        output processor graph          : %s
+        output default graph            : %s
+        host language                   : %s
+        accept embedded RDF             : %s
+        perform semantic postprocessing : %s
+        cache vocabulary graphs         : %s
+        """
+        return retval % (self.space_preserve, self.output_processor_graph, self.output_default_graph, self.host_language, self.embedded_rdf, self.vocab_expansion, self.vocab_cache)
+
+    def reset_processor_graph(self):
+        """Empty the processor graph. This is necessary if the same options instance is reused
+        for several RDFa sources, and new error messages should be generated.
+        """
+        self.processor_graph.graph.remove((None,None,None))
+
+    def add_warning(self, txt, warning_type=None, context=None, node=None, buggy_value=None) :
+        """Add a warning to the processor graph.
+        @param txt: the warning text.
+        @keyword warning_type: Warning Class
+        @type warning_type: URIRef
+        @keyword context: possible context to be added to the processor graph
+        @type context: URIRef or String
+        @keyword buggy_value: a special case when a 'term' is not recognized; no warning is generated for that case if the value is one of the 'usual' XHTML terms, because almost all RDFa files contain some of those and that would pollute the output
+        @type buggy_value: String
+        """
+        if warning_type == ns_rdfa["UnresolvedTerm"] and buggy_value in predefined_1_0_rel :
+            return
+        return self.processor_graph.add_triples(txt, RDFA_Warning, warning_type, context, node)
+
+    def add_info(self, txt, info_type=None, context=None, node=None, buggy_value=None) :
+        """Add an informational comment to the processor graph.
+        @param txt: the information text.
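(A sketch of how these options are typically wired up, and how a warning ends up in the processor graph; all values illustrative::

    from pyRdfa.options import Options
    from pyRdfa.host import MediaTypes

    options = Options(output_processor_graph = True)
    options.set_host_language(MediaTypes.html)   # selects HostLanguage.html5
    options.add_warning("something looked odd in the input")
    print options.processor_graph.graph.serialize(format="turtle")

)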
+ @keyword info_type: Info Class + @type info_type: URIRef + @keyword context: possible context to be added to the processor graph + @type context: URIRef or String + @keyword buggy_value: a special case when a 'term' is not recognized; no information is generated for that case if the value is part of the 'usual' XHTML terms, because almost all RDFa file contains some of those and that would pollute the output + @type buggy_value: String + """ + if self.add_informational_messages : + return self.processor_graph.add_triples(txt, RDFA_Info, info_type, context, node) + else : + return + + def add_error(self, txt, err_type=None, context=None, node=None, buggy_value=None) : + """Add an error to the processor graph. + @param txt: the information text. + @keyword err_type: Error Class + @type err_type: URIRef + @keyword context: possible context to be added to the processor graph + @type context: URIRef or String + @keyword buggy_value: a special case when a 'term' is not recognized; no error is generated for that case if the value is part of the 'usual' XHTML terms, because almost all RDFa file contains some of those and that would pollute the output + @type buggy_value: String + """ + return self.processor_graph.add_triples(txt, RDFA_Error, err_type, context, node) + diff --git a/rdflib/plugins/parsers/pyRdfa/parse.py b/rdflib/plugins/parsers/pyRdfa/parse.py new file mode 100644 index 00000000..ecec8981 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/parse.py @@ -0,0 +1,510 @@ +# -*- coding: utf-8 -*- +""" +The core parsing function of RDFa. Some details are +put into other modules to make it clearer to update/modify (e.g., generation of C{@property} values, or managing the current state). + +Note that the entry point (L{parse_one_node}) bifurcates into an RDFa 1.0 and RDFa 1.1 version, ie, +to L{_parse_1_0} and L{_parse_1_1}. Some of the parsing details (management of C{@property}, list facilities, changed behavior on C{@typeof})) have changed +between versions and forcing the two into one function would be counter productive. + +@summary: RDFa core parser processing step +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: parse.py,v 1.17 2012/06/12 11:47:28 ivan Exp $ +$Date: 2012/06/12 11:47:28 $ +""" + +import sys + +from pyRdfa.state import ExecutionContext +from pyRdfa.property import ProcessProperty +from pyRdfa.embeddedRDF import handle_embeddedRDF +from pyRdfa.host import HostLanguage, host_dom_transforms + +import rdflib +from rdflib import URIRef +from rdflib import Literal +from rdflib import BNode +from rdflib import Namespace +if rdflib.__version__ >= "3.0.0" : + from rdflib import Graph + from rdflib import RDF as ns_rdf + from rdflib import RDFS as ns_rdfs +else : + from rdflib.Graph import Graph + from rdflib.RDFS import RDFSNS as ns_rdfs + from rdflib.RDF import RDFNS as ns_rdf + +from pyRdfa import IncorrectBlankNodeUsage, err_no_blank_node +from pyRdfa.utils import has_one_of_attributes + +####################################################################### +def parse_one_node(node, graph, parent_object, incoming_state, parent_incomplete_triples) : + """The (recursive) step of handling a single node. + + This entry just switches between the RDFa 1.0 and RDFa 1.1 versions for parsing. 
This method is only invoked once, + actually, from the top level; the recursion then happens in the L{_parse_1_0} and L{_parse_1_1} methods for + RDFa 1.0 and RDFa 1.1, respectively. + + @param node: the DOM node to handle + @param graph: the RDF graph + @type graph: RDFLib's Graph object instance + @param parent_object: the parent's object, as an RDFLib URIRef + @param incoming_state: the inherited state (namespaces, lang, etc.) + @type incoming_state: L{state.ExecutionContext} + @param parent_incomplete_triples: list of hanging triples (the missing resource set to None) to be handled (or not) + by the current node. + @return: whether the caller has to complete it's parent's incomplete triples + @rtype: Boolean + """ + # Branch according to versions. + if incoming_state.rdfa_version >= "1.1" : + _parse_1_1(node, graph, parent_object, incoming_state, parent_incomplete_triples) + else : + _parse_1_0(node, graph, parent_object, incoming_state, parent_incomplete_triples) + +####################################################################### +def _parse_1_1(node, graph, parent_object, incoming_state, parent_incomplete_triples) : + """The (recursive) step of handling a single node. See the + U{RDFa 1.1 Core document<http://www.w3.org/TR/rdfa-core/>} for further details. + + This is the RDFa 1.1 version. + + @param node: the DOM node to handle + @param graph: the RDF graph + @type graph: RDFLib's Graph object instance + @param parent_object: the parent's object, as an RDFLib URIRef + @param incoming_state: the inherited state (namespaces, lang, etc.) + @type incoming_state: L{state.ExecutionContext} + @param parent_incomplete_triples: list of hanging triples (the missing resource set to None) to be handled (or not) + by the current node. + @return: whether the caller has to complete it's parent's incomplete triples + @rtype: Boolean + """ + def header_check(p_obj) : + """Special disposition for the HTML <head> and <body> elements...""" + if state.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] : + if node.nodeName == "head" or node.nodeName == "body" : + if not has_one_of_attributes(node, "about", "resource", "src", "href") : + return p_obj + else : + return None + + # Update the state. This means, for example, the possible local settings of + # namespaces and lang + state = None + state = ExecutionContext(node, graph, inherited_state=incoming_state) + + #--------------------------------------------------------------------------------- + # Handling the role attribute is pretty much orthogonal to everything else... + handle_role_attribute(node, graph, state) + + #--------------------------------------------------------------------------------- + # Handle the special case for embedded RDF, eg, in SVG1.2. + # This may add some triples to the target graph that does not originate from RDFa parsing + # If the function return TRUE, that means that an rdf:RDF has been found. No + # RDFa parsing should be done on that subtree, so we simply return... 
+
+    if state.options.embedded_rdf and node.nodeType == node.ELEMENT_NODE and handle_embeddedRDF(node, graph, state) :
+        return
+
+    #---------------------------------------------------------------------------------
+    # calling the host language specific massaging of the DOM
+    if state.options.host_language in host_dom_transforms and node.nodeType == node.ELEMENT_NODE :
+        for func in host_dom_transforms[state.options.host_language] : func(node, state)
+
+    #---------------------------------------------------------------------------------
+    # First, let us check whether there is anything to do at all. Ie,
+    # whether there is any relevant RDFa specific attribute on the element
+    #
+    if not has_one_of_attributes(node, "href", "resource", "about", "property", "rel", "rev", "typeof", "src", "vocab", "prefix") :
+        # nop, there is nothing to do here, just go down the tree and return...
+        for n in node.childNodes :
+            if n.nodeType == node.ELEMENT_NODE : parse_one_node(n, graph, parent_object, state, parent_incomplete_triples)
+        return
+
+    #-----------------------------------------------------------------
+    # The goal is to establish the subject and object for local processing
+    # The behaviour is slightly different depending on the presence or not
+    # of the @rel/@rev attributes
+    current_subject = None
+    current_object  = None
+    typed_resource  = None
+
+    if has_one_of_attributes(node, "rel", "rev") :
+        # in this case there is the notion of 'left' and 'right' of @rel/@rev
+        # in establishing the new Subject and the objectResource
+        current_subject = header_check(parent_object)
+
+        # set first the subject
+        if node.hasAttribute("about") :
+            current_subject = state.getURI("about")
+            if node.hasAttribute("typeof") : typed_resource = current_subject
+
+        # get_URI may return None in case of an illegal CURIE, so
+        # we have to be careful here, not use only an 'else'
+        if current_subject == None :
+            current_subject = parent_object
+        else :
+            state.reset_list_mapping(origin = current_subject)
+
+        # set the object resource
+        current_object = state.getResource("resource", "href", "src")
+
+        if node.hasAttribute("typeof") and not node.hasAttribute("about") :
+            if current_object == None :
+                current_object = BNode()
+            typed_resource = current_object
+
+        if not node.hasAttribute("inlist") and current_object != None :
+            # In this case the newly defined object is, in fact, the head of the list
+            # just reset the whole thing.
+            state.reset_list_mapping(origin = current_object)
+
+    elif node.hasAttribute("property") and not has_one_of_attributes(node, "content", "datatype") :
+        current_subject = header_check(parent_object)
+
+        # this is the case when the property may take hold of @src and friends...
+ if node.hasAttribute("about") : + current_subject = state.getURI("about") + if node.hasAttribute("typeof") : typed_resource = current_subject + + # getURI may return None in case of an illegal CURIE, so + # we have to be careful here, not use only an 'else' + if current_subject == None : + current_subject = parent_object + else : + state.reset_list_mapping(origin = current_subject) + + if typed_resource == None and node.hasAttribute("typeof") : + typed_resource = state.getResource("resource", "href", "src") + if typed_resource == None : + typed_resource = BNode() + current_object = typed_resource + else : + current_object = current_subject + + else : + current_subject = header_check(parent_object) + + # in this case all the various 'resource' setting attributes + # behave identically, though they also have their own priority + if current_subject == None : + current_subject = state.getResource("about", "resource", "href", "src") + + # get_URI_ref may return None in case of an illegal CURIE, so + # we have to be careful here, not use only an 'else' + if current_subject == None : + if node.hasAttribute("typeof") : + current_subject = BNode() + state.reset_list_mapping(origin = current_subject) + else : + current_subject = parent_object + else : + state.reset_list_mapping(origin = current_subject) + + # in this case no non-literal triples will be generated, so the + # only role of the current_object Resource is to be transferred to + # the children node + current_object = current_subject + if node.hasAttribute("typeof") : typed_resource = current_subject + + # --------------------------------------------------------------------- + ## The possible typeof indicates a number of type statements on the typed resource + for defined_type in state.getURI("typeof") : + if typed_resource : + graph.add((typed_resource, ns_rdf["type"], defined_type)) + + # --------------------------------------------------------------------- + # In case of @rel/@rev, either triples or incomplete triples are generated + # the (possible) incomplete triples are collected, to be forwarded to the children + incomplete_triples = [] + for prop in state.getURI("rel") : + if not isinstance(prop,BNode) : + if node.hasAttribute("inlist") : + if current_object != None : + # Add the content to the list. Note that if the same list + # was initialized, at some point, by a None, it will be + # overwritten by this real content + state.add_to_list_mapping(prop, current_object) + else : + # Add a dummy entry to the list... Note that + # if that list was initialized already with a real content + # this call will have no effect + state.add_to_list_mapping(prop, None) + + # Add a placeholder into the hanging rels + incomplete_triples.append( (None, prop, None) ) + else : + theTriple = (current_subject, prop, current_object) + if current_object != None : + graph.add(theTriple) + else : + incomplete_triples.append(theTriple) + else : + state.options.add_warning(err_no_blank_node % "rel", warning_type=IncorrectBlankNodeUsage, node=node.nodeName) + + for prop in state.getURI("rev") : + if not isinstance(prop,BNode) : + theTriple = (current_object,prop,current_subject) + if current_object != None : + graph.add(theTriple) + else : + incomplete_triples.append(theTriple) + else : + state.options.add_warning(err_no_blank_node % "rev", warning_type=IncorrectBlankNodeUsage, node=node.nodeName) + + # ---------------------------------------------------------------------- + # Generation of the @property values, including literals. 
The newSubject is the subject + # A particularity of property is that it stops the parsing down the DOM tree if an XML Literal is generated, + # because everything down there is part of the generated literal. + if node.hasAttribute("property") : + ProcessProperty(node, graph, current_subject, state, typed_resource).generate_1_1() + + # ---------------------------------------------------------------------- + # Setting the current object to a bnode is setting up a possible resource + # for the incomplete triples downwards + if current_object == None : + object_to_children = BNode() + else : + object_to_children = current_object + + #----------------------------------------------------------------------- + # Here is the recursion step for all the children + for n in node.childNodes : + if n.nodeType == node.ELEMENT_NODE : + _parse_1_1(n, graph, object_to_children, state, incomplete_triples) + + # --------------------------------------------------------------------- + # At this point, the parent's incomplete triples may be completed + for (s,p,o) in parent_incomplete_triples : + if s == None and o == None : + # This is an encoded version of a hanging rel for a collection: + incoming_state.add_to_list_mapping( p, current_subject ) + else : + if s == None : s = current_subject + if o == None : o = current_subject + graph.add((s,p,o)) + + # Generate the lists, if any and if this is the level where a new list was originally created + if state.new_list and not state.list_empty() : + for prop in state.get_list_props() : + vals = state.get_list_value(prop) + if vals == None : + # This was an empty list, in fact, ie, the list has been initiated by a <xxx rel="prop" inlist> + # but no list content has ever been added + graph.add( (state.get_list_origin(), prop, ns_rdf["nil"]) ) + else : + heads = [ BNode() for r in vals ] + [ ns_rdf["nil"] ] + for i in range(0, len(vals)) : + graph.add( (heads[i], ns_rdf["first"], vals[i]) ) + graph.add( (heads[i], ns_rdf["rest"], heads[i+1]) ) + # Anchor the list + graph.add( (state.get_list_origin(), prop, heads[0]) ) + + # ------------------------------------------------------------------- + # This should be it... + # ------------------------------------------------------------------- + return + + +################################################################################################################## +def _parse_1_0(node, graph, parent_object, incoming_state, parent_incomplete_triples) : + """The (recursive) step of handling a single node. See the + U{RDFa 1.0 syntax document<http://www.w3.org/TR/rdfa-syntax>} for further details. + + This is the RDFa 1.0 version. + + @param node: the DOM node to handle + @param graph: the RDF graph + @type graph: RDFLib's Graph object instance + @param parent_object: the parent's object, as an RDFLib URIRef + @param incoming_state: the inherited state (namespaces, lang, etc.) + @type incoming_state: L{state.ExecutionContext} + @param parent_incomplete_triples: list of hanging triples (the missing resource set to None) to be handled (or not) + by the current node. + @return: whether the caller has to complete it's parent's incomplete triples + @rtype: Boolean + """ + + # Update the state. This means, for example, the possible local settings of + # namespaces and lang + state = None + state = ExecutionContext(node, graph, inherited_state=incoming_state) + + #--------------------------------------------------------------------------------- + # Handling the role attribute is pretty much orthogonal to everything else... 
+
+    handle_role_attribute(node, graph, state)
+
+    #---------------------------------------------------------------------------------
+    # Handle the special case for embedded RDF, eg, in SVG1.2.
+    # This may add some triples to the target graph that do not originate from RDFa parsing
+    # If the function returns TRUE, that means that an rdf:RDF has been found. No
+    # RDFa parsing should be done on that subtree, so we simply return...
+    if state.options.embedded_rdf and node.nodeType == node.ELEMENT_NODE and handle_embeddedRDF(node, graph, state) :
+        return
+
+    #---------------------------------------------------------------------------------
+    # calling the host language specific massaging of the DOM
+    if state.options.host_language in host_dom_transforms and node.nodeType == node.ELEMENT_NODE :
+        for func in host_dom_transforms[state.options.host_language] : func(node, state)
+
+    #---------------------------------------------------------------------------------
+    # First, let us check whether there is anything to do at all. Ie,
+    # whether there is any relevant RDFa specific attribute on the element
+    #
+    if not has_one_of_attributes(node, "href", "resource", "about", "property", "rel", "rev", "typeof", "src") :
+        # nop, there is nothing to do here, just go down the tree and return...
+        for n in node.childNodes :
+            if n.nodeType == node.ELEMENT_NODE : parse_one_node(n, graph, parent_object, state, parent_incomplete_triples)
+        return
+
+    #-----------------------------------------------------------------
+    # The goal is to establish the subject and object for local processing
+    # The behaviour is slightly different depending on the presence or not
+    # of the @rel/@rev attributes
+    current_subject = None
+    current_object  = None
+    prop_object     = None
+
+    if has_one_of_attributes(node, "rel", "rev") :
+        # in this case there is the notion of 'left' and 'right' of @rel/@rev
+        # in establishing the new Subject and the objectResource
+        current_subject = state.getResource("about","src")
+
+        # get_URI may return None in case of an illegal CURIE, so
+        # we have to be careful here, not use only an 'else'
+        if current_subject == None :
+            if node.hasAttribute("typeof") :
+                current_subject = BNode()
+            else :
+                current_subject = parent_object
+        else :
+            state.reset_list_mapping(origin = current_subject)
+
+        # set the object resource
+        current_object = state.getResource("resource", "href")
+
+    else :
+        # in this case all the various 'resource' setting attributes
+        # behave identically, though they also have their own priority
+        current_subject = state.getResource("about", "src", "resource", "href")
+
+        # get_URI_ref may return None in case of an illegal CURIE, so
+        # we have to be careful here, not use only an 'else'
+        if current_subject == None :
+            if node.hasAttribute("typeof") :
+                current_subject = BNode()
+            else :
+                current_subject = parent_object
+        else :
+            state.reset_list_mapping(origin = current_subject)
+
+        # in this case no non-literal triples will be generated, so the
+        # only role of the current_object Resource is to be transferred to
+        # the children node
+        current_object = current_subject
+
+    # ---------------------------------------------------------------------
+    ## The possible typeof indicates a number of type statements on the new Subject
+    for defined_type in state.getURI("typeof") :
+        graph.add((current_subject, ns_rdf["type"], defined_type))
+
+    # ---------------------------------------------------------------------
+    # In case of @rel/@rev, either
triples or incomplete triples are generated + # the (possible) incomplete triples are collected, to be forwarded to the children + incomplete_triples = [] + for prop in state.getURI("rel") : + if not isinstance(prop,BNode) : + theTriple = (current_subject, prop, current_object) + if current_object != None : + graph.add(theTriple) + else : + incomplete_triples.append(theTriple) + else : + state.options.add_warning(err_no_blank_node % "rel", warning_type=IncorrectBlankNodeUsage, node=node.nodeName) + + for prop in state.getURI("rev") : + if not isinstance(prop,BNode) : + theTriple = (current_object,prop,current_subject) + if current_object != None : + graph.add(theTriple) + else : + incomplete_triples.append(theTriple) + else : + state.options.add_warning(err_no_blank_node % "rev", warning_type=IncorrectBlankNodeUsage, node=node.nodeName) + + # ---------------------------------------------------------------------- + # Generation of the literal values. The newSubject is the subject + # A particularity of property is that it stops the parsing down the DOM tree if an XML Literal is generated, + # because everything down there is part of the generated literal. + if node.hasAttribute("property") : + ProcessProperty(node, graph, current_subject, state).generate_1_0() + + # ---------------------------------------------------------------------- + # Setting the current object to a bnode is setting up a possible resource + # for the incomplete triples downwards + if current_object == None : + object_to_children = BNode() + else : + object_to_children = current_object + + #----------------------------------------------------------------------- + # Here is the recursion step for all the children + for n in node.childNodes : + if n.nodeType == node.ELEMENT_NODE : + _parse_1_0(n, graph, object_to_children, state, incomplete_triples) + + # --------------------------------------------------------------------- + # At this point, the parent's incomplete triples may be completed + for (s,p,o) in parent_incomplete_triples : + if s == None and o == None : + # This is an encoded version of a hanging rel for a collection: + incoming_state.add_to_list_mapping( p, current_subject ) + else : + if s == None : s = current_subject + if o == None : o = current_subject + graph.add((s,p,o)) + + # ------------------------------------------------------------------- + # This should be it... + # ------------------------------------------------------------------- + return + + +####################################################################### +# Handle the role attribute +def handle_role_attribute(node, graph, state) : + """ + Handling the role attribute, according to http://www.w3.org/TR/role-attribute/#using-role-in-conjunction-with-rdfa + @param node: the DOM node to handle + @param graph: the RDF graph + @type graph: RDFLib's Graph object instance + @param state: the inherited state (namespaces, lang, etc.) 
+ @type state: L{state.ExecutionContext} + """ + if node.hasAttribute("role") : + if node.hasAttribute("id") : + id = node.getAttribute("id").strip() + subject = URIRef(state.base + '#' + id) + else : + subject = BNode() + predicate = URIRef('http://www.w3.org/1999/xhtml/vocab#role') + for object in state.getURI("role") : + graph.add((subject, predicate, object)) + + + + + + + + + + + diff --git a/rdflib/plugins/parsers/pyRdfa/property.py b/rdflib/plugins/parsers/pyRdfa/property.py new file mode 100644 index 00000000..716c28a8 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/property.py @@ -0,0 +1,295 @@ +# -*- coding: utf-8 -*- +""" +Implementation of the C{@property} value handling. + +RDFa 1.0 and RDFa 1.1 are fairly different. RDFa 1.0 generates only literals, see +U{RDFa Task Force's wiki page<http://www.w3.org/2006/07/SWD/wiki/RDFa/LiteralObject>} for the details. +On the other hand, RDFa 1.1, beyond literals, can also generate URI references. Hence the duplicate method in the L{ProcessProperty} class, one for RDFa 1.0 and the other for RDFa 1.1. + +@summary: RDFa Literal generation +@requires: U{RDFLib package<http://rdflib.net>} +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} +""" + +""" +$Id: property.py,v 1.11 2012/06/12 11:47:11 ivan Exp $ +$Date: 2012/06/12 11:47:11 $ +""" + +import re + +import rdflib +from rdflib import BNode +from rdflib import Literal, URIRef, Namespace +if rdflib.__version__ >= "3.0.0" : + from rdflib import RDF as ns_rdf + from rdflib.term import XSDToPython +else : + from rdflib.RDF import RDFNS as ns_rdf + from rdflib.Literal import XSDToPython + +from pyRdfa import IncorrectBlankNodeUsage, IncorrectLiteral, err_no_blank_node, ns_xsd +from pyRdfa.utils import has_one_of_attributes, return_XML +from pyRdfa.host.html5 import handled_time_types + +XMLLiteral = ns_rdf["XMLLiteral"] +HTMLLiteral = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML") + +class ProcessProperty : + """Generate the value for C{@property} taking into account datatype, etc. + Note: this class is created only if the C{@property} is indeed present, no need to check. + + @ivar node: DOM element node + @ivar graph: the (RDF) graph to add the properies to + @ivar subject: the RDFLib URIRef serving as a subject for the generated triples + @ivar state: the current state to be used for the CURIE-s + @type state: L{state.ExecutionContext} + @ivar typed_resource: Typically the bnode generated by a @typeof + """ + def __init__(self, node, graph, subject, state, typed_resource = None) : + """ + @param node: DOM element node + @param graph: the (RDF) graph to add the properies to + @param subject: the RDFLib URIRef serving as a subject for the generated triples + @param state: the current state to be used for the CURIE-s + @param state: L{state.ExecutionContext} + @param typed_resource: Typically the bnode generated by a @typeof; in RDFa 1.1, that becomes the object for C{@property} + """ + self.node = node + self.graph = graph + self.subject = subject + self.state = state + self.typed_resource = typed_resource + + def generate(self) : + """ + Common entry point for the RDFa 1.0 and RDFa 1.1 versions; bifurcates based on the RDFa version, as retrieved from the state object. 
+ """ + if self.state.rdfa_version >= "1.1" : + self.generate_1_1() + else : + self.generate_1_0() + + def generate_1_1(self) : + """Generate the property object, 1.1 version""" + + ######################################################################### + # See if the target is _not_ a literal + irirefs = ("resource", "href", "src") + noiri = ("content", "datatype", "rel", "rev") + notypediri = ("content", "datatype", "rel", "rev", "about", "about_pruned") + if has_one_of_attributes(self.node, irirefs) and not has_one_of_attributes(self.node, noiri) : + # @href/@resource/@src takes the lead here... + object = self.state.getResource(irirefs) + elif self.node.hasAttribute("typeof") and not has_one_of_attributes(self.node, notypediri) and self.typed_resource != None : + # a @typeof creates a special branch in case the typed resource was set during parsing + object = self.typed_resource + else : + # We have to generate a literal + + # Get, if exists, the value of @datatype + datatype = '' + dtset = False + if self.node.hasAttribute("datatype") : + dtset = True + dt = self.node.getAttribute("datatype") + if dt != "" : + datatype = self.state.getURI("datatype") + + # Supress lange is set in case some elements explicitly want to supress the effect of language + # There were discussions, for example, that the <time> element should do so. Although, + # after all, this was reversed, the functionality is kept in the code in case another + # element might need it... + if self.state.lang != None and self.state.supress_lang == False : + lang = self.state.lang + else : + lang = '' + + # The simple case: separate @content attribute + if self.node.hasAttribute("content") : + val = self.node.getAttribute("content") + # Handling the automatic uri conversion case + if dtset == False : + object = Literal(val, lang=lang) + else : + object = self._create_Literal(val, datatype=datatype, lang=lang) + # The value of datatype has been set, and the keyword paramaters take care of the rest + else : + # see if there *is* a datatype (even if it is empty!) + if dtset : + if datatype == XMLLiteral : + object = Literal(self._get_XML_literal(self.node), datatype=XMLLiteral) + elif datatype == HTMLLiteral : + object = Literal(self._get_HTML_literal(self.node), datatype=HTMLLiteral) + else : + object = self._create_Literal(self._get_literal(self.node), datatype=datatype, lang=lang) + else : + object = self._create_Literal(self._get_literal(self.node), lang=lang) + + if object != None : + for prop in self.state.getURI("property") : + if not isinstance(prop, BNode) : + if self.node.hasAttribute("inlist") : + self.state.add_to_list_mapping(prop, object) + else : + self.graph.add( (self.subject, prop, object) ) + else : + self.state.options.add_warning(err_no_blank_node % "property", warning_type=IncorrectBlankNodeUsage, node=self.node.nodeName) + + # return + + def generate_1_0(self) : + """Generate the property object, 1.0 version""" + + ######################################################################### + # We have to generate a literal indeed. 
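+        # (Recap of the 1.0 rules implemented below: @content wins if present; an explicit
+        #  @datatype yields a typed literal, with rdf:XMLLiteral serializing the element's
+        #  children; without @datatype, element children force an XML Literal, otherwise a
+        #  plain literal with the in-scope language is generated.)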
+        # Get, if exists, the value of @datatype
+        datatype = ''
+        dtset    = False
+        if self.node.hasAttribute("datatype") :
+            dtset = True
+            dt = self.node.getAttribute("datatype")
+            if dt != "" :
+                datatype = self.state.getURI("datatype")
+
+        if self.state.lang != None :
+            lang = self.state.lang
+        else :
+            lang = ''
+
+        # The simple case: separate @content attribute
+        if self.node.hasAttribute("content") :
+            val = self.node.getAttribute("content")
+            # Handling the automatic uri conversion case
+            if dtset == False :
+                object = Literal(val, lang=lang)
+            else :
+                object = self._create_Literal(val, datatype=datatype, lang=lang)
+            # The value of datatype has been set, and the keyword parameters take care of the rest
+        else :
+            # see if there *is* a datatype (even if it is empty!)
+            if dtset :
+                # yep. The Literal content is the pure text part of the current element:
+                # We have to check whether the specified datatype is, in fact, an
+                # explicit XML Literal
+                if datatype == XMLLiteral :
+                    object = Literal(self._get_XML_literal(self.node), datatype=XMLLiteral)
+                elif datatype == HTMLLiteral :
+                    object = Literal(self._get_HTML_literal(self.node), datatype=HTMLLiteral)
+                else :
+                    object = self._create_Literal(self._get_literal(self.node), datatype=datatype, lang=lang)
+            else :
+                # no controlling @datatype. We have to see if there is markup in the contained
+                # element
+                if True in [ n.nodeType == self.node.ELEMENT_NODE for n in self.node.childNodes ] :
+                    # yep, an XML Literal should be generated
+                    object = self._create_Literal(self._get_XML_literal(self.node), datatype=XMLLiteral)
+                else :
+                    # At this point, there might be entities in the string that are returned as real characters by the dom
+                    # implementation. Those should be turned back into entities
+                    object = self._create_Literal(self._get_literal(self.node), lang=lang)
+
+        for prop in self.state.getURI("property") :
+            if not isinstance(prop, BNode) :
+                self.graph.add( (self.subject, prop, object) )
+            else :
+                self.state.options.add_warning(err_no_blank_node % "property", warning_type=IncorrectBlankNodeUsage, node=self.node.nodeName)
+
+        # return
+
+    ######################################################################################################################################
+
+    def _putBackEntities(self, str) :
+        """Put 'back' entities for the '&', '<', and '>' characters, to produce a proper XML string.
+        Used by the XML Literal extraction.
+        @param str: string to be converted
+        @return: string with entities
+        @rtype: string
+        """
+        return str.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
+
+    def _get_literal(self, Pnode):
+        """
+        Get (recursively) the full text from a DOM Node.
+
+        @param Pnode: DOM Node
+        @return: string
+        """
+        rc = ""
+        for node in Pnode.childNodes:
+            if node.nodeType == node.TEXT_NODE:
+                rc = rc + node.data
+            elif node.nodeType == node.ELEMENT_NODE :
+                rc = rc + self._get_literal(node)
+
+        # The decision of the group in February 2008 is not to normalize the result by default.
+        # This is reflected in the default value of the option
+
+        if self.state.options.space_preserve :
+            return rc
+        else :
+            return re.sub(r'(\r| |\n|\t)+'," ",rc).strip()
+    # end _get_literal
+
+    def _get_XML_literal(self, Pnode) :
+        """
+        Get (recursively) the XML Literal content of a DOM Node.
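+        (As an illustrative sketch: a text node containing C{a & b} is returned as
+        C{a &amp; b}, the entities being put back by L{_putBackEntities}.)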
+
+        @param Pnode: DOM Node
+        @return: string
+        """
+        rc = ""
+        for node in Pnode.childNodes:
+            if node.nodeType == node.TEXT_NODE:
+                rc = rc + self._putBackEntities(node.data)
+            elif node.nodeType == node.ELEMENT_NODE :
+                rc = rc + return_XML(self.state, node, base = False)
+        return rc
+    # end _get_XML_literal
+
+    def _get_HTML_literal(self, Pnode) :
+        """
+        Get (recursively) the HTML Literal content of a DOM Node.
+
+        @param Pnode: DOM Node
+        @return: string
+        """
+        rc = ""
+        for node in Pnode.childNodes:
+            if node.nodeType == node.TEXT_NODE:
+                rc = rc + self._putBackEntities(node.data)
+            elif node.nodeType == node.ELEMENT_NODE :
+                rc = rc + return_XML(self.state, node, base = False, xmlns = False )
+        return rc
+    # end _get_HTML_literal
+
+    def _create_Literal(self, val, datatype = '', lang = '') :
+        """
+        Create a literal, taking into account the datatype and language.
+        @return: Literal
+        """
+        if datatype == None or datatype == '' :
+            return Literal(val, lang=lang)
+        #elif datatype == ns_xsd["string"] :
+        #    return Literal(val)
+        else :
+            # This is a bit convoluted... the default setup of rdflib does not gracefully react if the
+            # datatype cannot properly be converted to Python. I have to copy and reuse some of the
+            # rdflib code to get this working...
+            # To make things worse: rdflib 3.1.0 does not handle the various xsd date types properly, ie,
+            # the conversion function below will generate errors. Ie, the check should be skipped for those
+            if ("%s" % datatype) in handled_time_types and rdflib.__version__ < "3.2.0" :
+                convFunc = False
+            else :
+                convFunc = XSDToPython.get(datatype, None)
+            if convFunc :
+                try :
+                    pv = convFunc(val)
+                    # If we get here, the literal value and its datatype match
+                except :
+                    self.state.options.add_warning("Incompatible value (%s) and datatype (%s) in Literal definition." % (val, datatype), warning_type=IncorrectLiteral, node=self.node.nodeName)
+            return Literal(val, datatype=datatype)
diff --git a/rdflib/plugins/parsers/pyRdfa/rdfs/__init__.py b/rdflib/plugins/parsers/pyRdfa/rdfs/__init__.py
new file mode 100644
index 00000000..ec0731e2
--- /dev/null
+++ b/rdflib/plugins/parsers/pyRdfa/rdfs/__init__.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+"""
+Separate module to handle vocabulary expansions. The L{cache} module takes care of caching vocabulary graphs; the L{process}
+module takes care of the expansion itself.
+ +@organization: U{World Wide Web Consortium<http://www.w3.org>} +@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">} +@license: This software is available for use under the +U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">} + +""" + +""" +$Id: __init__.py,v 1.4 2012/08/20 13:15:28 ivan Exp $ $Date: 2012/08/20 13:15:28 $ + +""" + +import sys +import os + +import rdflib +from rdflib import URIRef +from rdflib import Literal +from rdflib import BNode +from rdflib import Namespace +if rdflib.__version__ >= "3.0.0" : + from rdflib import RDF as ns_rdf + from rdflib import RDFS as ns_rdfs + from rdflib import Graph +else : + from rdflib.RDFS import RDFSNS as ns_rdfs + from rdflib.RDF import RDFNS as ns_rdf + from rdflib.Graph import Graph + +from pyRdfa import RDFaError, pyRdfaError +from pyRdfa import ns_rdfa, ns_xsd, ns_distill + +VocabCachingInfo = ns_distill["VocabCachingInfo"] + + +# Error message texts + +err_outdated_cache = "Vocab document <%s> could not be dereferenced; using possibly outdated cache" +err_unreachable_vocab = "Vocab document <%s> could not be dereferenced" +err_unparsable_Turtle_vocab = "Could not parse vocab in Turtle at <%s> (%s)" +err_unparsable_xml_vocab = "Could not parse vocab in RDF/XML at <%s> (%s)" +err_unparsable_ntriples_vocab = "Could not parse vocab in N-Triple at <%s> (%s)" +err_unparsable_rdfa_vocab = "Could not parse vocab in RDFa at <%s> (%s)" +err_unrecognised_vocab_type = "Unrecognized media type for the vocab file <%s>: '%s'" diff --git a/rdflib/plugins/parsers/pyRdfa/rdfs/cache.py b/rdflib/plugins/parsers/pyRdfa/rdfs/cache.py new file mode 100644 index 00000000..778c424e --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/rdfs/cache.py @@ -0,0 +1,395 @@ +# -*- coding: utf-8 -*- +""" +Managing Vocab Caching. 
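+
+The main entry point is the L{CachedVocab} class; a minimal sketch of its use (the variable
+names are illustrative only, and C{options} is assumed to be an L{Options<pyRdfa.options.Options>} instance)::
+    vocab_graph = CachedVocab("http://example.org/vocab", options).graph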
+
+@summary: RDFa vocabulary cache management
+@requires: U{RDFLib<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+"""
+import os, sys, datetime, re
+
+PY3 = (sys.version_info[0] >= 3)
+
+import rdflib
+from rdflib import URIRef
+from rdflib import Literal
+from rdflib import BNode
+from rdflib import Namespace
+if rdflib.__version__ >= "3.0.0" :
+    from rdflib import RDF as ns_rdf
+    from rdflib import RDFS as ns_rdfs
+    from rdflib import Graph
+else :
+    from rdflib.RDFS import RDFSNS as ns_rdfs
+    from rdflib.RDF import RDFNS as ns_rdf
+    from rdflib.Graph import Graph
+
+from pyRdfa import HTTPError, RDFaError
+from pyRdfa.host import MediaTypes, HostLanguage
+from pyRdfa.utils import create_file_name, URIOpener, quote_URI
+from pyRdfa.options import Options
+from pyRdfa import ns_rdfa
+
+from pyRdfa.rdfs import err_outdated_cache
+from pyRdfa.rdfs import err_unreachable_vocab
+from pyRdfa.rdfs import err_unparsable_Turtle_vocab
+from pyRdfa.rdfs import err_unparsable_xml_vocab
+from pyRdfa.rdfs import err_unparsable_ntriples_vocab
+from pyRdfa.rdfs import err_unparsable_rdfa_vocab
+from pyRdfa.rdfs import err_unrecognised_vocab_type
+
+from pyRdfa.rdfs import VocabCachingInfo
+
+# Regular expression object for a general XML application media type
+xml_application_media_type = re.compile("application/[a-zA-Z0-9]+\+xml")
+
+#===========================================================================================
+if PY3 :
+    import pickle
+else :
+    import cPickle as pickle
+
+# Protocol to be used for pickle files. 0 is good for debug, it stores the data in ASCII; 1 is better for deployment,
+# it stores data in binary format. Care should be taken for consistency; when changing from 0 to 1 or back, all
+# cached data should be removed/regenerated, otherwise mess may occur...
+_Pickle_Protocol = 1
+
+# If I could rely on python 2.5 or 2.6 (or higher) I could use the with...as... idiom for what is below, it
+# is indeed nicer. But I cannot...
+def _load(fname) :
+    """
+    Load a cached file and return the resulting object
+    @param fname: file name
+    """
+    # the file is opened in binary mode, matching the (binary) pickle protocol used in _dump
+    f = open(fname, "rb")
+    try :
+        return pickle.load(f)
+    finally :
+        f.close()
+
+def _dump(obj, fname) :
+    """
+    Dump an object into cached file
+    @param obj: Python object to store
+    @param fname: file name
+    """
+    f = open(fname, "wb")
+    try :
+        pickle.dump(obj, f, _Pickle_Protocol)
+        f.flush()
+    finally :
+        f.close()
+
+#===========================================================================================
+class CachedVocabIndex :
+    """
+    Class to manage the cache index. Takes care of finding the vocab directory, and manages the index
+    to the individual vocab data.
+
+    The vocab directory is set to a platform specific area, unless an environment variable
+    sets it explicitly. The environment variable is "PyRdfaCacheDir".
+
+    Every time the index is changed, the index is put back (via pickle) to the directory.
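+
+    For example (an illustrative sketch), the cache area can be redirected before parsing
+    by setting that environment variable::
+        import os
+        os.environ["PyRdfaCacheDir"] = "/tmp/pyRdfa-cache"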
+
+    @ivar app_data_dir: directory for the vocabulary cache directory
+    @ivar index_fname: the full path of the index file on the disc
+    @ivar indeces: the in-memory version of the index (a dictionary mapping URI-s to tuples)
+    @ivar options: the error handler (option) object to send warnings to
+    @type options: L{options.Options}
+    @ivar report: whether details on the caching should be reported
+    @type report: Boolean
+    @cvar vocabs: File name used for the index in the cache directory
+    @cvar preference_path: Cache directories for the three major platforms (ie, mac, windows, unix)
+    @type preference_path: dictionary, keyed by "mac", "win", and "unix"
+    @cvar architectures: Various 'architectures' as returned by the python call, and their mapping on one of the major platforms. If an architecture is missing, it is considered to be "unix"
+    @type architectures: dictionary, mapping architectures to "mac", "win", or "unix"
+    """
+    # File Name used for the index in the cache directory
+    vocabs = "cache_index"
+    # Cache directories for the three major platforms...
+    preference_path = {
+        "mac"  : "Library/Application Support/pyRdfa-cache",
+        "win"  : "pyRdfa-cache",
+        "unix" : ".pyRdfa-cache"
+    }
+    # various architectures as returned by the python call, and their mapping on platform. If an architecture is not here, it is considered to be unix
+    architectures = {
+        "darwin" : "mac",
+        "nt"     : "win",
+        "win32"  : "win",
+        "cygwin" : "win"
+    }
+    def __init__(self, options = None) :
+        """
+        @param options: the error handler (option) object to send warnings to
+        @type options: L{options.Options}
+        """
+        self.options = options
+        self.report  = (options != None) and options.vocab_cache_report
+
+        # This is where the cache files should be
+        self.app_data_dir = self._give_preference_path()
+        self.index_fname  = os.path.join(self.app_data_dir, self.vocabs)
+        self.indeces      = {}
+
+        # Check whether that directory exists.
+        if not os.path.isdir(self.app_data_dir) :
+            try :
+                os.mkdir(self.app_data_dir)
+            except Exception :
+                (type,value,traceback) = sys.exc_info()
+                if self.report: options.add_info("Could not create the vocab cache area %s" % value, VocabCachingInfo)
+                return
+        else :
+            # check whether it is at least readable
+            if not os.access(self.app_data_dir, os.R_OK) :
+                if self.report: options.add_info("Vocab cache directory is not readable", VocabCachingInfo)
+                return
+            if not os.access(self.app_data_dir, os.W_OK) :
+                if self.report: options.add_info("Vocab cache directory is readable but not writeable", VocabCachingInfo)
+                return
+
+        if os.path.exists(self.index_fname) :
+            if os.access(self.index_fname, os.R_OK) :
+                self.indeces = _load(self.index_fname)
+            else :
+                if self.report: options.add_info("Vocab cache index not readable", VocabCachingInfo)
+        else :
+            # This is the very initial phase, creation of a new index
+            if os.access(self.app_data_dir, os.W_OK) :
+                # This is then put into a pickle file to put the stake in the ground...
+                try :
+                    _dump(self.indeces, self.index_fname)
+                except Exception :
+                    (type,value,traceback) = sys.exc_info()
+                    if self.report: options.add_info("Could not create the vocabulary index %s" % value, VocabCachingInfo)
+            else :
+                if self.report: options.add_info("Vocabulary cache directory is not writeable", VocabCachingInfo)
+                self.cache_writeable = False
+
+    def add_ref(self, uri, vocab_reference) :
+        """
+        Add a new entry to the index, possibly removing the previous one.
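+
+        E.g. (with illustrative values only)::
+            index.add_ref("http://example.org/vocab",
+                          ("file_name", creation_date, expiration_date))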
+ + @param uri: the URI that serves as a key in the index directory + @param vocab_reference: tuple consisting of file name, modification date, and expiration date + """ + # Store the index right away + self.indeces[uri] = vocab_reference + try : + _dump(self.indeces, self.index_fname) + except Exception : + (type,value,traceback) = sys.exc_info() + if self.report: self.options.add_info("Could not store the cache index %s" % value, VocabCachingInfo) + + def get_ref(self, uri) : + """ + Get an index entry, if available, None otherwise. + The return value is a tuple: file name, modification date, and expiration date + + @param uri: the URI that serves as a key in the index directory + """ + if uri in self.indeces : + return tuple(self.indeces[uri]) + else : + return None + + def _give_preference_path(self) : + """ + Find the vocab cache directory. + """ + from pyRdfa import CACHE_DIR_VAR + if CACHE_DIR_VAR in os.environ : + return os.environ[CACHE_DIR_VAR] + else : + # find the preference path on the architecture + platform = sys.platform + if platform in self.architectures : + system = self.architectures[platform] + else : + system = "unix" + + if system == "win" : + # there is a user variable set for that purpose + app_data = os.path.expandvars("%APPDATA%") + return os.path.join(app_data,self.preference_path[system]) + else : + return os.path.join(os.path.expanduser('~'),self.preference_path[system]) + +#=========================================================================================== +class CachedVocab(CachedVocabIndex) : + """ + Cache for a specific vocab. The content of the cache is the graph. These are also the data that are stored + on the disc (in pickled form) + + @ivar graph: the RDF graph + @ivar URI: vocabulary URI + @ivar filename: file name (not the complete path) of the cached version + @ivar creation_date: creation date of the cache + @type creation_date: datetime + @ivar expiration_date: expiration date of the cache + @type expiration_date: datetime + @cvar runtime_cache : a run time cache for already 'seen' vocabulary files. Apart from (marginally) speeding up processing, this also prevents recursion + @type runtime_cache : dictionary + """ + def __init__(self, URI, options = None) : + """ + @param URI: real URI for the vocabulary file + @param options: the error handler (option) object to send warnings to + @type options: L{options.Options} + """ + # First see if this particular vocab has been handled before. If yes, it is extracted and everything + # else can be forgotten. + self.uri = URI + (self.filename, self.creation_date, self.expiration_date) = ("",None,None) + self.graph = Graph() + + try : + CachedVocabIndex.__init__(self, options) + vocab_reference = self.get_ref(URI) + self.caching = True + except Exception : + # what this means is that the caching becomes impossible through some system error... 
+            (type,value,traceback) = sys.exc_info()
+            if self.report: options.add_info("Could not access the vocabulary cache area %s" % value, VocabCachingInfo, URI)
+            vocab_reference = None
+            self.caching    = False
+
+        if vocab_reference == None :
+            # This has never been cached before
+            if self.report: options.add_info("No cache exists for %s, generating one" % URI, VocabCachingInfo)
+
+            # Store all the cache data unless caching proves to be impossible
+            if self._get_vocab_data(newCache = True) and self.caching :
+                self.filename = create_file_name(self.uri)
+                self._store_caches()
+                if self.report:
+                    options.add_info("Generated a cache for %s, with an expiration date of %s" % (URI,self.expiration_date), VocabCachingInfo, URI)
+        else :
+            (self.filename, self.creation_date, self.expiration_date) = vocab_reference
+            if self.report: options.add_info("Found a cache for %s, expiring on %s" % (URI,self.expiration_date), VocabCachingInfo)
+            # Check whether the expiration date has passed
+            if options.refresh_vocab_cache == False and datetime.datetime.utcnow() <= self.expiration_date :
+                # We are fine, we can just extract the data from the cache and we're done
+                if self.report: options.add_info("Cache for %s is still valid; extracting the data" % URI, VocabCachingInfo)
+                fname = os.path.join(self.app_data_dir, self.filename)
+                try :
+                    self.graph = _load(fname)
+                except Exception :
+                    # what this means is that the stored cache data cannot be read
+                    (type,value,traceback) = sys.exc_info()
+                    sys.excepthook(type,value,traceback)
+                    if self.report: options.add_info("Could not access the vocab cache %s (%s)" % (value,fname), VocabCachingInfo, URI)
+            else :
+                if self.report :
+                    if options.refresh_vocab_cache == True :
+                        options.add_info("Time check is bypassed; refreshing the cache for %s" % URI, VocabCachingInfo)
+                    else :
+                        options.add_info("Cache timeout; refreshing the cache for %s" % URI, VocabCachingInfo)
+                # we have to refresh the graph
+                if self._get_vocab_data(newCache = False) == False :
+                    # bugger; the cache could not be refreshed, using the current one, and setting the cache artificially
+                    # to be valid for the coming hour, hoping that the access issues will be resolved by then...
+                    if self.report:
+                        options.add_info("Could not refresh vocabulary cache for %s, using the old cache, extended its expiration time by an hour (network problems?)" % URI, VocabCachingInfo, URI)
+                    fname = os.path.join(self.app_data_dir, self.filename)
+                    try :
+                        self.graph           = _load(fname)
+                        self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(hours=1)
+                    except Exception :
+                        # what this means is that the stored cache data cannot be read
+                        (type,value,traceback) = sys.exc_info()
+                        sys.excepthook(type,value,traceback)
+                        if self.report: options.add_info("Could not access the vocabulary cache %s (%s)" % (value,fname), VocabCachingInfo, URI)
+                self.creation_date = datetime.datetime.utcnow()
+                if self.report:
+                    options.add_info("Generated a new cache for %s, with an expiration date of %s" % (URI,self.expiration_date), VocabCachingInfo, URI)
+
+                self._store_caches()
+
+    def _get_vocab_data(self, newCache = True) :
+        """Just a macro-like function to get the data to be cached"""
+        from pyRdfa.rdfs.process import return_graph
+        (self.graph, self.expiration_date) = return_graph(self.uri, self.options, newCache)
+        return self.graph != None
+
+    def _store_caches(self) :
+        """Called if the creation date, etc, have been refreshed or are new;
+        all content must be put into a cache file
+        """
+        # Store the cached version of the vocabulary file
+        fname = os.path.join(self.app_data_dir, self.filename)
+        try :
+            _dump(self.graph, fname)
+        except Exception :
+            (type,value,traceback) = sys.exc_info()
+            if self.report : self.options.add_info("Could not write cache file %s (%s)" % (fname,value), VocabCachingInfo, self.uri)
+        # Update the index
+        self.add_ref(self.uri,(self.filename, self.creation_date, self.expiration_date))
+
+#########################################################################################################################################
+
+def offline_cache_generation(args) :
+    """Generate a cache for the vocabulary in args.
+
+    @param args: array of vocabulary URIs.
+    """
+    class LocalOption :
+        def __init__(self) :
+            self.vocab_cache_report = True
+
+        def pr(self, wae, txt, warning_type, context) :
+            print( "====" )
+            if warning_type != None : print( warning_type )
+            print( wae + ": " + txt )
+            if context != None: print( context )
+            print( "====" )
+
+        def add_warning(self, txt, warning_type=None, context=None) :
+            """Add a warning to the processor graph.
+            @param txt: the warning text.
+            @keyword warning_type: Warning Class
+            @type warning_type: URIRef
+            @keyword context: possible context to be added to the processor graph
+            @type context: URIRef or String
+            """
+            self.pr("Warning",txt,warning_type,context)
+
+        def add_info(self, txt, info_type=None, context=None) :
+            """Add an informational comment to the processor graph.
+            @param txt: the information text.
+            @keyword info_type: Info Class
+            @type info_type: URIRef
+            @keyword context: possible context to be added to the processor graph
+            @type context: URIRef or String
+            """
+            self.pr("Info",txt,info_type,context)
+
+        def add_error(self, txt, err_type=None, context=None) :
+            """Add an error to the processor graph.
+            @param txt: the error text.
+            @keyword err_type: Error Class
+            @type err_type: URIRef
+            @keyword context: possible context to be added to the processor graph
+            @type context: URIRef or String
+            """
+            self.pr("Error",txt,err_type,context)
+
+    for uri in args :
+        # This should write the cache
+        print( ">>>>> Writing Cache <<<<<" )
+        writ = CachedVocab(uri, options = LocalOption())
+        # Now read it back and print the content for tracing
+        print( ">>>>> Reading Cache <<<<<" )
+        rd = CachedVocab(uri, options = LocalOption())
+        print( "URI: " + uri )
+        print( "default vocab: " + rd.vocabulary )
+        print( "terms: %s prefixes: %s" % (rd.terms,rd.ns) )
+
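+# An illustrative (hypothetical) invocation, e.g., from a maintenance script:
+#
+#    offline_cache_generation(["http://www.w3.org/1999/xhtml/vocab#"])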
\ No newline at end of file
diff --git a/rdflib/plugins/parsers/pyRdfa/rdfs/process.py b/rdflib/plugins/parsers/pyRdfa/rdfs/process.py
new file mode 100644
index 00000000..4fbf1050
--- /dev/null
+++ b/rdflib/plugins/parsers/pyRdfa/rdfs/process.py
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+"""
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+
+"""
+
+"""
+$Id: process.py,v 1.7 2012-03-23 14:06:38 ivan Exp $ $Date: 2012-03-23 14:06:38 $
+
+"""
+
+import sys
+import os
+import re
+
+import rdflib
+from rdflib import URIRef
+from rdflib import Literal
+from rdflib import BNode
+from rdflib import Namespace
+if rdflib.__version__ >= "3.0.0" :
+    from rdflib import RDF as ns_rdf
+    from rdflib import RDFS as ns_rdfs
+    from rdflib import Graph
+else :
+    from rdflib.RDFS import RDFSNS as ns_rdfs
+    from rdflib.RDF import RDFNS as ns_rdf
+    from rdflib.Graph import Graph
+
+ns_owl = Namespace("http://www.w3.org/2002/07/owl#")
+
+from pyRdfa.host import MediaTypes
+
+from pyRdfa.utils import URIOpener
+
+from pyRdfa.rdfs import err_outdated_cache
+from pyRdfa.rdfs import err_unreachable_vocab
+from pyRdfa.rdfs import err_unparsable_Turtle_vocab
+from pyRdfa.rdfs import err_unparsable_xml_vocab
+from pyRdfa.rdfs import err_unparsable_ntriples_vocab
+from pyRdfa.rdfs import err_unparsable_rdfa_vocab
+from pyRdfa.rdfs import err_unrecognised_vocab_type
+
+from pyRdfa import VocabReferenceError
+
+from pyRdfa.rdfs.cache import CachedVocab
+from pyRdfa import HTTPError, RDFaError
+
+# Regular expression object for a general XML application media type
+xml_application_media_type = re.compile("application/[a-zA-Z0-9]+\+xml")
+
+#############################################################################################################
+
+def return_graph(uri, options, newCache = False) :
+    """Parse a file, and return an RDFLib Graph. The URI's content type is checked and either one of
+    RDFLib's parsers is invoked (for the Turtle, RDF/XML, and N Triple cases) or separate RDFa processing is invoked
+    on the RDFa content.
+
+    The Accept header of the HTTP request gives a preference to Turtle, followed by RDF/XML and then HTML (RDFa), in case content negotiation is used.
+
+    This function is used to retrieve the vocabulary file and turn it into an RDFLib graph.
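+
+    A minimal sketch of a call (the names are illustrative)::
+        (vocab_graph, expiration_date) = return_graph("http://example.org/vocab", options)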
+
+    @param uri: URI for the graph
+    @param options: used as a place where warnings can be sent
+    @param newCache: in case this is used with caching, whether a new cache is generated; that modifies the warning text
+    @return: A tuple consisting of an RDFLib Graph instance and an expiration date; (None,None) if the dereferencing or the parsing was unsuccessful
+    """
+    def return_to_cache(msg) :
+        if newCache :
+            options.add_warning(err_unreachable_vocab % uri, warning_type=VocabReferenceError)
+        else :
+            options.add_warning(err_outdated_cache % uri, warning_type=VocabReferenceError)
+
+    retval          = None
+    expiration_date = None
+    content         = None
+
+    try :
+        content = URIOpener(uri,
+                            {'Accept' : 'text/html;q=0.8, application/xhtml+xml;q=0.8, text/turtle;q=1.0, application/rdf+xml;q=0.9'})
+    except HTTPError :
+        (type,value,traceback) = sys.exc_info()
+        return_to_cache(value)
+        return (None,None)
+    except RDFaError :
+        (type,value,traceback) = sys.exc_info()
+        return_to_cache(value)
+        return (None,None)
+    except Exception :
+        (type,value,traceback) = sys.exc_info()
+        return_to_cache(value)
+        return (None,None)
+
+    # Store the expiration date of the newly accessed data
+    expiration_date = content.expiration_date
+
+    if content.content_type == MediaTypes.turtle :
+        try :
+            retval = Graph()
+            retval.parse(content.data, format="n3")
+        except :
+            (type,value,traceback) = sys.exc_info()
+            options.add_warning(err_unparsable_Turtle_vocab % (uri,value))
+    elif content.content_type == MediaTypes.rdfxml :
+        try :
+            retval = Graph()
+            retval.parse(content.data)
+        except :
+            (type,value,traceback) = sys.exc_info()
+            options.add_warning(err_unparsable_xml_vocab % (uri,value))
+    elif content.content_type == MediaTypes.nt :
+        try :
+            retval = Graph()
+            retval.parse(content.data, format="nt")
+        except :
+            (type,value,traceback) = sys.exc_info()
+            options.add_warning(err_unparsable_ntriples_vocab % (uri,value))
+    elif content.content_type in [MediaTypes.xhtml, MediaTypes.html, MediaTypes.xml] or xml_application_media_type.match(content.content_type) != None :
+        try :
+            from pyRdfa import pyRdfa
+            from pyRdfa.options import Options
+            # a separate Options instance is used for the recursive RDFa run, leaving the caller's options intact
+            rdfa_options = Options()
+            retval = pyRdfa(rdfa_options).graph_from_source(content.data)
+        except :
+            (type,value,traceback) = sys.exc_info()
+            options.add_warning(err_unparsable_rdfa_vocab % (uri,value))
+    else :
+        options.add_warning(err_unrecognised_vocab_type % (uri, content.content_type))
+
+    return (retval, expiration_date)
+
+############################################################################################
+type               = ns_rdf["type"]
+Property           = ns_rdf["Property"]
+Class              = ns_rdfs["Class"]
+subClassOf         = ns_rdfs["subClassOf"]
+subPropertyOf      = ns_rdfs["subPropertyOf"]
+equivalentProperty = ns_owl["equivalentProperty"]
+equivalentClass    = ns_owl["equivalentClass"]
+
+class MiniOWL :
+    """
+    Class implementing the simple OWL RL Reasoning required by RDFa in managing vocabulary files. This is done via
+    a forward chaining process (in the L{closure} method) using a few simple rules as defined by the RDF and the OWL Semantics
+    specifications.
+
+    @ivar graph: the graph that has to be expanded
+    @ivar added_triples: each cycle collects the triples that are to be added to the graph eventually.
+    @type added_triples: a set, to ensure the uniqueness of triples being added
+    """
+    def __init__(self, graph, schema_semantics = False) :
+        self.graph            = graph
+        self.added_triples    = None
+        self.schema_semantics = schema_semantics
+
+    def closure(self) :
+        """
+        Generate the closure of the graph. This is the real 'core'.
+
+        The processing rules store new triples via the L{separate method<store_triple>} which stores
+        them in the L{added_triples<added_triples>} array. If that array is empty at the end of a cycle,
+        it means that the whole process can be stopped.
+        """
+
+        # Go cyclically through all rules until no change happens
+        new_cycle = True
+        cycle_num = 0
+        while new_cycle :
+            # yes, there was a change, let us go again
+            cycle_num += 1
+
+            # go through all rules, and collect the replies (to see whether any change has been done)
+            # the new triples to be added are collected separately not to interfere with
+            # the current graph yet
+            self.added_triples = set()
+
+            # Execute all the rules; these might fill up the added triples array
+            for t in self.graph : self.rules(t)
+
+            # Add the triples to the graph (if necessary, that is). If any new triple has been generated, a new cycle
+            # will be necessary...
+            new_cycle = len(self.added_triples) > 0
+
+            for t in self.added_triples : self.graph.add(t)
+
+    def store_triple(self, t) :
+        """
+        In contrast to its name, this does not yet add anything to the graph itself, it just stores the triple in an
+        L{internal set<added_triples>}. (It is important for this to be a set: some of the rules in the various closures may
+        generate the same triples several times.) Before adding the triple to the set, the method checks whether
+        the triple is in the final graph already (if yes, it is not added to the set).
+
+        The set itself is emptied at the start of every processing cycle; the triples are then effectively added to the
+        graph at the end of such a cycle. If the set is
+        actually empty at that point, this means that the cycle has not added any new triple, and the full processing can stop.
+
+        @param t: the triple to be added to the graph, unless it is already there
+        @type t: a 3-element tuple of (s,p,o)
+        """
+        (s,p,o) = t
+        if t not in self.graph :
+            self.added_triples.add(t)
+
+    def rules(self, t) :
+        """
+        Go through the OWL-RL entailment rules prp-spo1, prp-eqp1, prp-eqp2, cax-sco, cax-eqc1, and cax-eqc2 by extending the graph.
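+        For instance (an illustrative sketch of rule prp-spo1): if the vocabulary graph
+        contains C{(ex:name, rdfs:subPropertyOf, foaf:name)} and the data contains
+        C{(ex:a, ex:name, "X")}, the rule stores the new triple C{(ex:a, foaf:name, "X")}.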
+        @param t: a triple (in the form of a tuple)
+        """
+        s,p,o = t
+        if self.schema_semantics :
+            # extra reasoning on the vocabulary only, to reduce the overall load by reducing the expected number of chaining cycles
+            if p == subPropertyOf :
+                for Z,Y,xxx in self.graph.triples((o, subPropertyOf, None)) :
+                    self.store_triple((s,subPropertyOf,xxx))
+            elif p == equivalentProperty :
+                for Z,Y,xxx in self.graph.triples((o, equivalentProperty, None)) :
+                    self.store_triple((s,equivalentProperty,xxx))
+                for xxx,Y,Z in self.graph.triples((None, equivalentProperty, s)) :
+                    self.store_triple((xxx,equivalentProperty,o))
+            elif p == subClassOf :
+                for Z,Y,xxx in self.graph.triples((o, subClassOf, None)) :
+                    self.store_triple((s,subClassOf,xxx))
+            elif p == equivalentClass :
+                for Z,Y,xxx in self.graph.triples((o, equivalentClass, None)) :
+                    self.store_triple((s,equivalentClass,xxx))
+                for xxx,Y,Z in self.graph.triples((None, equivalentClass, s)) :
+                    self.store_triple((xxx,equivalentClass,o))
+        else :
+            if p == subPropertyOf :
+                # prp-spo1
+                for zzz,Z,www in self.graph.triples((None, s, None)) :
+                    self.store_triple((zzz, o, www))
+            elif p == equivalentProperty :
+                # prp-eqp1
+                for zzz,Z,www in self.graph.triples((None, s, None)) :
+                    self.store_triple((zzz, o, www))
+                # prp-eqp2
+                for zzz,Z,www in self.graph.triples((None, o, None)) :
+                    self.store_triple((zzz, s, www))
+            elif p == subClassOf :
+                # cax-sco
+                for vvv,Y,Z in self.graph.triples((None, type, s)) :
+                    self.store_triple((vvv, type, o))
+            elif p == equivalentClass :
+                # cax-eqc1
+                for vvv,Y,Z in self.graph.triples((None, type, s)) :
+                    self.store_triple((vvv, type, o))
+                # cax-eqc2
+                for vvv,Y,Z in self.graph.triples((None, type, o)) :
+                    self.store_triple((vvv, type, s))
+
+########################################################################################################
+
+def process_rdfa_sem(graph, options) :
+    """
+    Expand the graph through the minimal RDFS and OWL rules defined for RDFa.
+
+    The expansion is done in several steps:
+     1. the vocabularies are retrieved from the incoming graph (there are RDFa triples generated for that)
+     2. all vocabularies are merged into a separate vocabulary graph
+     3. the RDFS/OWL expansion is done on the vocabulary graph, to take care of all the subproperty, subclass, etc, chains
+     4. the (expanded) vocabulary graph content is added to the incoming graph
+     5. the incoming graph is expanded
+     6. the triples appearing in the vocabulary graph are removed from the incoming graph, to avoid unnecessary extra triples from the data
+
+    @param graph: an RDFLib Graph instance, to be expanded
+    @param options: options as defined for the RDFa run; used to generate warnings
+    @type options: L{pyRdfa.Options}
+    """
+    # 1. collect the vocab URI-s
+    vocabs = set()
+    from pyRdfa import RDFA_VOCAB
+    for (s,p,v) in graph.triples((None,RDFA_VOCAB,None)) :
+        vocabs.add(str(v))
+
+    if len(vocabs) > 0 :
+        # 2. get all the vocab graphs
+        vocab_graph = Graph()
+        for uri in vocabs :
+            if options.vocab_cache :
+                v_graph = CachedVocab(uri, options).graph
+            else :
+                (v_graph, exp_date) = return_graph(uri, options)
+            if v_graph != None :
+                for t in v_graph :
+                    vocab_graph.add(t)
+
+        # 3. Get the closure of the vocab graph; this will take care of local subproperty, etc, statements
+        # Strictly speaking this is not necessary, but will speed up processing, because it may save chaining cycles on the
+        # real graph
+        MiniOWL(vocab_graph, schema_semantics = True).closure()
+
+        # 4. Now get the vocab graph content added to the default graph
+        for t in vocab_graph :
+            graph.add(t)
+
+        # 5. get the graph expanded through RDFS
+        MiniOWL(graph).closure()
+
+        # 6. clean up the graph by removing the schema triples
+        for t in vocab_graph : graph.remove(t)
+
+    # That was it...
+    return graph
+
diff --git a/rdflib/plugins/parsers/pyRdfa/state.py b/rdflib/plugins/parsers/pyRdfa/state.py
new file mode 100644
index 00000000..f17ef5ad
--- /dev/null
+++ b/rdflib/plugins/parsers/pyRdfa/state.py
@@ -0,0 +1,556 @@
+# -*- coding: utf-8 -*-
+"""
+Parser's execution context (a.k.a. state) object and handling. The state includes:
+
+  - language, retrieved from C{@xml:lang} or C{@lang}
+  - URI base, determined by C{<base>} or set explicitly. This is a little bit superfluous, because the current RDFa syntax does not make use of C{@xml:base}; i.e., this could be a global value. But the structure is prepared to add C{@xml:base} easily, if needed.
+  - options, in the form of an L{options<pyRdfa.options>} instance
+  - a separate vocabulary/CURIE handling resource, in the form of an L{termorcurie<pyRdfa.TermOrCurie>} instance
+
+The execution context object is also used to handle URI-s, CURIE-s, terms, etc.
+
+@summary: RDFa parser execution context
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+"""
+
+"""
+$Id: state.py,v 1.21 2012/08/20 14:14:14 ivan Exp $
+$Date: 2012/08/20 14:14:14 $
+"""
+import sys
+(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info
+
+import rdflib
+from rdflib import URIRef
+from rdflib import Literal
+from rdflib import BNode
+from rdflib import Namespace
+if rdflib.__version__ >= "3.0.0" :
+    from rdflib import RDF as ns_rdf
+    from rdflib import RDFS as ns_rdfs
+else :
+    from rdflib.RDFS import RDFSNS as ns_rdfs
+    from rdflib.RDF import RDFNS as ns_rdf
+
+from pyRdfa.options import Options
+from pyRdfa.utils import quote_URI
+from pyRdfa.host import HostLanguage, accept_xml_base, accept_xml_lang, beautifying_prefixes
+
+from pyRdfa.termorcurie import TermOrCurie
+from pyRdfa import UnresolvablePrefix, UnresolvableTerm
+
+from pyRdfa import err_lang
+from pyRdfa import err_URI_scheme
+from pyRdfa import err_illegal_safe_CURIE
+from pyRdfa import err_no_CURIE_in_safe_CURIE
+from pyRdfa import err_undefined_terms
+from pyRdfa import err_non_legal_CURIE_ref
+from pyRdfa import err_undefined_CURIE
+
+import re
+import random
+if py_v_major >= 3 :
+    from urllib.parse import urlparse, urlunparse, urlsplit, urljoin
+else :
+    from urlparse import urlparse, urlunparse, urlsplit, urljoin
+
+from types import *
+
+class ListStructure :
+    """Special class to handle the C{@inlist} type structures in RDFa 1.1; stores the "origin", i.e.,
+    where the list will be attached to, and the mappings as defined in the spec.
+    """
+    def __init__(self) :
+        self.mapping = {}
+        self.origin  = None
+
+#### Core Class definition
+class ExecutionContext :
+    """State at a specific node, including the current set of namespaces in the RDFLib sense, current language,
+    the base, vocabularies, etc. The class is also used to interpret URI-s and CURIE-s to produce
+    URI references for RDFLib.
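+
+    A rough sketch of the intended usage (hypothetical code; in reality the parser core creates
+    these instances while walking the DOM tree)::
+        state       = ExecutionContext(node, graph, base="http://example.org/")
+        child_state = ExecutionContext(child, graph, inherited_state=state)
+        uri         = child_state.getURI("about")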
+
+    @ivar options: reference to the overall options
+    @type options: L{Options}
+    @ivar base: the 'base' URI
+    @ivar parsedBase: the parsed version of base, as produced by urlparse.urlsplit
+    @ivar defaultNS: default namespace (if defined via @xmlns) to be used for XML Literals
+    @ivar lang: language tag (possibly None)
+    @ivar term_or_curie: vocabulary management class instance
+    @type term_or_curie: L{termorcurie.TermOrCurie}
+    @ivar list_mapping: dictionary of arrays, containing a list of URIs keyed via properties for lists
+    @ivar node: the node to which this state belongs
+    @type node: DOM node instance
+    @ivar rdfa_version: RDFa version of the content
+    @type rdfa_version: String
+    @ivar supress_lang: in some cases, the effect of the lang attribute should be suppressed for the given node, although it should be inherited down below (example: @value attribute of the data element in HTML5)
+    @type supress_lang: Boolean
+    @cvar _list: list of attributes that allow for lists of values and should be treated as such
+    @cvar _resource_type: dictionary; mapping table from attribute name to the exact method to retrieve the URI(s). Is initialized at first instantiation.
+    """
+
+    # list of attributes that allow for lists of values and should be treated as such
+    _list = [ "rel", "rev", "property", "typeof", "role" ]
+    # mapping table from attribute name to the exact method to retrieve the URI(s).
+    _resource_type = {}
+
+    def __init__(self, node, graph, inherited_state=None, base="", options=None, rdfa_version = None) :
+        """
+        @param node: the current DOM Node
+        @param graph: the RDFLib Graph
+        @keyword inherited_state: the state as inherited
+        from upper layers. This inherited_state is mixed with the state information
+        retrieved from the current node.
+        @type inherited_state: L{state.ExecutionContext}
+        @keyword base: string denoting the base URI for the specific node. This overrides the possible
+        base inherited from the upper layers. The
+        current XHTML+RDFa syntax does not allow the usage of C{@xml:base}, but SVG1.2 does, so this is
+        necessary for SVG (and other possible XML dialects that accept C{@xml:base})
+        @keyword options: invocation options, and references to warning graphs
+        @type options: L{Options<pyRdfa.options>}
+        """
+        def remove_frag_id(uri) :
+            """
+            The fragment ID for self.base must be removed
+            """
+            try :
+                # To be on the safe side:-)
+                t = urlparse(uri)
+                return urlunparse((t[0],t[1],t[2],t[3],t[4],""))
+            except :
+                return uri
+
+        # This is, conceptually, an additional class initialization, but it must be done at run time, otherwise import errors show up
+        if len( ExecutionContext._resource_type ) == 0 :
+            ExecutionContext._resource_type = {
+                "href"     : ExecutionContext._URI,
+                "src"      : ExecutionContext._URI,
+                "vocab"    : ExecutionContext._URI,
+
+                "about"    : ExecutionContext._CURIEorURI,
+                "resource" : ExecutionContext._CURIEorURI,
+
+                "rel"      : ExecutionContext._TERMorCURIEorAbsURI,
+                "rev"      : ExecutionContext._TERMorCURIEorAbsURI,
+                "datatype" : ExecutionContext._TERMorCURIEorAbsURI,
+                "typeof"   : ExecutionContext._TERMorCURIEorAbsURI,
+                "property" : ExecutionContext._TERMorCURIEorAbsURI,
+                "role"     : ExecutionContext._TERMorCURIEorAbsURI,
+            }
+        #-----------------------------------------------------------------
+        self.node = node
+
+        #-----------------------------------------------------------------
+        # Settling the base. In a generic XML, xml:base should be accepted at all levels (though this is not the
+        # case in, say, XHTML...)
+        # At the top level of parsing this is invoked with 'None' (i.e., no inherited state), which is
+        # when the <base> element is looked for (for the HTML cases, that is)
+        if inherited_state :
+            self.rdfa_version = inherited_state.rdfa_version
+            self.base         = inherited_state.base
+            self.options      = inherited_state.options
+
+            self.list_mapping = inherited_state.list_mapping
+            self.new_list     = False
+
+            # for generic XML versions the xml:base attribute should be handled
+            if self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
+                self.base = remove_frag_id(node.getAttribute("xml:base"))
+        else :
+            # this is the branch called from the very top
+            self.list_mapping = ListStructure()
+            self.new_list     = True
+
+            if rdfa_version is not None :
+                self.rdfa_version = rdfa_version
+            else :
+                from pyRdfa import rdfa_current_version
+                self.rdfa_version = rdfa_current_version
+
+            # This value can be overwritten by a @version attribute
+            if node.hasAttribute("version") :
+                top_version = node.getAttribute("version")
+                if top_version.find("RDFa 1.0") != -1 or top_version.find("RDFa1.0") != -1 :
+                    self.rdfa_version = "1.0"
+                elif top_version.find("RDFa 1.1") != -1 or top_version.find("RDFa1.1") != -1 :
+                    self.rdfa_version = "1.1"
+
+            # this is just to play it safe. I believe this should actually not happen...
+            if options == None :
+                self.options = Options()
+            else :
+                self.options = options
+
+            self.base = ""
+            # handle the base element case for HTML
+            if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] :
+                for bases in node.getElementsByTagName("base") :
+                    if bases.hasAttribute("href") :
+                        self.base = remove_frag_id(bases.getAttribute("href"))
+                        continue
+            elif self.options.host_language in accept_xml_base and node.hasAttribute("xml:base") :
+                self.base = remove_frag_id(node.getAttribute("xml:base"))
+
+            # If no local setting for base occurs, the input argument has it
+            if self.base == "" :
+                self.base = base
+
+            # Perform an extra beautification in RDFLib
+            if self.options.host_language in beautifying_prefixes :
+                dict = beautifying_prefixes[self.options.host_language]
+                for key in dict :
+                    graph.bind(key,dict[key])
+
+            input_info = "Input Host Language:%s, RDFa version:%s, base:%s" % (self.options.host_language, self.rdfa_version, self.base)
+            self.options.add_info(input_info)
+
+
+        #-----------------------------------------------------------------
+        # this will be used repeatedly, better store it once and for all...
+ self.parsedBase = urlsplit(self.base) + + #----------------------------------------------------------------- + # generate and store the local CURIE handling class instance + self.term_or_curie = TermOrCurie(self, graph, inherited_state) + + #----------------------------------------------------------------- + # Settling the language tags + # @lang has priority over @xml:lang + # it is a bit messy: the three fundamental modes (xhtml, html, or xml) are all slightly different:-( + # first get the inherited state's language, if any + if inherited_state : + self.lang = inherited_state.lang + else : + self.lang = None + + self.supress_lang = False + + + if self.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ] : + # we may have lang and xml:lang + if node.hasAttribute("lang") : + lang = node.getAttribute("lang").lower() + else : + lang = None + if node.hasAttribute("xml:lang") : + xmllang = node.getAttribute("xml:lang").lower() + else : + xmllang = None + # First of all, set the value, if any + if xmllang != None : + # this has priority + if len(xmllang) != 0 : + self.lang = xmllang + else : + self.lang = None + elif lang != None : + if len(lang) != 0 : + self.lang = lang + else : + self.lang = None + # Ideally, a warning should be generated if lang and xmllang are both present with different values. But + # the HTML5 Parser does its magic by overriding a lang value if xmllang is present, so the potential + # error situations are simply swallowed... + + elif self.options.host_language in accept_xml_lang and node.hasAttribute("xml:lang") : + self.lang = node.getAttribute("xml:lang").lower() + if len(self.lang) == 0 : self.lang = None + + #----------------------------------------------------------------- + # Set the default namespace. Used when generating XML Literals + if node.hasAttribute("xmlns") : + self.defaultNS = node.getAttribute("xmlns") + elif inherited_state and inherited_state.defaultNS != None : + self.defaultNS = inherited_state.defaultNS + else : + self.defaultNS = None + # end __init__ + + def _URI(self, val) : + """Returns a URI for a 'pure' URI (ie, not a CURIE). The method resolves possible relative URI-s. It also + checks whether the URI uses an unusual URI scheme (and issues a warning); this may be the result of an + uninterpreted CURIE... + @param val: attribute value to be interpreted + @type val: string + @return: an RDFLib URIRef instance + """ + def create_URIRef(uri, check = True) : + """ + Mini helping function: it checks whether a uri is using a usual scheme before a URIRef is created. In case + there is something unusual, a warning is generated (though the URIRef is created nevertheless) + @param uri: (absolute) URI string + @return: an RDFLib URIRef instance + """ + from pyRdfa import uri_schemes + val = uri.strip() + if check and urlsplit(val)[0] not in uri_schemes : + self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName) + return URIRef(val) + + def join(base, v, check = True) : + """ + Mini helping function: it makes a urljoin for the paths. Based on the python library, but + that one has a bug: in some cases it + swallows the '#' or '?' character at the end. This is clearly a problem with + Semantic Web URI-s, so this is checked, too + @param base: base URI string + @param v: local part + @param check: whether the URI should be checked against the list of 'existing' URI schemes + @return: an RDFLib URIRef instance + """ + # UGLY!!! 
There is a bug for a corner case in python version <= 2.5.X + if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5) : + return create_URIRef(base+v, check) + #### + + joined = urljoin(base, v) + try : + if v[-1] != joined[-1] and (v[-1] == "#" or v[-1] == "?") : + return create_URIRef(joined + v[-1], check) + else : + return create_URIRef(joined, check) + except : + return create_URIRef(joined, check) + + if val == "" : + # The fragment ID must be removed... + return URIRef(self.base) + + # fall back on good old traditional URI-s. + # To be on the safe side, let us use the Python libraries + if self.parsedBase[0] == "" : + # base is, in fact, a local file name + # The following call is just to be sure that some pathological cases when + # the ':' _does_ appear in the URI but not in a scheme position is taken + # care of properly... + + key = urlsplit(val)[0] + if key == "" : + # relative URI, to be combined with local file name: + return join(self.base, val, check = False) + else : + return create_URIRef(val) + else : + # Trust the python library... + # Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it + # swallows the '#' or '?' character at the end. This is clearly a problem with + # Semantic Web URI-s + return join(self.base, val) + # end _URI + + def _CURIEorURI(self, val) : + """Returns a URI for a (safe or not safe) CURIE. In case it is a safe CURIE but the CURIE itself + is not defined, an error message is issued. Otherwise, if it is not a CURIE, it is taken to be a URI + @param val: attribute value to be interpreted + @type val: string + @return: an RDFLib URIRef instance or None + """ + if val == "" : + return URIRef(self.base) + + safe_curie = False + if val[0] == '[' : + # If a safe CURIE is asked for, a pure URI is not acceptable. + # Is checked below, and that is why the safe_curie flag is necessary + if val[-1] != ']' : + # that is certainly forbidden: an incomplete safe CURIE + self.options.add_warning(err_illegal_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName) + return None + else : + val = val[1:-1] + safe_curie = True + # There is a branch here depending on whether we are in 1.1 or 1.0 mode + if self.rdfa_version >= "1.1" : + retval = self.term_or_curie.CURIE_to_URI(val) + if retval == None : + # the value could not be interpreted as a CURIE, ie, it did not produce any valid URI. + # The rule says that then the whole value should be considered as a URI + # except if it was part of a safe CURIE. In that case it should be ignored... + if safe_curie : + self.options.add_warning(err_no_CURIE_in_safe_CURIE % val, UnresolvablePrefix, node=self.node.nodeName) + return None + else : + return self._URI(val) + else : + # there is an unlikely case where the retval is actually a URIRef with a relative URI. Better filter that one out + if isinstance(retval, BNode) == False and urlsplit(str(retval))[0] == "" : + # yep, there is something wrong, a new URIRef has to be created: + return URIRef(self.base+str(retval)) + else : + return retval + else : + # in 1.0 mode a CURIE can be considered only in case of a safe CURIE + if safe_curie : + return self.term_or_curie.CURIE_to_URI(val) + else : + return self._URI(val) + # end _CURIEorURI + + def _TERMorCURIEorAbsURI(self, val) : + """Returns a URI either for a term or for a CURIE. The value must be an NCNAME to be handled as a term; otherwise + the method falls back on a CURIE or an absolute URI. 
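+        (Illustrative examples for the three cases in RDFa 1.1: C{license} may be resolved
+        as a term, C{dc:title} as a CURIE, and C{http://example.org/prop} as an absolute URI.)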
+        @param val: attribute value to be interpreted
+        @type val: string
+        @return: an RDFLib URIRef instance or None
+        """
+        from pyRdfa import uri_schemes
+        # This case excludes the pure base, ie, the empty value
+        if val == "" :
+            return None
+
+        from pyRdfa.termorcurie import ncname, termname
+        if termname.match(val) :
+            # This is a term, must be handled as such...
+            retval = self.term_or_curie.term_to_URI(val)
+            if not retval :
+                self.options.add_warning(err_undefined_terms % val, UnresolvableTerm, node=self.node.nodeName, buggy_value = val)
+                return None
+            else :
+                return retval
+        else :
+            # try a CURIE
+            retval = self.term_or_curie.CURIE_to_URI(val)
+            if retval :
+                return retval
+            elif self.rdfa_version >= "1.1" :
+                # See if it is an absolute URI
+                scheme = urlsplit(val)[0]
+                if scheme == "" :
+                    # bug; there should be no relative URIs here
+                    self.options.add_warning(err_non_legal_CURIE_ref % val, UnresolvablePrefix, node=self.node.nodeName)
+                    return None
+                else :
+                    if scheme not in uri_schemes :
+                        self.options.add_warning(err_URI_scheme % val.strip(), node=self.node.nodeName)
+                    return URIRef(val)
+            else :
+                # rdfa 1.0 case
+                self.options.add_warning(err_undefined_CURIE % val.strip(), UnresolvablePrefix, node=self.node.nodeName)
+                return None
+    # end _TERMorCURIEorAbsURI
+
+    # -----------------------------------------------------------------------------------------------
+
+    def getURI(self, attr) :
+        """Get the URI(s) for the attribute. The name of the attribute determines whether the value should be
+        a pure URI, a CURIE, etc, and whether the return is a single element or a list of those. This is done
+        using the L{ExecutionContext._resource_type} table.
+        @param attr: attribute name
+        @type attr: string
+        @return: an RDFLib URIRef instance (or None) or a list of those
+        """
+        if self.node.hasAttribute(attr) :
+            val = self.node.getAttribute(attr)
+        else :
+            if attr in ExecutionContext._list :
+                return []
+            else :
+                return None
+
+        # This may raise an exception if attr has no entry in the table. This, actually,
+        # should not happen if the code is correct, but it does not harm having it here...
+        try :
+            func = ExecutionContext._resource_type[attr]
+        except :
+            # Actually, this should not happen...
+            func = ExecutionContext._URI
+
+        if attr in ExecutionContext._list :
+            # Allows for a list
+            resources = [ func(self, v.strip()) for v in val.strip().split() if v != None ]
+            retval = [ r for r in resources if r != None ]
+        else :
+            retval = func(self, val.strip())
+        return retval
+    # end getURI
+
+    def getResource(self, *args) :
+        """Get single resources from several different attributes. The first one that returns a valid URI wins.
+        @param args: variable list of attribute names, or a single attribute being a list itself.
+        @return: an RDFLib URIRef instance (or None)
+        """
+        if len(args) == 0 :
+            return None
+        if isinstance(args[0], (tuple, list)) :
+            rargs = args[0]
+        else :
+            rargs = args
+
+        for resource in rargs :
+            uri = self.getURI(resource)
+            if uri != None : return uri
+        return None
+
+    # -----------------------------------------------------------------------------------------------
+    def reset_list_mapping(self, origin=None) :
+        """
+        Reset, ie, create a new empty dictionary for the list mapping.
+        """
+        self.list_mapping = ListStructure()
+        if origin: self.set_list_origin(origin)
+        self.new_list = True
+
+    def list_empty(self) :
+        """
+        Checks whether the list is empty.
+        @return: Boolean
+        """
+        return len(self.list_mapping.mapping) == 0
+
+    def get_list_props(self) :
+        """
+        Return the list of property values in the list structure
+        @return: list of URIRef
+        """
+        return list(self.list_mapping.mapping.keys())
+
+    def get_list_value(self,prop) :
+        """
+        Return the list of values in the list structure for a specific property
+        @return: list of RDF nodes
+        """
+        return self.list_mapping.mapping[prop]
+
+    def set_list_origin(self, origin) :
+        """
+        Set the origin of the list, ie, the subject to attach the final list(s) to
+        @param origin: URIRef
+        """
+        self.list_mapping.origin = origin
+
+    def get_list_origin(self) :
+        """
+        Return the origin of the list, ie, the subject to attach the final list(s) to
+        @return: URIRef
+        """
+        return self.list_mapping.origin
+
+    def add_to_list_mapping(self, property, resource) :
+        """Add a new property-resource pair to the list mapping structure. The latter is a dictionary of arrays;
+        if the array does not exist yet, it will be created on the fly.
+
+        @param property: the property URI, used as a key in the dictionary
+        @param resource: the resource to be added to the relevant array in the dictionary. Can be None; this is a dummy
+        placeholder for C{<span rel="property" inlist>...</span>} constructions that may be filled in by children or siblings; if not,
+        an empty list has to be generated.
+        """
+        if property in self.list_mapping.mapping :
+            if resource != None :
+                # indeed, if it is None, then it should not override anything
+                if self.list_mapping.mapping[property] == None :
+                    # replacing a dummy with real content
+                    self.list_mapping.mapping[property] = [ resource ]
+                else :
+                    self.list_mapping.mapping[property].append(resource)
+        else :
+            if resource != None :
+                self.list_mapping.mapping[property] = [ resource ]
+            else :
+                self.list_mapping.mapping[property] = None
+
+
+####################
diff --git a/rdflib/plugins/parsers/pyRdfa/termorcurie.py b/rdflib/plugins/parsers/pyRdfa/termorcurie.py
new file mode 100644
index 00000000..bbcddd08
--- /dev/null
+++ b/rdflib/plugins/parsers/pyRdfa/termorcurie.py
@@ -0,0 +1,479 @@
+# -*- coding: utf-8 -*-
+"""
+Management of vocabularies, terms, and their mapping to URI-s. The main class of this module (L{TermOrCurie}) is,
+conceptually, part of the overall state of processing at a node (L{state.ExecutionContext}) but putting it into a separate
+module makes it easier to maintain.
+
+@summary: Management of vocabularies, terms, and their mapping to URI-s.
diff --git a/rdflib/plugins/parsers/pyRdfa/termorcurie.py b/rdflib/plugins/parsers/pyRdfa/termorcurie.py new file mode 100644 index 00000000..bbcddd08 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/termorcurie.py @@ -0,0 +1,479 @@
+# -*- coding: utf-8 -*-
+"""
+Management of vocabularies, terms, and their mapping to URI-s. The main class of this module (L{TermOrCurie}) is,
+conceptually, part of the overall state of processing at a node (L{state.ExecutionContext}) but putting it into a separate
+module makes it easier to maintain.
+
+@summary: Management of vocabularies, terms, and their mapping to URI-s.
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+
+@var XHTML_PREFIX: prefix for the XHTML vocabulary URI (set to 'xhv')
+@var XHTML_URI: URI prefix of the XHTML vocabulary
+@var ncname: Regular expression object for NCNAME
+@var termname: Regular expression object for a term
+@var xml_application_media_type: Regular expression object for a general XML application media type
+"""
+
+"""
+$Id: termorcurie.py,v 1.9 2012/05/21 15:27:07 ivan Exp $
+$Date: 2012/05/21 15:27:07 $
+"""
+
+import re, sys
+import xml.dom.minidom
+import random
+
+if sys.version_info[0] >= 3 :
+    from urllib.parse import urlsplit
+else :
+    from urlparse import urlsplit
+
+import rdflib
+from rdflib import URIRef
+from rdflib import Literal
+from rdflib import BNode
+from rdflib import Namespace
+if rdflib.__version__ >= "3.0.0" :
+    from rdflib import Graph
+    from rdflib import RDF as ns_rdf
+    from rdflib import RDFS as ns_rdfs
+else :
+    from rdflib.Graph import Graph
+    from rdflib.RDFS import RDFSNS as ns_rdfs
+    from rdflib.RDF import RDFNS as ns_rdf
+
+from pyRdfa.options import Options
+from pyRdfa.utils import quote_URI, URIOpener
+from pyRdfa.host import MediaTypes, HostLanguage, predefined_1_0_rel, warn_xmlns_usage
+from pyRdfa import IncorrectPrefixDefinition, RDFA_VOCAB, UnresolvableReference
+from pyRdfa import ns_rdfa
+
+from pyRdfa import err_redefining_URI_as_prefix
+from pyRdfa import err_xmlns_deprecated
+from pyRdfa import err_bnode_local_prefix
+from pyRdfa import err_col_local_prefix
+from pyRdfa import err_missing_URI_prefix
+from pyRdfa import err_invalid_prefix
+from pyRdfa import err_no_default_prefix
+from pyRdfa import err_prefix_and_xmlns
+from pyRdfa import err_non_ncname_prefix
+from pyRdfa import err_absolute_reference
+from pyRdfa import err_query_reference
+from pyRdfa import err_fragment_reference
+
+# Regular expression object for NCNAME
+ncname = re.compile("^[A-Za-z][A-Za-z0-9._-]*$")
+
+# Regular expression object for term name
+termname = re.compile("^[A-Za-z]([A-Za-z0-9._-]|/)*$")
+
+# Regular expression object for a general XML application media type
+xml_application_media_type = re.compile(r"application/[a-zA-Z0-9]+\+xml")
+
+XHTML_PREFIX = "xhv"
+XHTML_URI    = "http://www.w3.org/1999/xhtml/vocab#"
+
+#### Managing blank nodes for CURIE-s: mapping from local names to blank nodes.
+_bnodes = {}
+_empty_bnode = BNode()
+
+####
+
+class InitialContext :
+    """
+    Get the initial context values. In most cases this class has empty content, except for the
+    top level (in case of RDFa 1.1). Each L{TermOrCurie} class has one instance of this class. It provides initial
+    mappings for terms, namespace prefixes, etc, that the top level L{TermOrCurie} instance uses for its own initialization.
+
+    @ivar terms: collection of all term mappings
+    @type terms: dictionary
+    @ivar ns: namespace mapping
+    @type ns: dictionary
+    @ivar vocabulary: default vocabulary
+    @type vocabulary: string
+    """
+
+    def __init__(self, state, top_level) :
+        """
+        @param state: the state behind this term mapping
+        @type state: L{state.ExecutionContext}
+        @param top_level: whether this is the top node of the DOM tree (the only place where initial contexts are handled)
+        @type top_level: boolean
+        """
+        self.state = state
+
+        # This is to store the local terms
+        self.terms = {}
+        # This is to store the local Namespaces (a.k.a. prefixes)
+        self.ns = {}
+        # Default vocabulary
+        self.vocabulary = None
+
+        if state.rdfa_version < "1.1" or top_level == False :
+            return
+
+        from pyRdfa.initialcontext import initial_context as context_data
+        from pyRdfa.host import initial_contexts as context_ids
+        from pyRdfa.host import default_vocabulary
+
+        for id in context_ids[state.options.host_language] :
+            # This gives the id of an initial context, valid for this media type:
+            data = context_data[id]
+
+            # Merge the context data with the overall definition
+            if state.options.host_language in default_vocabulary :
+                self.vocabulary = default_vocabulary[state.options.host_language]
+            elif data.vocabulary != "" :
+                self.vocabulary = data.vocabulary
+
+            for key in data.terms :
+                self.terms[key] = URIRef(data.terms[key])
+            for key in data.ns :
+                self.ns[key] = Namespace(data.ns[key])
+
+
+##################################################################################################################
+
+class TermOrCurie :
+    """
+    Wrapper around vocabulary management, ie, mapping a term to a URI, as well as a CURIE to a URI. Each instance of this class belongs to a
+    "state", an instance of L{state.ExecutionContext}. Context definitions are managed at initialization time.
+
+    (In fact, this class is, conceptually, part of the overall state at a node, and has been separated here for
+    easier maintenance.)
+
+    The class takes care of the stack-like behavior of vocabulary items, ie, inheriting everything that is possible
+    from the "parent". At initialization time, this works through the prefix definitions (i.e., C{@prefix} or C{@xmlns:} attributes)
+    and/or C{@vocab} attributes.
+
+    @ivar state: State to which this instance belongs
+    @type state: L{state.ExecutionContext}
+    @ivar graph: The RDF Graph under generation
+    @type graph: rdflib.Graph
+    @ivar terms: mapping from terms to URI-s
+    @type terms: dictionary
+    @ivar ns: namespace declarations, ie, mapping from prefixes to URIs
+    @type ns: dictionary
+    @ivar default_curie_uri: URI for a default CURIE
+    """
+    def __init__(self, state, graph, inherited_state) :
+        """Initialize the vocab bound to a specific state.
+        @param state: the state to which this vocab instance belongs to
+        @type state: L{state.ExecutionContext}
+        @param graph: the RDF graph being worked on
+        @type graph: rdflib.Graph
+        @param inherited_state: the state inherited by the current state. 'None' if this is the top level state.
+        @type inherited_state: L{state.ExecutionContext}
+        """
+        def check_prefix(pr) :
+            from pyRdfa import uri_schemes
+            if pr in uri_schemes :
+                # The prefix being defined is a registered URI scheme, better avoid it...
+                state.options.add_warning(err_redefining_URI_as_prefix % pr, node=state.node.nodeName)
+
+        self.state = state
+        self.graph = graph
+
+        # --------------------------------------------------------------------------------
+        # This is set to non-void only on the top level and in the case of 1.1
+        default_vocab = InitialContext(self.state, inherited_state == None)
+
+        # Set the default CURIE URI
+        if inherited_state == None :
+            # This is the top level...
+            # AFAIK there is no default setting for the URI-s
+            # self.default_curie_uri = None
+            self.default_curie_uri = Namespace(XHTML_URI)
+            self.graph.bind(XHTML_PREFIX, self.default_curie_uri)
+        else :
+            self.default_curie_uri = inherited_state.term_or_curie.default_curie_uri
+
+        # --------------------------------------------------------------------------------
+        # Set the default term URI
+        # Note that it is still an open issue whether the XHTML_URI should be used
+        # for RDFa core, or whether it should be set to None.
+        # This is a 1.1 feature, ie, should be ignored if the version is < 1.1
+        if state.rdfa_version >= "1.1" :
+            # that is the absolute default setup...
+            if inherited_state == None :
+                self.default_term_uri = None
+            else :
+                self.default_term_uri = inherited_state.term_or_curie.default_term_uri
+
+            # see if the initial context has defined a default vocabulary:
+            if default_vocab.vocabulary :
+                self.default_term_uri = default_vocab.vocabulary
+
+            # see if there is a local vocab
+            def_term_uri = self.state.getURI("vocab")
+            if def_term_uri :
+                self.default_term_uri = def_term_uri
+                self.graph.add((URIRef(self.state.base),RDFA_VOCAB,URIRef(def_term_uri)))
+        else :
+            self.default_term_uri = None
+
+        # --------------------------------------------------------------------------------
+        # The simpler case: terms, adding those that have been defined by a possible initial context
+        if inherited_state is None :
+            # this is the vocabulary belonging to the top level of the tree!
+            self.terms = {}
+            if state.rdfa_version >= "1.1" :
+                # Simply get the terms defined by the default vocabularies. There is no need for merging
+                for key in default_vocab.terms :
+                    self.terms[key] = default_vocab.terms[key]
+            else :
+                # The terms are hardwired...
+                for key in predefined_1_0_rel :
+                    self.terms[key] = URIRef(XHTML_URI + key)
+                self.graph.bind(XHTML_PREFIX, XHTML_URI)
+        else :
+            # just refer to the inherited terms
+            self.terms = inherited_state.term_or_curie.terms
+
+        #-----------------------------------------------------------------
+        # the locally defined namespaces
+        ns_dict = {}
+        # locally defined xmlns namespaces, necessary for correct XML Literal generation
+        xmlns_dict = {}
+
+        # Add the namespaces defined via an initial context
+        for key in default_vocab.ns :
+            ns_dict[key] = default_vocab.ns[key]
+            self.graph.bind(key, ns_dict[key])
+
+        # Add the locally defined namespaces using the xmlns: syntax
+        for i in range(0, state.node.attributes.length) :
+            attr = state.node.attributes.item(i)
+            if attr.name.find('xmlns:') == 0 :
+                # yep, there is a namespace setting
+                prefix = attr.localName
+                if prefix != "" : # exclude the top level xmlns setting...
+                    if state.rdfa_version >= "1.1" and state.options.host_language in warn_xmlns_usage :
+                        state.options.add_warning(err_xmlns_deprecated % prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
+                    if prefix == "_" :
+                        state.options.add_warning(err_bnode_local_prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
+                    elif prefix.find(':') != -1 :
+                        state.options.add_warning(err_col_local_prefix % prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
+                    else :
+                        # quote the URI, ie, convert special characters into %.. This is
+                        # true, for example, for spaces
+                        uri = quote_URI(attr.value, state.options)
+                        # create a new RDFLib Namespace entry
+                        ns = Namespace(uri)
+                        # Add an entry to the dictionary if not already there (priority is left to right!)
+                        if state.rdfa_version >= "1.1" :
+                            pr = prefix.lower()
+                        else :
+                            pr = prefix
+                        ns_dict[pr]    = ns
+                        xmlns_dict[pr] = ns
+                        self.graph.bind(pr,ns)
+                        check_prefix(pr)
+
+        # Add the locally defined namespaces using the @prefix syntax
+        # this may override the definition @xmlns
+        if state.rdfa_version >= "1.1" and state.node.hasAttribute("prefix") :
+            pr = state.node.getAttribute("prefix")
+            if pr != None :
+                # separator character is whitespace
+                pr_list = pr.strip().split()
+                # range(0, len(pr_list), 2)
+                for i in range(len(pr_list) - 2, -1, -2) :
+                    prefix = pr_list[i]
+                    # see if there is a URI at all
+                    if i == len(pr_list) - 1 :
+                        state.options.add_warning(err_missing_URI_prefix % (prefix,pr), node=state.node.nodeName)
+                        break
+                    else :
+                        value = pr_list[i+1]
+
+                        # see if the value of prefix is o.k., ie, there is a ':' at the end
+                        if prefix[-1] != ':' :
+                            state.options.add_warning(err_invalid_prefix % (prefix,pr), IncorrectPrefixDefinition, node=state.node.nodeName)
+                            continue
+                        elif prefix == ":" :
+                            state.options.add_warning(err_no_default_prefix % pr, IncorrectPrefixDefinition, node=state.node.nodeName)
+                            continue
+                        else :
+                            prefix = prefix[:-1]
+                            uri    = Namespace(quote_URI(value, state.options))
+                            if prefix == "" :
+                                # empty prefix: (re)set the default CURIE URI
+                                self.default_curie_uri = uri
+                            elif prefix == "_" :
+                                state.options.add_warning(err_bnode_local_prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
+                            else :
+                                # last check: is the prefix an NCNAME?
+                                if ncname.match(prefix) :
+                                    real_prefix = prefix.lower()
+                                    ns_dict[real_prefix] = uri
+                                    self.graph.bind(real_prefix,uri)
+                                    # Additional warning: is this prefix overriding an existing xmlns statement with a different URI? If
+                                    # so, that may lead to discrepancies between an RDFa 1.0 and an RDFa 1.1 run...
+                                    if (prefix in xmlns_dict and xmlns_dict[prefix] != uri) or (real_prefix in xmlns_dict and xmlns_dict[real_prefix] != uri) :
+                                        state.options.add_warning(err_prefix_and_xmlns % (real_prefix,real_prefix), node=state.node.nodeName)
+                                    check_prefix(real_prefix)
+                                else :
+                                    state.options.add_warning(err_non_ncname_prefix % (prefix,pr), IncorrectPrefixDefinition, node=state.node.nodeName)
+
+        # See if anything has been collected at all.
+        # If not, the namespaces of the incoming state are
+        # taken over by reference. Otherwise they are copied into
+        # the local dictionary
+        self.ns = {}
+        if len(ns_dict) == 0 and inherited_state :
+            self.ns = inherited_state.term_or_curie.ns
+        else :
+            if inherited_state :
+                for key in inherited_state.term_or_curie.ns : self.ns[key] = inherited_state.term_or_curie.ns[key]
+                for key in ns_dict : self.ns[key] = ns_dict[key]
+            else :
+                self.ns = ns_dict
+
+        # the xmlns prefixes have to be stored separately, again for XML Literal generation
+        self.xmlns = {}
+        if len(xmlns_dict) == 0 and inherited_state :
+            self.xmlns = inherited_state.term_or_curie.xmlns
+        else :
+            if inherited_state :
+                for key in inherited_state.term_or_curie.xmlns : self.xmlns[key] = inherited_state.term_or_curie.xmlns[key]
+                for key in xmlns_dict : self.xmlns[key] = xmlns_dict[key]
+            else :
+                self.xmlns = xmlns_dict
+    # end __init__
+
+    def _check_reference(self, val) :
+        """Check the CURIE reference for correctness. It is probably not 100% foolproof, but may take care
+        of some of the possible errors. See the URI RFC for the details.
+        """
+        def char_check(s, not_allowed = ['#','[',']']) :
+            for c in not_allowed :
+                if s.find(c) != -1 : return False
+            return True
+        # Creating an artificial http URI to fool the urlparse module...
+        scheme, netloc, url, query, fragment = urlsplit('http:' + val)
+        if netloc != "" and self.state.rdfa_version >= "1.1" :
+            self.state.options.add_warning(err_absolute_reference % (netloc, val), UnresolvableReference, node=self.state.node.nodeName)
+            return False
+        elif not char_check(query) :
+            self.state.options.add_warning(err_query_reference % (query, val), UnresolvableReference, node=self.state.node.nodeName)
+            return False
+        elif not char_check(fragment) :
+            self.state.options.add_warning(err_fragment_reference % (fragment, val), UnresolvableReference, node=self.state.node.nodeName)
+            return False
+        else :
+            return True
+
+    def CURIE_to_URI(self, val) :
+        """CURIE to URI mapping.
+
+        This method does I{not} take care of the last step of CURIE processing, ie, the fact that if
+        the value is not a valid CURIE then it is used as a URI. This is done on the caller's side, because this has
+        to be combined with base, for example. The method I{does} take care of BNode processing, though, ie,
+        CURIE-s of the form "_:XXX".
+
+        @param val: the full CURIE
+        @type val: string
+        @return: URIRef of a URI or None.
+ """ + # Just to be on the safe side: + if val == "" : + return None + elif val == ":" : + if self.default_curie_uri : + return URIRef(self.default_curie_uri) + else : + return None + + # See if this is indeed a valid CURIE, ie, it can be split by a colon + curie_split = val.split(':',1) + if len(curie_split) == 1 : + # there is no ':' character in the string, ie, it is not a valid CURIE + return None + else : + if self.state.rdfa_version >= "1.1" : + prefix = curie_split[0].lower() + else : + prefix = curie_split[0] + reference = curie_split[1] + + #if len(reference) > 0 : + # if self.state.rdfa_version >= "1.1" and (len(prefix) == 0 or prefix in self.ns) and reference.startswith('//') : + # # This has been defined as illegal in RDFa 1.1 + # self.state.options.add_warning(err_absolute_reference % (reference, val), UnresolvableReference, node=self.state.node.nodeName) + # return None + # if reference[0] == ":" : + # return None + + # first possibility: empty prefix + if len(prefix) == 0 : + if self.default_curie_uri and self._check_reference(reference) : + return self.default_curie_uri[reference] + else : + return None + else : + # prefix is non-empty; can be a bnode + if prefix == "_" : + # yep, BNode processing. There is a difference whether the reference is empty or not... + if len(reference) == 0 : + return _empty_bnode + else : + # see if this variable has been used before for a BNode + if reference in _bnodes : + return _bnodes[reference] + else : + # a new bnode... + retval = BNode() + _bnodes[reference] = retval + return retval + # check if the prefix is a valid NCNAME + elif ncname.match(prefix) : + # see if there is a binding for this: + if prefix in self.ns and self._check_reference(reference) : + # yep, a binding has been defined! + if len(reference) == 0 : + return URIRef(str(self.ns[prefix])) + else : + return self.ns[prefix][reference] + else : + # no definition for this thing... + return None + else : + return None + # end CURIE_to_URI + + def term_to_URI(self, term) : + """A term to URI mapping, where term is a simple string and the corresponding + URI is defined via the @vocab (ie, default term uri) mechanism. Returns None if term is not defined + @param term: string + @return: an RDFLib URIRef instance (or None) + """ + if len(term) == 0 : return None + + if termname.match(term) : + # It is a valid NCNAME + + # First of all, a @vocab nukes everything. That has to be done first... + if self.default_term_uri != None : + return URIRef(self.default_term_uri + term) + + # For default terms, the algorithm is (see 7.4.3 of the document): first make a case sensitive match; + # if that fails than make a case insensive one + # 1. simple, case sensitive test: + if term in self.terms : + # yep, term is a valid key as is + return self.terms[term] + + # 2. case insensitive test + for defined_term in self.terms : + if term.lower() == defined_term.lower() : + return self.terms[defined_term] + + # If it got here, it is all wrong... + return None diff --git a/rdflib/plugins/parsers/pyRdfa/transform/DublinCore.py b/rdflib/plugins/parsers/pyRdfa/transform/DublinCore.py new file mode 100755 index 00000000..aab14b95 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/transform/DublinCore.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +""" +Transfomer: handles the Dublin Core recommendation for XHTML for adding DC values. 
diff --git a/rdflib/plugins/parsers/pyRdfa/transform/DublinCore.py b/rdflib/plugins/parsers/pyRdfa/transform/DublinCore.py new file mode 100755 index 00000000..aab14b95 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/transform/DublinCore.py @@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""
+Transformer: handles the Dublin Core recommendation for XHTML for adding DC values. What this means is that:
+
+ - DC namespaces are defined via C{<link rel="schema.XX" href="...."/>}
+ - The 'XX.term' is used much like QNames in C{<link>} and C{<meta>} elements. For the latter, the namespaced names are added to a C{@property} attribute.
+
+This transformer adds "real" namespaces and changes the DC references in link and meta elements to abide by the
+RDFa namespace syntax.
+
+@summary: Dublin Core transformer
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+@contact: Ivan Herman, ivan@w3.org
+"""
+
+"""
+@version: $Id: DublinCore.py,v 1.4 2012-01-18 14:16:44 ivan Exp $
+$Date: 2012-01-18 14:16:44 $
+"""
+
+def DC_transform(html, options, state) :
+    """
+    @param html: a DOM node for the top level html element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    from pyRdfa.host import HostLanguage
+    if not( options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] ) :
+        return
+
+    # the head element is necessary; to be sure, the namespaces are set
+    # on that level only
+    head = None
+    try :
+        head = html.getElementsByTagName("head")[0]
+    except :
+        # no head....
+        return
+
+    # First, the DC namespaces must be found
+    dcprefixes = {}
+    for link in html.getElementsByTagName("link") :
+        if link.hasAttribute("rel") :
+            rel = link.getAttribute("rel")
+            uri = link.getAttribute("href")
+            if uri != None and rel != None and rel.startswith("schema.") :
+                # bingo...
+                try :
+                    localname = rel.split(".")[1]
+                    head.setAttributeNS("", "xmlns:"+localname,uri)
+                    dcprefixes[localname] = uri
+                except :
+                    # problem with the split; just ignore
+                    pass
+
+    # get the link elements now to find the dc elements
+    for link in html.getElementsByTagName("link") :
+        if link.hasAttribute("rel") :
+            newProp = ""
+            for rel in link.getAttribute("rel").strip().split() :
+                # see if there is a '.' to separate the attributes
+                if rel.find(".") != -1 :
+                    key   = rel.split(".",1)[0]
+                    lname = rel.split(".",1)[1]
+                    if key in dcprefixes and lname != "" :
+                        # yep, this is one of those...
+                        newProp += " " + key + ":" + lname
+                    else :
+                        newProp += " " + rel
+                else :
+                    newProp += " " + rel
+            link.setAttribute("rel",newProp.strip())
+
+    # do almost the same with the meta elements...
+    for meta in html.getElementsByTagName("meta") :
+        if meta.hasAttribute("name") :
+            newProp = ""
+            for name in meta.getAttribute("name").strip().split() :
+                # see if there is a '.' to separate the attributes
+                if name.find(".") != -1 :
+                    key   = name.split(".",1)[0]
+                    lname = name.split(".",1)[1]
+                    if key in dcprefixes and lname != "" :
+                        # yep, this is one of those...
+                        newProp += " " + key + ":" + lname
+                    else :
+                        newProp += " " + name
+                else :
+                    newProp += " " + name
+            meta.setAttribute("property", newProp.strip())
+
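An informal, minidom-based illustration (not part of the patch, and simplified from DC_transform above) of what the rewrite does to a typical Dublin Core header::

    import xml.dom.minidom

    src = """<html><head>
    <link rel="schema.DC" href="http://purl.org/dc/elements/1.1/"/>
    <meta name="DC.title" content="Some title"/>
    </head></html>"""

    dom  = xml.dom.minidom.parseString(src)
    head = dom.getElementsByTagName("head")[0]

    # find the schema.* declarations, as DC_transform does
    prefixes = {}
    for link in dom.getElementsByTagName("link") :
        rel = link.getAttribute("rel")
        if rel.startswith("schema.") :
            local = rel.split(".")[1]
            head.setAttributeNS("", "xmlns:" + local, link.getAttribute("href"))
            prefixes[local] = link.getAttribute("href")

    # rewrite meta/@name values like "DC.title" into CURIE-style @property values
    for meta in dom.getElementsByTagName("meta") :
        name = meta.getAttribute("name")
        if "." in name and name.split(".", 1)[0] in prefixes :
            meta.setAttribute("property", name.replace(".", ":", 1))

    print(dom.toxml())   # the meta element now carries property="DC:title", and head an xmlns:DC declaration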
diff --git a/rdflib/plugins/parsers/pyRdfa/transform/OpenID.py b/rdflib/plugins/parsers/pyRdfa/transform/OpenID.py new file mode 100755 index 00000000..83e5d586 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/transform/OpenID.py @@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""
+Simple transformer: handles OpenID elements. Ie, an openid namespace is added and the usual
+'link' elements for openid are replaced with a namespaced version.
+
+@summary: OpenID transformer module.
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+@contact: Ivan Herman, ivan@w3.org
+@var OPENID_NS: the OpenID URI used in the package
+"""
+
+"""
+$Id: OpenID.py,v 1.4 2012-01-18 14:16:44 ivan Exp $
+$Date: 2012-01-18 14:16:44 $
+"""
+
+OPENID_NS = "http://xmlns.openid.net/auth#"
+
+
+def OpenID_transform(html, options, state) :
+    """
+    Replace C{openid.XXX} type C{@rel} attribute values in C{<link>} elements by C{openid:XXX}. The openid URI is also
+    added to the top level namespaces with the C{openid:} local name.
+
+    @param html: a DOM node for the top level html element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    from pyRdfa.host import HostLanguage
+    if not( options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] ) :
+        return
+
+    # the head element is necessary; to be sure, the namespaces are set
+    # on that level only
+    head = None
+    try :
+        head = html.getElementsByTagName("head")[0]
+    except :
+        # no head....
+        return
+
+    foundOpenId = False
+    for link in html.getElementsByTagName("link") :
+        if link.hasAttribute("rel") :
+            rel = link.getAttribute("rel")
+            newProp = ""
+            for n in rel.strip().split() :
+                if n.startswith("openid.") :
+                    newProp += " " + n.replace("openid.","openid:")
+                    foundOpenId = True
+                else :
+                    newProp += " " + n
+            link.setAttribute("rel",newProp.strip())
+
+    # Add the OpenId namespace if necessary
+    if foundOpenId and not head.hasAttribute("xmlns:openid") :
+        head.setAttributeNS("", "xmlns:openid", OPENID_NS)
+
diff --git a/rdflib/plugins/parsers/pyRdfa/transform/__init__.py b/rdflib/plugins/parsers/pyRdfa/transform/__init__.py new file mode 100755 index 00000000..55310ba0 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/transform/__init__.py @@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+"""
+Transformer sub-package for the pyRdfa package. It contains modules with transformer functions; each may be
+invoked by pyRdfa to transform the DOM tree before the "real" RDFa processing.
+
+@summary: RDFa Transformer package
+@requires: U{RDFLib package<http://rdflib.net>}
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+"""
+
+"""
+$Id: __init__.py,v 1.8 2012/06/12 11:47:19 ivan Exp $
+$Date: 2012/06/12 11:47:19 $
+"""
+__version__ = "3.0"
+
+# Here are the transformer functions that are to be performed for all RDFa files, no matter what
+
+def top_about(root, options, state) :
+    """
+    @param root: a DOM node for the top level element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    def set_about(node) :
+        if has_one_of_attributes(node, "rel", "rev") :
+            if not has_one_of_attributes(top, "about", "src") :
+                node.setAttribute("about","")
+        else :
+            if not has_one_of_attributes(node, "href", "resource", "about", "src") :
+                node.setAttribute("about","")
+
+    from pyRdfa.host import HostLanguage
+    from pyRdfa.utils import has_one_of_attributes
+
+    if not has_one_of_attributes(root, "about") :
+        # The situation is a bit complicated: if a @resource is present without anything else, then it sets
+        # the subject, ie, should be accepted...
+        if has_one_of_attributes(root, "resource", "href", "src") :
+            if has_one_of_attributes(root, "rel", "rev","property") :
+                root.setAttribute("about","")
+        else :
+            root.setAttribute("about","")
+
+    if options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] :
+        if state.rdfa_version >= "1.1" :
+            pass
+        else :
+            for top in root.getElementsByTagName("head") :
+                if not has_one_of_attributes(top, "href", "resource", "about", "src") :
+                    set_about(top)
+            for top in root.getElementsByTagName("body") :
+                if not has_one_of_attributes(top, "href", "resource", "about", "src") :
+                    set_about(top)
+
+
+def empty_safe_curie(node, options, state) :
+    """
+    Remove the attributes whose value is an empty safe CURIE. It also adds an 'artificial' flag, ie, an
+    attribute on the node to signal that there _is_ an attribute with an ignored
+    safe CURIE value. The name of that attribute is 'about_pruned' or 'resource_pruned'.
+
+    @param node: a DOM node for the top level element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    def prune_safe_curie(node,name) :
+        if node.hasAttribute(name) :
+            av = node.getAttribute(name)
+            if av == '[]' :
+                node.removeAttribute(name)
+                node.setAttribute(name+'_pruned','')
+                msg = "Attribute @%s uses an empty safe CURIE; the attribute is ignored" % name
+                options.add_warning(msg, node=node)
+
+    prune_safe_curie(node, "about")
+    prune_safe_curie(node, "resource")
+    for n in node.childNodes :
+        if n.nodeType == node.ELEMENT_NODE :
+            empty_safe_curie(n, options, state)
+
+def vocab_for_role(node, options, state) :
+    """
+    The value of the @role attribute (defined separately in the U{Role Attribute Specification Lite<http://www.w3.org/TR/role-attribute/#using-role-in-conjunction-with-rdfa>}) should be interpreted as if a @vocab value set to the
+    XHTML vocabulary had been defined for it. This method turns all terms in role attributes into full URI-s, so that
+    this is not an issue at run-time.
+
+    @param node: a DOM node for the top level element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    from pyRdfa.termorcurie import termname, XHTML_URI
+
+    def handle_role(node) :
+        if node.hasAttribute("role") :
+            old_values = node.getAttribute("role").strip().split()
+            new_values = ""
+            for val in old_values :
+                if termname.match(val) :
+                    new_values += XHTML_URI + val + ' '
+                else :
+                    new_values += val + ' '
+            node.setAttribute("role", new_values.strip())
+
+    handle_role(node)
+    for n in node.childNodes :
+        if n.nodeType == node.ELEMENT_NODE :
+            vocab_for_role(n, options, state)
+
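A toy, standalone version of the @role rewriting above (the regular expression and the vocabulary URI are copied from termorcurie.py; the input string is made up)::

    import re

    termname  = re.compile("^[A-Za-z]([A-Za-z0-9._-]|/)*$")
    XHTML_URI = "http://www.w3.org/1999/xhtml/vocab#"

    def expand_role(value) :
        out = []
        for val in value.strip().split() :
            # terms get the XHTML vocabulary prefix; full URIs are left alone
            out.append(XHTML_URI + val if termname.match(val) else val)
        return " ".join(out)

    print(expand_role("banner http://example.org/myrole"))
    # http://www.w3.org/1999/xhtml/vocab#banner http://example.org/myrole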
diff --git a/rdflib/plugins/parsers/pyRdfa/transform/lite.py b/rdflib/plugins/parsers/pyRdfa/transform/lite.py new file mode 100755 index 00000000..37419359 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/transform/lite.py @@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+@contact: Ivan Herman, ivan@w3.org
+@version: $Id: lite.py,v 1.9 2012/06/26 13:29:58 ivan Exp $
+$Date: 2012/06/26 13:29:58 $
+"""
+
+non_lite_attributes = ["about","inlist","datatype","rev","rel"]
+
+def lite_prune(top, options, state) :
+    """
+    The name is a misnomer: the current version does not remove anything from the tree, it just generates warnings about the
+    usage of non-Lite attributes. A more aggressive version would remove those attributes, but that would,
+    in fact, define an RDFa Lite conformance level in the parser, which is against the WG decisions. So this should
+    not be done; the corresponding commands are commented out in the code below...
+
+    @param top: a DOM node for the top level element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    def generate_warning(node, attr) :
+        if attr == "rel" :
+            msg = "Attribute @rel is not used in RDFa Lite, ignored (consider using @property)"
+        elif attr == "about" :
+            msg = "Attribute @about is not used in RDFa Lite, ignored (consider using a <link> element with @href or @resource)"
+        else :
+            msg = "Attribute @%s is not used in RDFa Lite, ignored" % attr
+        options.add_warning(msg, node=node)
+
+    def remove_attrs(node) :
+        # first the @content; this gets special treatment
+        if node.tagName != "meta" and node.hasAttribute("content") :
+            generate_warning(node, "content")
+            # node.removeAttribute("content")
+        else :
+            for attr in non_lite_attributes :
+                if node.hasAttribute(attr) :
+                    generate_warning(node, attr)
+                    # node.removeAttribute(attr)
+
+    remove_attrs(top)
+    for n in top.childNodes :
+        if n.nodeType == top.ELEMENT_NODE :
+            lite_prune(n, options, state)
+
\ No newline at end of file
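For a quick feel of what lite_prune reports, a standalone toy with options.add_warning replaced by print and a plain attribute list standing in for a DOM element::

    non_lite_attributes = ["about","inlist","datatype","rev","rel"]

    def warn_non_lite(tag, attrs) :
        # attrs: the attribute names present on a (hypothetical) element
        if tag != "meta" and "content" in attrs :
            print("Attribute @content is not used in RDFa Lite, ignored")
        else :
            for attr in non_lite_attributes :
                if attr in attrs :
                    print("Attribute @%s is not used in RDFa Lite, ignored" % attr)

    warn_non_lite("span", ["rel", "href"])   # warns about @rel only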
diff --git a/rdflib/plugins/parsers/pyRdfa/transform/metaname.py b/rdflib/plugins/parsers/pyRdfa/transform/metaname.py new file mode 100755 index 00000000..649107a4 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/transform/metaname.py @@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+"""
+Simple transformer: the C{meta} element is extended with a C{property} attribute, with a copy of the
+C{name} attribute values.
+
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+@contact: Ivan Herman, ivan@w3.org
+@version: $Id: metaname.py,v 1.3 2012-01-18 14:16:45 ivan Exp $
+$Date: 2012-01-18 14:16:45 $
+"""
+
+def meta_transform(html, options, state) :
+    """
+    @param html: a DOM node for the top level html element
+    @param options: invocation options
+    @type options: L{Options<pyRdfa.options>}
+    @param state: top level execution state
+    @type state: L{State<pyRdfa.state>}
+    """
+    from pyRdfa.host import HostLanguage
+    if not( options.host_language in [ HostLanguage.xhtml, HostLanguage.html5, HostLanguage.xhtml5 ] ) :
+        return
+
+    for meta in html.getElementsByTagName("meta") :
+        if meta.hasAttribute("name") and not meta.hasAttribute("property") :
+            meta.setAttribute("property", meta.getAttribute("name"))
+
diff --git a/rdflib/plugins/parsers/pyRdfa/utils.py b/rdflib/plugins/parsers/pyRdfa/utils.py new file mode 100644 index 00000000..8cd7c6c0 --- /dev/null +++ b/rdflib/plugins/parsers/pyRdfa/utils.py @@ -0,0 +1,254 @@
+# -*- coding: utf-8 -*-
+"""
+Various utilities for pyRdfa.
+
+Most of the utilities are straightforward.
+
+@organization: U{World Wide Web Consortium<http://www.w3.org>}
+@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
+@license: This software is available for use under the
+U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
+"""
+
+"""
+$Id: utils.py,v 1.8 2012/05/17 15:02:48 ivan Exp $
+$Date: 2012/05/17 15:02:48 $
+"""
+import os, os.path, sys, imp, datetime
+
+# Python 3 vs. 2 switch
+if sys.version_info[0] >= 3 :
+    from urllib.request import Request, urlopen
+    from urllib.parse import urljoin, quote
+    from http.server import BaseHTTPRequestHandler
+    from urllib.error import HTTPError as urllib_HTTPError
+else :
+    from urllib2 import Request, urlopen
+    from urllib2 import HTTPError as urllib_HTTPError
+    from urlparse import urljoin
+    from urllib import quote
+    from BaseHTTPServer import BaseHTTPRequestHandler
+
+from pyRdfa.extras.httpheader import content_type, parse_http_datetime
+
+import rdflib
+if rdflib.__version__ >= "3.0.0" :
+    from rdflib import RDF as ns_rdf
+else :
+    from rdflib.RDF import RDFNS as ns_rdf
+
+from pyRdfa.host import HostLanguage, preferred_suffixes
+from types import *
+
+#########################################################################################################
+# Handling URIs
+class URIOpener :
+    """A wrapper around the urllib2 machinery to open a resource. Beyond accessing the data itself, the class
+    sets a number of instance variables that might be relevant for processing.
+    The class also adds an Accept header to the outgoing request, namely
+    text/html and application/xhtml+xml (unless set explicitly by the caller).
+
+    If the content type is set by the server, the relevant HTTP response field is used.
+    Otherwise,
+    common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
+    for C{file:///} URI-s). If none of these works, the content type is empty.
+
+    Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.
+
+    @ivar data: the real data, ie, a file-like object
+    @ivar headers: the return headers as sent back by the server
+    @ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
+    @ivar location: the real location of the data (ie, after possible redirection and content negotiation)
+    @ivar last_modified_date: sets the last modified date if set in the header, None otherwise
+    @ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
+    """
+    CONTENT_LOCATION = 'Content-Location'
+    CONTENT_TYPE     = 'Content-Type'
+    LAST_MODIFIED    = 'Last-Modified'
+    EXPIRES          = 'Expires'
+    def __init__(self, name, additional_headers = {}) :
+        """
+        @param name: URL to be opened
+        @keyword additional_headers: additional HTTP request headers to be added to the call
+        """
+        try :
+            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
+            req = Request(url=name.split('#')[0])
+
+            for key in additional_headers :
+                req.add_header(key, additional_headers[key])
+            if 'Accept' not in additional_headers :
+                req.add_header('Accept', 'text/html, application/xhtml+xml')
+
+            self.data    = urlopen(req)
+            self.headers = self.data.info()
+
+            if URIOpener.CONTENT_TYPE in self.headers :
+                # The call below will remove the possible media type parameters, like charset settings
+                ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
+                self.content_type = ct.media_type
+                if 'charset' in ct.parmdict :
+                    self.charset = ct.parmdict['charset']
+                else :
+                    self.charset = None
+            else :
+                # check if the suffix can be used for the content type; this may be important
+                # for file:// type URI or if the server is not properly set up to return the right
+                # mime type
+                self.charset = None
+                self.content_type = ""
+                for suffix in preferred_suffixes.keys() :
+                    if name.endswith(suffix) :
+                        self.content_type = preferred_suffixes[suffix]
+                        break
+
+            if URIOpener.CONTENT_LOCATION in self.headers :
+                self.location = urljoin(self.data.geturl(),self.headers[URIOpener.CONTENT_LOCATION])
+            else :
+                self.location = name
+
+            self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
+            if URIOpener.EXPIRES in self.headers :
+                try :
+                    # Thanks to Deron Meranda for the HTTP date conversion method...
+                    self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
+                except :
+                    # The Expires date format was wrong, sorry, forget it...
+                    pass
+
+            self.last_modified_date = None
+            if URIOpener.LAST_MODIFIED in self.headers :
+                try :
+                    # Thanks to Deron Meranda for the HTTP date conversion method...
+                    self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
+                except :
+                    # The last modified date format was wrong, sorry, forget it...
+                    pass
+
+        except urllib_HTTPError :
+            e = sys.exc_info()[1]
+            from pyRdfa import HTTPError
+            msg = BaseHTTPRequestHandler.responses[e.code]
+            raise HTTPError('%s' % msg[1], e.code)
+        except Exception :
+            e = sys.exc_info()[1]
+            from pyRdfa import RDFaError
+            raise RDFaError('%s' % e)
+
+#########################################################################################################
+
+# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other
+# special characters are converted to their %.. equivalents for namespace prefixes
+_unquotedChars = ':/\?=#~'
+_warnChars     = [' ','\n','\r','\t']
+
+def quote_URI(uri, options = None) :
+    """
+    'quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
+    may stay as they are (listed in L{_unquotedChars}). If the URI also contains one of the characters listed
+    in L{_warnChars}, an extra warning is generated.
+    @param uri: URI
+    @param options:
+    @type options: L{Options<pyRdfa.Options>}
+    """
+    from pyRdfa import err_unusual_char_in_URI
+    suri = uri.strip()
+    for c in _warnChars :
+        if suri.find(c) != -1 :
+            if options != None :
+                options.add_warning(err_unusual_char_in_URI % suri)
+            break
+    return quote(suri, _unquotedChars)
+
+#########################################################################################################
+
+def create_file_name(uri) :
+    """
+    Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.
+    """
+    suri = uri.strip()
+    final_uri = quote(suri,_unquotedChars)
+    # Remove some potentially dangerous characters
+    return final_uri.replace(' ','_').replace('%','_').replace('-','_').replace('+','_').replace('/','_').replace('?','_').replace(':','_').replace('=','_').replace('#','_')
+
+#########################################################################################################
+def has_one_of_attributes(node,*args) :
+    """
+    Check whether one of the listed attributes is present on a (DOM) node.
+    @param node: DOM element node
+    @param args: possible attribute names
+    @return: True or False
+    @rtype: Boolean
+    """
+    if len(args) == 0 :
+        return None
+    if isinstance(args[0], TupleType) or isinstance(args[0],ListType) :
+        rargs = args[0]
+    else :
+        rargs = args
+
+    return True in [ node.hasAttribute(attr) for attr in rargs ]
+
+#########################################################################################################
+def traverse_tree(node, func) :
+    """Traverse the whole element tree, and perform the function C{func} on all the elements.
+    @param node: DOM element node
+    @param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped.
+    """
+    if func(node) :
+        return
+
+    for n in node.childNodes :
+        if n.nodeType == node.ELEMENT_NODE :
+            traverse_tree(n, func)
+
+#########################################################################################################
+def return_XML(state, inode, base = True, xmlns = True) :
+    """
+    Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
+    via a C{node.toxml} call of the xml minidom implementation.)
+
+    @param state: L{pyRdfa.state.ExecutionContext}
+    @param inode: DOM Node
+    @param base: whether the base element should be added to the output
+    @type base: Boolean
+    @param xmlns: whether the namespace declarations should be repeated in the generated node
+    @type xmlns: Boolean
+    @return: string
+    """
+    node = inode.cloneNode(True)
+    # Decorate the element with the namespace and lang values and, optionally, base
+    if base :
+        node.setAttribute("xml:base",state.base)
+    if xmlns :
+        for prefix in state.term_or_curie.xmlns :
+            if not node.hasAttribute("xmlns:%s" % prefix) :
+                node.setAttribute("xmlns:%s" % prefix,"%s" % state.term_or_curie.xmlns[prefix])
+        # Set the default namespace, if not done (and one is available)
+        if not node.getAttribute("xmlns") and state.defaultNS != None :
+            node.setAttribute("xmlns", state.defaultNS)
+    # Get the lang, if necessary
+    if state.lang :
+        if state.options.host_language in [ HostLanguage.xhtml, HostLanguage.xhtml5, HostLanguage.html5 ] :
+            if not node.getAttribute("lang") :
+                node.setAttribute("lang", state.lang)
+        else :
+            if not node.getAttribute("xml:lang") :
+                node.setAttribute("xml:lang", state.lang)
+
+    return node.toxml()
+
+#########################################################################################################
+
+def dump(node) :
+    """
+    This is just for debug purposes: it prints the essential content of the tree starting at the given node.
+
+    @param node: DOM node
+    """
+    print( node.toprettyxml(indent="", newl="") )
+
diff --git a/rdflib/plugins/parsers/structureddata.py b/rdflib/plugins/parsers/structureddata.py new file mode 100644 index 00000000..b4f2fc72 --- /dev/null +++ b/rdflib/plugins/parsers/structureddata.py @@ -0,0 +1,153 @@
+import sys, imp
+
+from rdflib.parser import Parser, StringInputSource, URLInputSource, FileInputSource
+
+# This is the parser interface as it would look when called from the rest of RDFLib
+class RDFaParser(Parser) :
+    def parse(self, source, graph,
+              pgraph          = None,
+              media_type      = None,
+              rdfa_version    = None,
+              embedded_rdf    = False,
+              vocab_expansion = False,
+              vocab_cache     = False,
+              rdfOutput       = False) :
+        """
+        @param source: one of the input sources that the RDFLib package defined
+        @type source: InputSource class instance
+        @param graph: target graph for the triples; output graph, in RDFa spec. parlance
+        @type graph: RDFLib Graph
+        @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored
+        @type pgraph: RDFLib Graph
+        @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source. None means the content type of the HTTP result is used, or a guess is made based on the suffix of a file
+        @type media_type: string
+        @keyword rdfa_version: 1.0 or 1.1. If the value is None, then, by default, 1.1 is used unless the source has explicit signals to use 1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc)
+        @type rdfa_version: string
+        @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain Turtle in a special <script> element, SVG can have RDF/XML embedded in a <metadata> element. This flag controls whether those triples should be interpreted and added to the output graph. Some languages (e.g., SVG) require this, and for those the flag is ignored.
+        @type embedded_rdf: Boolean
+        @keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary expansion (see the RDFa 1.1 spec for further details)
+        @type vocab_expansion: Boolean
+        @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires the ability for the local application to write on the local file system
+        @type vocab_cache: Boolean
+        @keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
+        @type rdfOutput: Boolean
+        """
+        # We need a dynamic way of setting the import path. That ensures that the core of the pyRdfa module
+        # can be developed independently of the rdflib package
+        path = imp.find_module('rdflib')[1]
+        sys.path.insert(0,path+'/plugins/parsers')
+
+        try:
+            from pyRdfa import pyRdfa, Options
+
+            if isinstance(source, StringInputSource) :
+                orig_source = source.getByteStream()
+            elif isinstance(source, URLInputSource) :
+                orig_source = source.url
+            elif isinstance(source, FileInputSource) :
+                orig_source = source.file.name
+                source.file.close()
+
+            self.options = Options(output_processor_graph = (pgraph != None),
+                                   embedded_rdf    = embedded_rdf,
+                                   vocab_expansion = vocab_expansion,
+                                   vocab_cache     = vocab_cache)
+
+            baseURI   = source.getPublicId()
+            processor = pyRdfa(self.options, base = baseURI, media_type = media_type, rdfa_version = rdfa_version)
+            processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput = rdfOutput)
+        finally :
+            sys.path.pop(0)
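Once the plugin registrations elsewhere in this commit are in place, the parser above is reached through the standard RDFLib entry point; a sketch, with a made-up file name (extra keywords such as media_type are simply passed through by Graph.parse)::

    from rdflib import Graph

    g = Graph()
    # 'page.html' is a hypothetical local file containing RDFa 1.1 markup
    g.parse("page.html", format="rdfa1.1", media_type="text/html")
    for s, p, o in g :
        print(s, p, o)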
+
+class MicrodataParser(Parser) :
+    def parse(self, source, graph,
+              vocab_expansion = False,
+              vocab_cache     = False,
+              rdfOutput       = False) :
+        """
+        @param source: one of the input sources that the RDFLib package defined
+        @type source: InputSource class instance
+        @param graph: target graph for the triples; output graph, in the spec. parlance
+        @type graph: RDFLib Graph
+        @keyword vocab_expansion: whether vocabulary expansion should also be performed (see the RDFa 1.1 spec for further details)
+        @type vocab_expansion: Boolean
+        @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires the ability for the local application to write on the local file system
+        @type vocab_cache: Boolean
+        @keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
+        @type rdfOutput: Boolean
+        """
+        # We need a dynamic way of setting the import path. That ensures that the core of the pyMicrodata module
+        # can be developed independently of the rdflib package
+        path = imp.find_module('rdflib')[1]
+        sys.path.insert(0,path+'/plugins/parsers')
+
+        try:
+            from pyMicrodata import pyMicrodata
+
+            if isinstance(source, StringInputSource) :
+                orig_source = source.getByteStream()
+            elif isinstance(source, URLInputSource) :
+                orig_source = source.url
+            elif isinstance(source, FileInputSource) :
+                orig_source = source.file.name
+                source.file.close()
+
+            baseURI   = source.getPublicId()
+            processor = pyMicrodata(base = baseURI)
+            processor.graph_from_source(orig_source, graph=graph, rdfOutput = rdfOutput)
+        finally :
+            sys.path.pop(0)
+
+class StructuredDataParser(Parser) :
+    def parse(self, source, graph,
+              pgraph          = None,
+              embedded_rdf    = True,
+              vocab_expansion = False,
+              vocab_cache     = False,
+              rdfOutput       = False) :
+        """
+        @param source: one of the input sources that the RDFLib package defined
+        @type source: InputSource class instance
+        @param graph: target graph for the triples; output graph, in RDFa spec. parlance
+        @type graph: RDFLib Graph
+        @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored
+        @type pgraph: RDFLib Graph
+        @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain Turtle in a special <script> element, SVG can have RDF/XML embedded in a <metadata> element. This flag controls whether those triples should be interpreted and added to the output graph. Some languages (e.g., SVG) require this, and for those the flag is ignored.
+        @type embedded_rdf: Boolean
+        @keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary expansion (see the RDFa 1.1 spec for further details)
+        @type vocab_expansion: Boolean
+        @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires the ability for the local application to write on the local file system
+        @type vocab_cache: Boolean
+        @keyword rdfOutput: whether Exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
+        @type rdfOutput: Boolean
+        """
+        # We need a dynamic way of setting the import path. That ensures that the core of the pyRdfa and pyMicrodata modules
+        # can be developed independently of the rdflib package
+        path = imp.find_module('rdflib')[1]
+        sys.path.insert(0,path+'/plugins/parsers')
+        try:
+            if isinstance(source, StringInputSource) :
+                orig_source = source.getByteStream()
+            elif isinstance(source, URLInputSource) :
+                orig_source = source.url
+            elif isinstance(source, FileInputSource) :
+                orig_source = source.file.name
+                source.file.close()
+            baseURI = source.getPublicId()
+
+            # The RDFa part
+            from pyRdfa import pyRdfa, Options
+            self.options = Options(output_processor_graph = (pgraph != None),
+                                   embedded_rdf    = embedded_rdf,
+                                   vocab_expansion = vocab_expansion,
+                                   vocab_cache     = vocab_cache)
+
+            processor = pyRdfa(self.options, base = baseURI, media_type = 'text/html', rdfa_version = '1.1')
+            processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput = rdfOutput)
+
+            # The Microdata part
+            from pyMicrodata import pyMicrodata
+            processor = pyMicrodata(base = baseURI)
+            processor.graph_from_source(orig_source, graph=graph, rdfOutput = rdfOutput)
+        finally :
+            sys.path.pop(0)
@@ -105,6 +105,12 @@ setup(
     'rdflib/plugins/parsers',
     'rdflib/plugins/parsers/rdfa',
    'rdflib/plugins/parsers/rdfa/transform',
+    'rdflib/plugins/parsers/pyRdfa',
+    'rdflib/plugins/parsers/pyRdfa/transform',
+    'rdflib/plugins/parsers/pyRdfa/extras',
+    'rdflib/plugins/parsers/pyRdfa/host',
+    'rdflib/plugins/parsers/pyRdfa/rdfs',
+    'rdflib/plugins/parsers/pyMicrodata',
     'rdflib/plugins/serializers',
     ],
 **kwargs
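Finally, a sketch of driving the combined parser: with the 'html' and 'microdata' format names registered in this commit, one Graph.parse call collects both the RDFa 1.1 and the microdata triples of a page (the URL is illustrative, and an HTML5 parser must be available at run time)::

    from rdflib import Graph

    g = Graph()
    # RDFa 1.1 and microdata from the same page, merged into one graph
    g.parse("http://example.org/index.html", format="html")

    md = Graph()
    # microdata only, via the MicrodataParser registration
    md.parse("http://example.org/index.html", format="microdata")

    print(g.serialize(format="turtle"))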