# -*- coding: utf-8 -*-
"""
Various utilities for pyRdfa.

Most of the utilities are straightforward.

@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}


"""

"""
$Id: utils.py,v 1.9 2012/11/16 17:51:53 ivan Exp $
$Date: 2012/11/16 17:51:53 $
"""
import sys, datetime

from six.moves.urllib.request import Request, urlopen
from six.moves.urllib.parse import urljoin, quote
from six.moves.urllib.error import HTTPError
from six.moves.BaseHTTPServer import BaseHTTPRequestHandler
from six import text_type

from .extras.httpheader import content_type, parse_http_datetime

from .host import HostLanguage, preferred_suffixes

#########################################################################################################
# Handling URIs
class URIOpener :
	"""A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself, the class
	sets a number of instance variable that might be relevant for processing.
	The class also adds an accept header to the outgoing request, namely
	text/html and application/xhtml+xml (unless set explicitly by the caller).

	If the content type is set by the server, the relevant HTTP response field is used. Otherwise,
	common suffixes are used (see L{host.preferred_suffixes}) to set the content type (this is really of importance
	for C{file:///} URI-s). If none of these works, the content type is empty.

	Interpretation of the content type for the return is done by Deron Meranda's U{httpheader module<http://deron.meranda.us/>}.

	@ivar data: the real data, ie, a file-like object
	@ivar headers: the return headers as sent back by the server
	@ivar content_type: the content type of the resource or the empty string, if the content type cannot be determined
	@ivar location: the real location of the data (ie, after possible redirection and content negotiation)
	@ivar last_modified_date: sets the last modified date if set in the header, None otherwise
	@ivar expiration_date: sets the expiration date if set in the header, I{current UTC plus one day} otherwise (this is used for caching purposes, hence this artificial setting)
	"""
	CONTENT_LOCATION	= 'Content-Location'
	CONTENT_TYPE		= 'Content-Type'
	LAST_MODIFIED		= 'Last-Modified'
	EXPIRES				= 'Expires'
	def __init__(self, name, additional_headers = {}) :
		"""
		@param name: URL to be opened
		@keyword additional_headers: additional HTTP request headers to be added to the call
		"""
		try :
			# Note the removal of the fragment ID. This is necessary, per the HTTP spec
			req = Request(url=name.split('#')[0])

			for key in additional_headers :
				req.add_header(key, additional_headers[key])
			if 'Accept' not in additional_headers :
				req.add_header('Accept', 'text/html, application/xhtml+xml')

			self.data		= urlopen(req)
			self.headers	= self.data.info()

			if URIOpener.CONTENT_TYPE in self.headers :
				# The call below will remove the possible media type parameters, like charset settings
				ct = content_type(self.headers[URIOpener.CONTENT_TYPE])
				self.content_type = ct.media_type
				if 'charset' in ct.parmdict :
					self.charset = ct.parmdict['charset']
				else :
					self.charset = None
			else :
				# check if the suffix can be used for the content type; this may be important
				# for file:// type URI or if the server is not properly set up to return the right
				# mime type
				self.charset = None
				self.content_type = ""
				for suffix in preferred_suffixes.keys() :
					if name.endswith(suffix) :
						self.content_type = preferred_suffixes[suffix]
						break

			if URIOpener.CONTENT_LOCATION in self.headers :
				self.location = urljoin(self.data.geturl(),self.headers[URIOpener.CONTENT_LOCATION])
			else :
				self.location = name

			self.expiration_date = datetime.datetime.utcnow() + datetime.timedelta(days=1)
			if URIOpener.EXPIRES in self.headers :
				try :
					# Thanks to Deron Meranda for the HTTP date conversion method...
					self.expiration_date = parse_http_datetime(self.headers[URIOpener.EXPIRES])
				except :
					# The Expires date format was wrong, sorry, forget it...
					pass

			self.last_modified_date = None
			if URIOpener.LAST_MODIFIED in self.headers :
				try :
					# Thanks to Deron Meranda for the HTTP date conversion method...
					self.last_modified_date = parse_http_datetime(self.headers[URIOpener.LAST_MODIFIED])
				except :
					# The last modified date format was wrong, sorry, forget it...
					pass

		except HTTPError :
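			# This is urllib's HTTPError; it is converted below into pyRdfa's own HTTPError exception.
			# (Note that the local import shadows the caught exception class; at this point the name
			# is only needed for the raise below, so the shadowing is harmless.)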
			e = sys.exc_info()[1]
			from . import HTTPError
			msg = BaseHTTPRequestHandler.responses[e.code]
			raise HTTPError('%s' % msg[1], e.code)
		except Exception :
			e = sys.exc_info()[1]
			from . import RDFaError
			raise RDFaError('%s' % e)
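
# A minimal usage sketch for URIOpener (illustrative only; the URL and the extra header below are
# arbitrary examples, and network access is needed for the call to succeed):
#
#   opener  = URIOpener('http://www.w3.org/People/Ivan/', {'Accept-Language' : 'en'})
#   print(opener.content_type)      # media type, eg, 'text/html'
#   print(opener.location)          # final location, after possible redirections
#   content = opener.data.read()    # the payload itself, read from the file-like object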

#########################################################################################################

# 'safe' characters for the URI quoting, ie, characters that can safely stay as they are. Other
# special characters are converted to their %.. equivalents for namespace prefixes
_unquotedChars = ':/\?=#~'
_warnChars     = [' ','\n','\r','\t']

def quote_URI(uri, options = None) :
	"""
	'Quote' a URI, ie, exchange special characters for their '%..' equivalents. Some of the characters
	may stay as they are (those listed in L{_unquotedChars}). If one of the characters listed in L{_warnChars}
	also appears in the URI, a warning is generated.
	@param uri: URI
	@param options: the runtime options; if set, a warning is registered on it via C{add_warning}
	@type options: L{Options<pyRdfa.Options>}
	"""
	from . import err_unusual_char_in_URI
	suri = uri.strip()
	for c in _warnChars :
		if suri.find(c) != -1 :
			if options != None :
				options.add_warning(err_unusual_char_in_URI % suri)
			break
	return quote(suri, _unquotedChars)
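
# A small sketch of what quote_URI does (illustrative value only):
#
#   quote_URI('http://example.org/a b')
#   # -> 'http://example.org/a%20b'; if an Options instance is passed as the second
#   #    argument, a warning is also registered because of the embedded space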

#########################################################################################################

def create_file_name(uri) :
	"""
	Create a suitable file name from an (absolute) URI. Used, eg, for the generation of a file name for a cached vocabulary file.
	"""
	suri = uri.strip()
	final_uri = quote(suri,_unquotedChars)
	# Remove some potentially dangerous characters
	return final_uri.replace(' ','_').replace('%','_').replace('-','_').replace('+','_').replace('/','_').replace('?','_').replace(':','_').replace('=','_').replace('#','_')
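
# A small sketch of what create_file_name produces (illustrative value only):
#
#   create_file_name('http://www.w3.org/2011/rdfa-context/rdfa-1.1')
#   # -> 'http___www.w3.org_2011_rdfa_context_rdfa_1.1'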

#########################################################################################################
def has_one_of_attributes(node,*args) :
	"""
	Check whether one of the listed attributes is present on a (DOM) node.
	@param node: DOM element node
	@param args: possible attribute names
	@return: True or False (None if no attribute names were given)
	@rtype: Boolean
	"""
	if len(args) == 0 :
		return None
	if isinstance(args[0], tuple) or isinstance(args[0], list) :
		rargs = args[0]
	else :
		rargs = args

	return True in [ node.hasAttribute(attr) for attr in rargs ]
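
# A small sketch of has_one_of_attributes on a minidom element (illustrative markup only);
# attribute names can be given either as separate arguments or as a single list/tuple:
#
#   from xml.dom.minidom import parseString
#   div = parseString('<div about="#me" typeof="foaf:Person"/>').documentElement
#   has_one_of_attributes(div, 'about', 'resource', 'href', 'src')   # -> True
#   has_one_of_attributes(div, ['rel', 'rev'])                       # -> False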

#########################################################################################################
def traverse_tree(node, func) :
	"""Traverse the whole element tree, and perform the function C{func} on all the elements.
	@param node: DOM element node
	@param func: function to be called on the node. Input parameter is a DOM Element Node. If the function returns a boolean True, the recursion is stopped.
	"""
	if func(node) :
		return

	for n in node.childNodes :
		if n.nodeType == node.ELEMENT_NODE :
			traverse_tree(n, func)
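
# A small sketch of traverse_tree (illustrative markup and callback only): the callback below
# removes every @property attribute in the subtree, and returns False so that recursion continues.
#
#   from xml.dom.minidom import parseString
#   dom = parseString('<root><a property="dc:title">T</a><b/></root>')
#   def remove_property(node) :
#       if node.hasAttribute('property') : node.removeAttribute('property')
#       return False
#   traverse_tree(dom.documentElement, remove_property)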

#########################################################################################################
def return_XML(state, inode, base = True, xmlns = True) :
	"""
	Get (recursively) the XML Literal content of a DOM Element Node. (Most of the processing is done
	via a C{node.toxml} call of the xml minidom implementation.)

	@param inode: DOM Node
	@param state: L{pyRdfa.state.ExecutionContext}
	@param base: whether the base element should be added to the output
	@type base: Boolean
	@param xmlns: whether the namespace declarations should be repeated in the generated node
	@type xmlns: Boolean
	@return: string
	"""
	node = inode.cloneNode(True)
	# Decorate the element with namespaces values and, optionally, base
	if base :
		node.setAttribute("xml:base",state.base)
	if xmlns :
		for prefix in state.term_or_curie.xmlns :
			if not node.hasAttribute("xmlns:%s" % prefix) :
				node.setAttribute("xmlns:%s" % prefix,"%s" % state.term_or_curie.xmlns[prefix])
		# Set the default namespace, if not done (and is available)
		if not node.getAttribute("xmlns") and state.defaultNS != None :
			node.setAttribute("xmlns", state.defaultNS)

	if sys.version_info[0] >= 3 :
		return node.toxml()
	else :
		q = node.toxml(encoding='utf-8')
		return text_type(q, encoding='utf-8')
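
# A minimal sketch of how return_XML is typically called (illustrative only: 'state' stands for the
# L{pyRdfa.state.ExecutionContext} instance of the current subtree, which is not available outside
# the parser itself):
#
#   xml_literal = return_XML(state, node, base = False, xmlns = True)
#   # -> the serialized XML of 'node', with the active xmlns declarations (and, optionally, xml:base) added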

#########################################################################################################

def dump(node) :
	"""
	This is for debugging purposes only: it prints the essential content of the subtree rooted at C{node}.

	@param node: DOM node
	"""
	print( node.toprettyxml(indent="", newl="") )