diff options
-rw-r--r-- | pagetemplatefile.py | 119 | ||||
-rw-r--r-- | tests/test_ptfile.py | 198 | ||||
-rw-r--r-- | typesniffer.py | 64 |
3 files changed, 381 insertions, 0 deletions
# ----------------------------------------------------------------------------
# diff --git a/pagetemplatefile.py b/pagetemplatefile.py
# new file mode 100644
# index 0000000..3517f5f
# --- /dev/null
# +++ b/pagetemplatefile.py
# @@ -0,0 +1,119 @@
# ----------------------------------------------------------------------------
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Filesystem Page Template module

Zope object encapsulating a Page Template from the filesystem.

$Id$
"""

__all__ = ("PageTemplateFile",)

import os
import sys
import re
import logging

from zope.pagetemplate.pagetemplate import PageTemplate

from typesniffer import sniff_type
from typesniffer import XML_PREFIX_MAX_LENGTH

# Encoding assumed for HTML templates that carry no <meta> charset declaration.
DEFAULT_ENCODING = "utf-8"

# Matches an HTML ``<meta http-equiv="Content-Type" content="...; charset=...">``
# tag (plus surrounding whitespace).  Group 1 is the content type, group 2 the
# charset.
meta_pattern = re.compile(
    r'\s*<meta\s+http-equiv=["\']?Content-Type["\']?'
    r'\s+content=["\']?([^;]+);\s*charset=([^"\']+)["\']?\s*>\s*',
    re.IGNORECASE)


def package_home(gdict):
    """Return the directory of the module whose globals dict is *gdict*.

    Raises KeyError if *gdict* has no ``__file__`` entry.
    """
    filename = gdict["__file__"]
    return os.path.dirname(filename)


class PageTemplateFile(PageTemplate):
    """Zope wrapper for filesystem Page Template using TAL, TALES, and METAL

    The template source is loaded from ``self.filename`` by `_cook_check`
    and re-read when the file's modification time changes.
    """

    # Modification time of the file when it was last read successfully;
    # 0 means "never read".
    _v_last_read = 0

    def __init__(self, filename, _prefix=None):
        """Bind the template to *filename*, resolved relative to *_prefix*.

        *_prefix* may be a directory path (``str``), a module globals dict,
        or None; see `get_path_from_prefix`.

        Raises ValueError if the resulting path is not an existing file.
        """
        path = self.get_path_from_prefix(_prefix)
        self.filename = os.path.join(path, filename)
        if not os.path.isfile(self.filename):
            raise ValueError("No such file", self.filename)

    def get_path_from_prefix(self, _prefix):
        """Return the base directory designated by *_prefix*.

        A ``str`` is returned unchanged.  Anything else is treated as a
        module globals dict and mapped to that module's directory.  When
        *_prefix* is None the globals of the frame two levels above this
        method are used (the caller of ``__init__`` when invoked from there).
        """
        if isinstance(_prefix, str):
            return _prefix
        if _prefix is None:
            _prefix = sys._getframe(2).f_globals
        return package_home(_prefix)

    def _prepare_html(self, text):
        """Decode HTML source and strip its Content-Type <meta> tag.

        Returns ``(decoded_text, content_type)``.  The content type and
        encoding come from the first matching <meta> tag; without one the
        content type is None and DEFAULT_ENCODING is used.
        """
        match = meta_pattern.search(text)
        if match is not None:
            content_type, encoding = match.groups()
            # TODO: Shouldn't <meta>/<?xml?> stripping
            # be in PageTemplate.__call__()?
            text = meta_pattern.sub("", text)
        else:
            content_type = None
            encoding = DEFAULT_ENCODING
        return unicode(text, encoding), content_type

    def _read_file(self):
        """Read the template file and return ``(text, content_type)``.

        The first XML_PREFIX_MAX_LENGTH bytes are sniffed.  XML input is
        returned as the raw bytes read in binary mode; anything else is
        re-read in text mode and passed through `_prepare_html`.

        Every file handle is closed on all paths, including when sniffing,
        reading or decoding raises.
        """
        # NOTE(review): presumably picked up by Zope's traceback formatting
        # -- confirm before removing.
        __traceback_info__ = self.filename
        f = open(self.filename, "rb")
        try:
            text = f.read(XML_PREFIX_MAX_LENGTH)
            type_ = sniff_type(text)
            if type_ == "text/xml":
                text += f.read()
                return text, type_
        finally:
            f.close()
        # For HTML, we really want the file read in text mode:
        f = open(self.filename)
        try:
            text = f.read()
        finally:
            f.close()
        return self._prepare_html(text)

    def _cook_check(self):
        """(Re)load and cook the template if the file has changed.

        Once a template has been read, nothing is re-checked when Python
        runs with optimization (``__debug__`` is false).  Otherwise the file
        is re-read unless a compiled program exists and the modification
        time is unchanged.  Cooking errors are logged and leave
        ``_v_last_read`` untouched, so the next call tries again.
        """
        if self._v_last_read and not __debug__:
            return
        __traceback_info__ = self.filename
        try:
            mtime = os.path.getmtime(self.filename)
        except OSError:
            # Unreadable/missing file: fall through with a sentinel mtime.
            mtime = 0
        if self._v_program is not None and mtime == self._v_last_read:
            return
        text, content_type = self._read_file()
        self.pt_edit(text, content_type)
        self._cook()
        if self._v_errors:
            logging.error('PageTemplateFile: Error in template: %s',
                          '\n'.join(self._v_errors))
            return
        self._v_last_read = mtime

    def pt_source_file(self):
        """Return the path of the file backing this template."""
        return self.filename

    def __getstate__(self):
        """Refuse pickling: always raises TypeError."""
        raise TypeError("non-picklable object")


# ----------------------------------------------------------------------------
# diff --git a/tests/test_ptfile.py b/tests/test_ptfile.py
# new file mode 100644
# index 0000000..6d8d5fc
# --- /dev/null
# +++ b/tests/test_ptfile.py
# @@ -0,0 +1,198 @@
# ----------------------------------------------------------------------------
##############################################################################
#
# Copyright (c) 2004 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Tests of PageTemplateFile.

$Id$
"""
import os
import tempfile
import unittest

from zope.pagetemplate.pagetemplatefile import PageTemplateFile
from zope.pagetemplate.typesniffer import sniff_type


class TypeSniffingTestCase(unittest.TestCase):
    """Check content-type sniffing and decoding of file-based templates."""

    # Scratch file each test writes its template source to; removed again
    # in tearDown.
    TEMPFILENAME = tempfile.mktemp()

    def tearDown(self):
        if os.path.exists(self.TEMPFILENAME):
            os.unlink(self.TEMPFILENAME)

    def get_pt(self, text):
        """Write *text* to the scratch file and return a template read from it."""
        f = open(self.TEMPFILENAME, "wb")
        try:
            f.write(text)
        finally:
            f.close()
        pt = PageTemplateFile(self.TEMPFILENAME)
        pt.read()
        return pt

    def check_content_type(self, text, expected_type):
        """Assert that a template holding *text* reports *expected_type*."""
        pt = self.get_pt(text)
        self.assertEqual(pt.content_type, expected_type)

    def test_sniffer_xml_ascii(self):
        self.check_content_type(
            "<?xml version='1.0' encoding='ascii'?><doc/>",
            "text/xml")
        self.check_content_type(
            "<?xml\tversion='1.0' encoding='ascii'?><doc/>",
            "text/xml")

    def test_sniffer_xml_utf8(self):
        # w/out byte order mark
        self.check_content_type(
            "<?xml version='1.0' encoding='utf-8'?><doc/>",
            "text/xml")
        self.check_content_type(
            "<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
            "text/xml")
        # with byte order mark
        self.check_content_type(
            "\xef\xbb\xbf<?xml version='1.0' encoding='utf-8'?><doc/>",
            "text/xml")
        self.check_content_type(
            "\xef\xbb\xbf<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
            "text/xml")

    # NOTE(review): in the UTF-16 fixtures below ``\01`` is a single octal
    # escape (chr(1)), not NUL followed by "1".  Only the leading bytes are
    # compared against the XML prefixes, so the literals are left as they are.

    def test_sniffer_xml_utf16_be(self):
        # w/out byte order mark
        self.check_content_type(
            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
            "\0<\0d\0o\0c\0/\0>",
            "text/xml")
        self.check_content_type(
            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
            "\0<\0d\0o\0c\0/\0>",
            "text/xml")
        # with byte order mark
        self.check_content_type(
            "\xfe\xff"
            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
            "\0<\0d\0o\0c\0/\0>",
            "text/xml")
        self.check_content_type(
            "\xfe\xff"
            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
            "\0<\0d\0o\0c\0/\0>",
            "text/xml")

    def test_sniffer_xml_utf16_le(self):
        # w/out byte order mark
        self.check_content_type(
            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
            "<\0d\0o\0c\0/\0>\n",
            "text/xml")
        self.check_content_type(
            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
            "<\0d\0o\0c\0/\0>\0",
            "text/xml")
        # with byte order mark
        self.check_content_type(
            "\xff\xfe"
            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
            "<\0d\0o\0c\0/\0>\0",
            "text/xml")
        self.check_content_type(
            "\xff\xfe"
            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
            "<\0d\0o\0c\0/\0>\0",
            "text/xml")

    HTML_PUBLIC_ID = "-//W3C//DTD HTML 4.01 Transitional//EN"
    HTML_SYSTEM_ID = "http://www.w3.org/TR/html4/loose.dtd"

    def test_sniffer_html_ascii(self):
        self.check_content_type(
            "<!DOCTYPE html [ SYSTEM '%s' ]><html></html>"
            % self.HTML_SYSTEM_ID,
            "text/html")
        self.check_content_type(
            "<html><head><title>sample document</title></head></html>",
            "text/html")

    # TODO: This reflects a case that simply isn't handled by the
    # sniffer; there are many, but it gets it right more often than
    # before.
    def donttest_sniffer_xml_simple(self):
        self.check_content_type("<doc><element/></doc>",
                                "text/xml")

    def test_html_default_encoding(self):
        pt = self.get_pt(
            "<html><head><title>"
            # 'Test' in russian (utf-8)
            "\xd0\xa2\xd0\xb5\xd1\x81\xd1\x82"
            "</title></head></html>")
        rendered = pt()
        self.assertTrue(isinstance(rendered, unicode))
        self.assertEqual(rendered,
                         u"<html><head><title>"
                         u"\u0422\u0435\u0441\u0442"
                         u"</title></head></html>\n")

    def test_html_encoding_by_meta(self):
        pt = self.get_pt(
            "<html><head><title>"
            # 'Test' in russian (windows-1251)
            "\xd2\xe5\xf1\xf2"
            '</title><meta http-equiv="Content-Type"'
            ' content="text/html; charset=windows-1251">'
            "</head></html>")
        rendered = pt()
        self.assertTrue(isinstance(rendered, unicode))
        self.assertEqual(rendered,
                         u"<html><head><title>"
                         u"\u0422\u0435\u0441\u0442"
                         u"</title></head></html>\n")

    ##def test_xml_sniffing_from_extension(self):
    ##    # This checks the extension of the page template
    ##    this_directory = os.path.split(__file__)[0]
    ##    filepath = os.path.join(
    ##        this_directory,
    ##        'test.xpt')
    ##    xpt = PageTemplateFile(filepath)
    ##    self.assert_(os.path.normcase(xpt.filename).endswith('.xpt'))
    ##    text, type_ = xpt._read_file()
    ##    self.assertEqual(type_, 'text/xml')

    def test_type_sniffing_based_on_xmlns(self):
        # sniff_type is imported at module level.
        self.assertEqual(
            sniff_type("<doc><element/></doc>"), None)
        self.assertEqual(
            sniff_type("<doc xmlns=''><element/></doc>"), 'text/xml')
        self.assertEqual(
            sniff_type("<doc><element xmlns=''/></doc>"), 'text/xml')
        self.assertEqual(
            sniff_type("<doc xmlns='http://foo/bar'><element/></doc>"),
            'text/xml')
        self.assertEqual(
            sniff_type("<doc ><element xmlns='http://foo/bar'/></doc>"),
            'text/xml')


def test_suite():
    """Return the suite containing every TypeSniffingTestCase test."""
    return unittest.makeSuite(TypeSniffingTestCase)


if __name__ == "__main__":
    unittest.main(defaultTest="test_suite")

# ----------------------------------------------------------------------------
# diff --git a/typesniffer.py b/typesniffer.py
# new file mode 100644
# index 0000000..6deeb99
# --- /dev/null
# +++ b/typesniffer.py
# @@ -0,0 +1,64 @@
# ----------------------------------------------------------------------------
##############################################################################
#
# Copyright (c) 2005 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Type sniffer for page template input

$Id$
"""

import xml.parsers.expat

# Leading character sequences of an XML declaration, one entry per
# encoding / byte-order-mark combination that is recognised.
XML_PREFIXES = [
    "<?xml",                      # ascii, utf-8
    "\xef\xbb\xbf<?xml",          # utf-8 w/ byte order mark
    "\0<\0?\0x\0m\0l",            # utf-16 big endian
    "<\0?\0x\0m\0l\0",            # utf-16 little endian
    "\xfe\xff\0<\0?\0x\0m\0l",    # utf-16 big endian w/ byte order mark
    "\xff\xfe<\0?\0x\0m\0l\0",    # utf-16 little endian w/ byte order mark
    ]

# Length of the longest entry in XML_PREFIXES.
XML_PREFIX_MAX_LENGTH = max([len(candidate) for candidate in XML_PREFIXES])


class NamespaceFound(Exception):
    """Raised from the expat callback to abort parsing once a namespace
    declaration has been seen."""


def StartNamespaceDeclHandler(prefix, url):
    """Expat callback for a namespace declaration: stop the parse at once."""
    raise NamespaceFound


def sniff_type(text):
    """Return 'text/xml' if text appears to be XML, otherwise return None.

    Text counts as XML when either of these holds:

    o it starts with an xml processing header <?xml ... ?>
      (any entry of XML_PREFIXES)
    o it contains a namespace declaration
    """
    # An XML declaration settles it without running the parser.
    if text.startswith(tuple(XML_PREFIXES)):
        return "text/xml"

    # Otherwise let expat scan for a namespace declaration; the handler
    # raises NamespaceFound on the first one it meets.
    ns_parser = xml.parsers.expat.ParserCreate(namespace_separator=' ')
    ns_parser.StartNamespaceDeclHandler = StartNamespaceDeclHandler
    try:
        ns_parser.Parse(text)
    except NamespaceFound:
        return "text/xml"
    except xml.parsers.expat.ExpatError:
        # Malformed input is simply "not XML" for our purposes.
        pass
    return None