3 files changed, 381 insertions, 0 deletions
diff --git a/pagetemplatefile.py b/pagetemplatefile.py
new file mode 100644
index 0000000..3517f5f
--- /dev/null
+++ b/pagetemplatefile.py
@@ -0,0 +1,119 @@
+##############################################################################
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Filesystem Page Template module
+
+Zope object encapsulating a Page Template from the filesystem.
+
+$Id$
+"""
+
+__all__ = ("PageTemplateFile",)
+
+import os
+import sys
+import re
+import logging
+
+from zope.pagetemplate.pagetemplate import PageTemplate
+
+from typesniffer import sniff_type
+from typesniffer import XML_PREFIX_MAX_LENGTH
+
+DEFAULT_ENCODING = "utf-8"  # used when no charset <meta> tag is found
+
+meta_pattern = re.compile(
+    r'\s*<meta\s+http-equiv=["\']?Content-Type["\']?'
+    r'\s+content=["\']?([^;]+);\s*charset=([^"\']+)["\']?\s*>\s*',
+    re.IGNORECASE)  # groups: (content type, charset)
+
+def package_home(gdict):
+    "Return the directory of the module whose globals dict is gdict."
+    return os.path.dirname(gdict["__file__"])
+
+class PageTemplateFile(PageTemplate):
+    "Zope wrapper for filesystem Page Template using TAL, TALES, and METAL"
+
+    _v_last_read = 0  # file mtime stored after the last error-free cook
+
+    def __init__(self, filename, _prefix=None):
+        "Join filename onto the _prefix directory; ValueError if no file."
+        path = self.get_path_from_prefix(_prefix)
+        self.filename = os.path.join(path, filename)
+        if not os.path.isfile(self.filename):
+            raise ValueError("No such file", self.filename)
+
+    def get_path_from_prefix(self, _prefix):
+        "Return _prefix if it is a str, else package_home() of that dict."
+        if isinstance(_prefix, str):
+            return _prefix
+        if _prefix is None:
+            _prefix = sys._getframe(2).f_globals  # __init__'s caller
+        return package_home(_prefix)
+
+    def _prepare_html(self, text):
+        "Decode text; return (unicode, <meta> content type or None)."
+        match = meta_pattern.search(text)
+        if match is None:
+            return unicode(text, DEFAULT_ENCODING), None
+        content_type, encoding = match.groups()
+        # TODO: Shouldn't <meta>/<?xml?> stripping
+        # be in PageTemplate.__call__()?
+        text = meta_pattern.sub("", text)
+        return unicode(text, encoding), content_type
+
+    def _read_file(self):
+        "Return (text, content type) for self.filename; files always closed."
+        __traceback_info__ = self.filename
+        f = open(self.filename, "rb")
+        try:
+            text = f.read(XML_PREFIX_MAX_LENGTH)
+            type_ = sniff_type(text)
+            if type_ == "text/xml":
+                return text + f.read(), type_
+        finally:
+            f.close()
+        # For HTML, we really want the file read in text mode:
+        f = open(self.filename)
+        try:
+            text = f.read()
+        finally:
+            f.close()
+        return self._prepare_html(text)
+
+    def _cook_check(self):
+        "Re-read and cook the file unless unchanged since the last cook."
+        # Under python -O a loaded template is never re-checked.
+        if self._v_last_read and not __debug__:
+            return
+        __traceback_info__ = self.filename
+        try:
+            mtime = os.path.getmtime(self.filename)
+        except OSError:
+            mtime = 0
+        if self._v_program is not None and mtime == self._v_last_read:
+            return
+        text, content_type = self._read_file()
+        self.pt_edit(text, content_type)
+        self._cook()
+        if self._v_errors:
+            logging.error('PageTemplateFile: Error in template: %s',
+                '\n'.join(self._v_errors))
+            return  # _v_last_read is left untouched when cooking failed
+        self._v_last_read = mtime
+
+    def pt_source_file(self):
+        return self.filename  # path the template source was loaded from
+
+    def __getstate__(self):
+        raise TypeError("non-picklable object")  # pickling is refused
diff --git a/tests/test_ptfile.py b/tests/test_ptfile.py
new file mode 100644
index 0000000..6d8d5fc
--- /dev/null
+++ b/tests/test_ptfile.py
@@ -0,0 +1,198 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Tests of PageTemplateFile.
+
+$Id$
+"""
+import os
+import tempfile
+import unittest
+
+from zope.pagetemplate.pagetemplatefile import PageTemplateFile
+from zope.pagetemplate.typesniffer import sniff_type
+
+class TypeSniffingTestCase(unittest.TestCase):
+    "Content-type sniffing and HTML decoding tests for PageTemplateFile."
+
+    TEMPFILENAME = tempfile.mktemp()  # NOTE: mktemp() names are race-prone
+
+    def tearDown(self):
+        if os.path.exists(self.TEMPFILENAME):
+            os.unlink(self.TEMPFILENAME)
+
+    def get_pt(self, text):  # template loaded from a temp file holding text
+        f = open(self.TEMPFILENAME, "wb")
+        f.write(text)
+        f.close()
+        pt = PageTemplateFile(self.TEMPFILENAME)
+        pt.read()
+        return pt
+
+    def check_content_type(self, text, expected_type):
+        pt = self.get_pt(text)
+        self.assertEqual(pt.content_type, expected_type)
+
+    def test_sniffer_xml_ascii(self):
+        self.check_content_type(
+            "<?xml version='1.0' encoding='ascii'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "<?xml\tversion='1.0' encoding='ascii'?><doc/>",
+            "text/xml")
+
+    def test_sniffer_xml_utf8(self):
+        # without byte order mark
+        self.check_content_type(
+            "<?xml version='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xef\xbb\xbf<?xml version='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "\xef\xbb\xbf<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+
+    def test_sniffer_xml_utf16_be(self):
+        # without byte order mark ("\0001" is NUL + "1"; "\01" would be 0x01)
+        self.check_content_type(
+            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        self.check_content_type(
+            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xfe\xff"
+            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        self.check_content_type(
+            "\xfe\xff"
+            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+
+    def test_sniffer_xml_utf16_le(self):
+        # without byte order mark ("\0001" is NUL + "1"; "\01" would be 0x01)
+        self.check_content_type(
+            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        self.check_content_type(
+            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xff\xfe"
+            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        self.check_content_type(
+            "\xff\xfe"
+            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\0001\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+
+    HTML_PUBLIC_ID = "-//W3C//DTD HTML 4.01 Transitional//EN"
+    HTML_SYSTEM_ID = "http://www.w3.org/TR/html4/loose.dtd"
+
+    def test_sniffer_html_ascii(self):
+        self.check_content_type(
+            "<!DOCTYPE html [ SYSTEM '%s' ]><html></html>"
+            % self.HTML_SYSTEM_ID,
+            "text/html")
+        self.check_content_type(
+            "<html><head><title>sample document</title></head></html>",
+            "text/html")
+
+    # TODO: This reflects a case that simply isn't handled by the
+    # sniffer; there are many, but it gets it right more often than
+    # before.
+    def donttest_sniffer_xml_simple(self):
+        self.check_content_type("<doc><element/></doc>",
+                                "text/xml")
+
+    def test_html_default_encoding(self):
+        pt = self.get_pt(
+            "<html><head><title>"
+            # 'Test' in russian (utf-8)
+            "\xd0\xa2\xd0\xb5\xd1\x81\xd1\x82"
+            "</title></head></html>")
+        rendered = pt()
+        self.failUnless(isinstance(rendered, unicode))
+        self.failUnlessEqual(rendered,
+            u"<html><head><title>"
+            u"\u0422\u0435\u0441\u0442"
+            u"</title></head></html>\n")
+
+    def test_html_encoding_by_meta(self):
+        pt = self.get_pt(
+            "<html><head><title>"
+            # 'Test' in russian (windows-1251)
+            "\xd2\xe5\xf1\xf2"
+            '</title><meta http-equiv="Content-Type"'
+            ' content="text/html; charset=windows-1251">'
+            "</head></html>")
+        rendered = pt()
+        self.failUnless(isinstance(rendered, unicode))
+        self.failUnlessEqual(rendered,
+            u"<html><head><title>"
+            u"\u0422\u0435\u0441\u0442"
+            u"</title></head></html>\n")
+
+    ##def test_xml_sniffing_from_extension(self):
+    ##    # This checks the extension of the page template
+    ##    this_directory = os.path.split(__file__)[0]
+    ##    filepath = os.path.join(
+    ##        this_directory,
+    ##        'test.xpt')
+    ##    xpt = PageTemplateFile(filepath)
+    ##    self.assert_(os.path.normcase(xpt.filename).endswith('.xpt'))
+    ##    text, type_ = xpt._read_file()
+    ##    self.assertEqual(type_, 'text/xml')
+
+    def test_type_sniffing_based_on_xmlns(self):
+        self.assertEqual(
+            sniff_type("<doc><element/></doc>"), None)
+        self.assertEqual(
+            sniff_type("<doc xmlns=''><element/></doc>"), 'text/xml')
+        self.assertEqual(
+            sniff_type("<doc><element xmlns=''/></doc>"), 'text/xml')
+        self.assertEqual(
+            sniff_type("<doc xmlns='http://foo/bar'><element/></doc>"),
+            'text/xml')
+        self.assertEqual(
+            sniff_type("<doc ><element xmlns='http://foo/bar'/></doc>"),
+            'text/xml')
+
+def test_suite():
+    return unittest.makeSuite(TypeSniffingTestCase)  # suite for this module
+
+if __name__ == "__main__":  # run the suite when executed as a script
+    unittest.main(defaultTest="test_suite")
diff --git a/typesniffer.py b/typesniffer.py
new file mode 100644
index 0000000..6deeb99
--- /dev/null
+++ b/typesniffer.py
@@ -0,0 +1,64 @@
+##############################################################################
+#
+# Copyright (c) 2005 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Type sniffer for page template input
+
+$Id$
+"""
+
+import xml.parsers.expat
+
+XML_PREFIXES = [
+    "<?xml",                      # ascii, utf-8
+    "\xef\xbb\xbf<?xml",          # utf-8 w/ byte order mark
+    "\0<\0?\0x\0m\0l",            # utf-16 big endian
+    "<\0?\0x\0m\0l\0",            # utf-16 little endian
+    "\xfe\xff\0<\0?\0x\0m\0l",    # utf-16 big endian w/ byte order mark
+    "\xff\xfe<\0?\0x\0m\0l\0",    # utf-16 little endian w/ byte order mark
+    ]
+
+XML_PREFIX_MAX_LENGTH = max(map(len, XML_PREFIXES))  # longest prefix length
+
+class NamespaceFound(Exception):
+    # Raised by the namespace-declaration handler to stop parsing as
+    # soon as a namespace declaration has been found.
+    pass
+
+def StartNamespaceDeclHandler(prefix, url):
+    "Expat callback for a namespace declaration: abort the parse at once."
+    raise NamespaceFound()
+
+def sniff_type(text):
+    """Guess the content type of a template from its source text.
+
+    Returns 'text/xml' when text starts with an <?xml ... ?> declaration
+    (see XML_PREFIXES) or when expat reports a namespace declaration
+    while parsing it; returns None in every other case.
+    """
+    # An XML declaration, in any of the recognised encodings, settles it.
+    matches = [prefix for prefix in XML_PREFIXES if text.startswith(prefix)]
+    if matches:
+        return "text/xml"
+
+    # Otherwise let expat scan for a namespace declaration; the handler
+    # raises NamespaceFound to cut the parse short.
+    sniffer = xml.parsers.expat.ParserCreate(namespace_separator=' ')
+    sniffer.StartNamespaceDeclHandler = StartNamespaceDeclHandler
+    try:
+        sniffer.Parse(text)
+    except NamespaceFound:
+        return "text/xml"
+    except xml.parsers.expat.ExpatError:
+        pass
+    return None
+