From 71aac1f98644a517036e71263a3133b0d7faad08 Mon Sep 17 00:00:00 2001
From: Julien Anguenot <julien@anguenot.org>
Date: Fri, 14 Oct 2005 16:22:40 +0000
Subject: Enhance the page template input type sniffer. You no longer need to
 add the XML header: the sniffer now checks xmlns declarations as well. This
 is mainly to avoid the IE quirks mode triggered by the XML header's presence.

---
 pagetemplatefile.py  | 119 +++++++++++++++++++++++++++++++
 tests/test_ptfile.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++
 typesniffer.py       |  64 +++++++++++++++++
 3 files changed, 381 insertions(+)
 create mode 100644 pagetemplatefile.py
 create mode 100644 tests/test_ptfile.py
 create mode 100644 typesniffer.py
diff --git a/pagetemplatefile.py b/pagetemplatefile.py
new file mode 100644
index 0000000..3517f5f
--- /dev/null
+++ b/pagetemplatefile.py
@@ -0,0 +1,119 @@
+##############################################################################
+#
+# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Filesystem Page Template module
+
+Zope object encapsulating a Page Template from the filesystem.
+
+$Id$
+"""
+
+__all__ = ("PageTemplateFile",)
+
+import os
+import sys
+import re
+import logging
+
+from zope.pagetemplate.pagetemplate import PageTemplate
+
+from typesniffer import sniff_type
+from typesniffer import XML_PREFIX_MAX_LENGTH
+
+DEFAULT_ENCODING = "utf-8"  # used when no Content-Type <meta> tag matches
+
+meta_pattern = re.compile(  # group 1: content type, group 2: charset
+    r'\s*<meta\s+http-equiv=["\']?Content-Type["\']?'
+    r'\s+content=["\']?([^;]+);\s*charset=([^"\']+)["\']?\s*>\s*',
+    re.IGNORECASE)
+
+def package_home(gdict):
+    "Return the directory of the file named by gdict['__file__']."
+    return os.path.dirname(gdict["__file__"])
+
+class PageTemplateFile(PageTemplate):
+    "Zope wrapper for filesystem Page Template using TAL, TALES, and METAL"
+
+    _v_last_read = 0  # mtime recorded by the last error-free cook; 0 = none
+
+    def __init__(self, filename, _prefix=None):
+        path = self.get_path_from_prefix(_prefix)  # directory to resolve in
+        self.filename = os.path.join(path, filename)
+        if not os.path.isfile(self.filename):
+            raise ValueError("No such file", self.filename)
+
+    def get_path_from_prefix(self, _prefix):
+        "Map `_prefix` (path string, globals dict or None) to a directory."
+        if isinstance(_prefix, str):
+            return _prefix
+        if _prefix is None:
+            # Frame 0 is this method and frame 1 its caller; use frame 2.
+            _prefix = sys._getframe(2).f_globals
+        return package_home(_prefix)
+
+    def _prepare_html(self, text):
+        "Drop the Content-Type <meta> tag; return (unicode text, type)."
+        match = meta_pattern.search(text)
+        if match is None:
+            content_type, encoding = None, DEFAULT_ENCODING
+        else:
+            content_type, encoding = match.groups()
+            # TODO: Shouldn't <meta>/<?xml?> stripping
+            # be in PageTemplate.__call__()?
+            text = meta_pattern.sub("", text)
+        return unicode(text, encoding), content_type
+
+    def _read_file(self):
+        "Return (text, type_): raw bytes for XML, decoded unicode otherwise."
+        __traceback_info__ = self.filename
+        f = open(self.filename, "rb")
+        try:
+            # Only these leading bytes are sniffed to pick the read mode.
+            text = f.read(XML_PREFIX_MAX_LENGTH)
+            type_ = sniff_type(text)
+            if type_ == "text/xml":
+                text += f.read()
+            else:
+                # For HTML, we really want the file read in text mode:
+                f.close()
+                f = open(self.filename)
+                text, type_ = self._prepare_html(f.read())
+        finally:
+            # Reached on every path; closing a closed file is a no-op.
+            f.close()
+        return text, type_
+
+    def _cook_check(self):
+        "Reload and recook the template when the file's mtime has changed."
+        if self._v_last_read and not __debug__:  # -O: never recheck once read
+            return
+        __traceback_info__ = self.filename
+        try:
+            mtime = os.path.getmtime(self.filename)
+        except OSError:
+            mtime = 0
+        if self._v_program is not None and mtime == self._v_last_read:
+            return
+        self.pt_edit(*self._read_file())
+        self._cook()
+        if self._v_errors:
+            logging.error('PageTemplateFile: Error in template: %s',
+                '\n'.join(self._v_errors))
+            return
+        self._v_last_read = mtime
+
+    def pt_source_file(self):
+        return self.filename
+
+    def __getstate__(self):
+        raise TypeError("non-picklable object")
diff --git a/tests/test_ptfile.py b/tests/test_ptfile.py
new file mode 100644
index 0000000..6d8d5fc
--- /dev/null
+++ b/tests/test_ptfile.py
@@ -0,0 +1,198 @@
+##############################################################################
+#
+# Copyright (c) 2004 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Tests of PageTemplateFile.
+
+$Id$
+"""
+import os
+import tempfile
+import unittest
+
+from zope.pagetemplate.pagetemplatefile import PageTemplateFile
+from zope.pagetemplate.typesniffer import sniff_type
+
+class TypeSniffingTestCase(unittest.TestCase):
+    "Tests for content-type sniffing and HTML charset decoding."
+    TEMPFILENAME = tempfile.mktemp()  # one path reused by every test
+
+    def tearDown(self):
+        if os.path.exists(self.TEMPFILENAME):
+            os.unlink(self.TEMPFILENAME)
+
+    def get_pt(self, text):  # store text in the temp file and load it
+        f = open(self.TEMPFILENAME, "wb")
+        f.write(text)
+        f.close()
+        pt = PageTemplateFile(self.TEMPFILENAME)
+        pt.read()
+        return pt
+
+    def check_content_type(self, text, expected_type):
+        pt = self.get_pt(text)
+        self.assertEqual(pt.content_type, expected_type)
+
+    def test_sniffer_xml_ascii(self):
+        self.check_content_type(
+            "<?xml version='1.0' encoding='ascii'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "<?xml\tversion='1.0' encoding='ascii'?><doc/>",
+            "text/xml")
+
+    def test_sniffer_xml_utf8(self):
+        # without byte order mark
+        self.check_content_type(
+            "<?xml version='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xef\xbb\xbf<?xml version='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+        self.check_content_type(
+            "\xef\xbb\xbf<?xml\tversion='1.0' encoding='utf-8'?><doc/>",
+            "text/xml")
+
+    def test_sniffer_xml_utf16_be(self):
+        # without byte order mark
+        self.check_content_type(
+            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        self.check_content_type(
+            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xfe\xff"
+            "\0<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+        self.check_content_type(
+            "\xfe\xff"
+            "\0<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'"
+            "\0 \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>"
+            "\0<\0d\0o\0c\0/\0>",
+            "text/xml")
+
+    def test_sniffer_xml_utf16_le(self):
+        # without byte order mark
+        self.check_content_type(
+            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\n",
+            "text/xml")
+        self.check_content_type(
+            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        # with byte order mark
+        self.check_content_type(
+            "\xff\xfe"
+            "<\0?\0x\0m\0l\0 \0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+        self.check_content_type(
+            "\xff\xfe"
+            "<\0?\0x\0m\0l\0\t\0v\0e\0r\0s\0i\0o\0n\0=\0'\01\0.\0000\0'\0"
+            " \0e\0n\0c\0o\0d\0i\0n\0g\0=\0'\0u\0t\0f\0-\08\0'\0?\0>\0"
+            "<\0d\0o\0c\0/\0>\0",
+            "text/xml")
+
+    HTML_PUBLIC_ID = "-//W3C//DTD HTML 4.01 Transitional//EN"
+    HTML_SYSTEM_ID = "http://www.w3.org/TR/html4/loose.dtd"
+
+    def test_sniffer_html_ascii(self):
+        self.check_content_type(
+            "<!DOCTYPE html [ SYSTEM '%s' ]><html></html>"
+            % self.HTML_SYSTEM_ID,
+            "text/html")
+        self.check_content_type(
+            "<html><head><title>sample document</title></head></html>",
+            "text/html")
+
+    # TODO: The sniffer does not handle this case (one of many), so the
+    # method is named "donttest_..." to keep unittest from collecting it.
+    # The sniffer still guesses right more often than it used to.
+    def donttest_sniffer_xml_simple(self):
+        self.check_content_type("<doc><element/></doc>",
+                                "text/xml")
+
+    def test_html_default_encoding(self):
+        pt = self.get_pt(
+            "<html><head><title>"
+            # 'Test' in russian (utf-8)
+            "\xd0\xa2\xd0\xb5\xd1\x81\xd1\x82"
+            "</title></head></html>")
+        rendered = pt()
+        self.failUnless(isinstance(rendered, unicode))
+        self.failUnlessEqual(rendered,
+            u"<html><head><title>"
+            u"\u0422\u0435\u0441\u0442"
+            u"</title></head></html>\n")
+
+    def test_html_encoding_by_meta(self):
+        pt = self.get_pt(
+            "<html><head><title>"
+            # 'Test' in russian (windows-1251)
+            "\xd2\xe5\xf1\xf2"
+            '</title><meta http-equiv="Content-Type"'
+            ' content="text/html; charset=windows-1251">'
+            "</head></html>")
+        rendered = pt()
+        self.failUnless(isinstance(rendered, unicode))
+        self.failUnlessEqual(rendered,
+            u"<html><head><title>"
+            u"\u0422\u0435\u0441\u0442"
+            u"</title></head></html>\n")
+
+    ##def test_xml_sniffing_from_extension(self):
+    ##    # This checks the extension of the page template
+    ##    this_directory = os.path.split(__file__)[0]
+    ##    filepath = os.path.join(
+    ##        this_directory,
+    ##        'test.xpt')
+    ##    xpt = PageTemplateFile(filepath)
+    ##    self.assert_(os.path.normcase(xpt.filename).endswith('.xpt'))
+    ##    text, type_ = xpt._read_file()
+    ##    self.assertEqual(type_, 'text/xml')
+
+    def test_type_sniffing_based_on_xmlns(self):
+        from zope.pagetemplate.typesniffer import sniff_type
+        self.assertEqual(
+            sniff_type("<doc><element/></doc>"), None)
+        self.assertEqual(
+            sniff_type("<doc xmlns=''><element/></doc>"), 'text/xml')
+        self.assertEqual(
+            sniff_type("<doc><element xmlns=''/></doc>"), 'text/xml')
+        self.assertEqual(
+            sniff_type("<doc xmlns='http://foo/bar'><element/></doc>"),
+            'text/xml')
+        self.assertEqual(
+            sniff_type("<doc ><element xmlns='http://foo/bar'/></doc>"),
+            'text/xml')
+
+def test_suite():  # suite of every test* method on TypeSniffingTestCase
+    return unittest.TestLoader().loadTestsFromTestCase(TypeSniffingTestCase)
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="test_suite")  # run what test_suite() returns
diff --git a/typesniffer.py b/typesniffer.py
new file mode 100644
index 0000000..6deeb99
--- /dev/null
+++ b/typesniffer.py
@@ -0,0 +1,64 @@
+##############################################################################
+#
+# Copyright (c) 2005 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Type sniffer for page template input
+
+$Id$
+"""
+
+import xml.parsers.expat
+
+XML_PREFIXES = [  # leading bytes of "<?xml" in each recognised encoding
+    "<?xml",                      # ascii, utf-8
+    "\xef\xbb\xbf<?xml",          # utf-8 w/ byte order mark
+    "\0<\0?\0x\0m\0l",            # utf-16 big endian
+    "<\0?\0x\0m\0l\0",            # utf-16 little endian
+    "\xfe\xff\0<\0?\0x\0m\0l",    # utf-16 big endian w/ byte order mark
+    "\xff\xfe<\0?\0x\0m\0l\0",    # utf-16 little endian w/ byte order mark
+    ]
+
+XML_PREFIX_MAX_LENGTH = max(map(len, XML_PREFIXES))  # longest prefix above
+
+class NamespaceFound(Exception):
+    """Raised by the parser's namespace handler to stop parsing as soon
+    as a namespace declaration is found.
+    """
+
+def StartNamespaceDeclHandler(prefix, url):
+    "Expat callback for a namespace declaration; always raises NamespaceFound."
+    raise NamespaceFound()
+
+def sniff_type(text):
+    """Return 'text/xml' if text looks like XML, otherwise return None.
+
+    Text is taken to be XML when it either
+     o starts with one of the <?xml declaration prefixes in XML_PREFIXES, or
+     o makes expat report a namespace declaration before any parse error.
+    """
+
+    # Cheap check first: does a known XML declaration prefix match?
+    if [known for known in XML_PREFIXES if text.startswith(known)]:
+        return "text/xml"
+
+    # Otherwise scan for namespace declarations; the handler stops the
+    # parse at the first one by raising NamespaceFound.
+    parser = xml.parsers.expat.ParserCreate(namespace_separator=' ')
+    parser.StartNamespaceDeclHandler = StartNamespaceDeclHandler
+    try:
+        parser.Parse(text)
+    except NamespaceFound:
+        return "text/xml"
+    except xml.parsers.expat.ExpatError:
+        pass
+    return None
+
-- 
cgit v1.2.1