From f6066573d25508f5cbbc5c12254086d419bb8828 Mon Sep 17 00:00:00 2001
From: Simon McVittie <simon.mcvittie@collabora.co.uk>
Date: Mon, 25 Jun 2012 17:01:51 +0100
Subject: fd.o #40817: validate UTF-8 according to the same rules as libdbus

---
 NEWS                            |   8 ++++
 _dbus_bindings/message-append.c | 102 ++++++++++++++++++++++++++++------------
 configure.ac                    |   4 ++
 test/test-standalone.py         |  59 ++++++++++++++++++++++-
 4 files changed, 143 insertions(+), 30 deletions(-)

diff --git a/NEWS b/NEWS
index 178809f..2d579af 100644
--- a/NEWS
+++ b/NEWS
@@ -1,8 +1,16 @@
 D-Bus Python Bindings 1.1.1 (UNRELEASED)
 ========================================
 
+Dependencies:
+
+• libdbus 1.6 or later is now recommended. It is not strictly required yet.
+
 Fixes:
 
+• Validate UTF-8 according to the rules libdbus uses, falling back to our
+  own (inefficient) implementation if not compiled against dbus >= 1.6
+  (fd.o #40817)
+
 • Under Python 3, in the absence of introspection or signature='...',
   pass dbus.ObjectPath or dbus.Signature arguments with the obvious
   signature 'o' or 'g', not 's'. This previously only worked in Python 2.
diff --git a/_dbus_bindings/message-append.c b/_dbus_bindings/message-append.c
index df3190d..e519ae2 100644
--- a/_dbus_bindings/message-append.c
+++ b/_dbus_bindings/message-append.c
@@ -531,6 +531,7 @@ _message_iter_append_string(DBusMessageIter *appender,
                             dbus_bool_t allow_object_path_attr)
 {
     char *s;
+    PyObject *utf8;
 
     if (sig_type == DBUS_TYPE_OBJECT_PATH && allow_object_path_attr) {
         PyObject *object_path = get_object_path (obj);
@@ -550,44 +551,87 @@ _message_iter_append_string(DBusMessageIter *appender,
     }
 
     if (PyBytes_Check(obj)) {
-        PyObject *unicode;
-
-        /* Raise TypeError if the string has embedded NULs */
-        if (PyBytes_AsStringAndSize(obj, &s, NULL) < 0) return -1;
-        /* Surely there's a faster stdlib way to validate UTF-8... */
-        unicode = PyUnicode_DecodeUTF8(s, PyBytes_GET_SIZE(obj), NULL);
-        if (!unicode) {
-            PyErr_SetString(PyExc_UnicodeError, "String parameters "
-                            "to be sent over D-Bus must be valid UTF-8");
-            return -1;
-        }
-        Py_CLEAR(unicode);
-
-        DBG("Performing actual append: string %s", s);
-        if (!dbus_message_iter_append_basic(appender, sig_type,
-                                            &s)) {
-            PyErr_NoMemory();
-            return -1;
-        }
+        utf8 = obj;
+        Py_INCREF(obj);
     }
     else if (PyUnicode_Check(obj)) {
-        PyObject *utf8 = PyUnicode_AsUTF8String(obj);
+        utf8 = PyUnicode_AsUTF8String(obj);
         if (!utf8) return -1;
-        /* Raise TypeError if the string has embedded NULs */
-        if (PyBytes_AsStringAndSize(utf8, &s, NULL) < 0) return -1;
-        DBG("Performing actual append: string (from unicode) %s", s);
-        if (!dbus_message_iter_append_basic(appender, sig_type, &s)) {
-            Py_CLEAR(utf8);
-            PyErr_NoMemory();
-            return -1;
-        }
-        Py_CLEAR(utf8);
     }
     else {
         PyErr_SetString(PyExc_TypeError,
                         "Expected a string or unicode object");
         return -1;
     }
+
+    /* Raise TypeError if the string has embedded NULs */
+    if (PyBytes_AsStringAndSize(utf8, &s, NULL) < 0) {
+        Py_CLEAR(utf8);
+        return -1;
+    }
+
+    /* Validate UTF-8, strictly */
+#ifdef HAVE_DBUS_VALIDATE_UTF8
+    if (!dbus_validate_utf8(s, NULL)) {
+        PyErr_SetString(PyExc_UnicodeError, "String parameters "
+                        "to be sent over D-Bus must be valid UTF-8 "
+                        "with no noncharacter code points");
+        Py_CLEAR(utf8);
+        return -1;
+    }
+#else
+    {
+        PyObject *back_to_unicode;
+        PyObject *utf32;
+        Py_ssize_t i;
+
+        /* Syntactic UTF-8 check only; noncharacters and surrogates are
+         * rejected by the loop below. We own utf8: release it on failure. */
+        back_to_unicode = PyUnicode_DecodeUTF8(s, PyBytes_GET_SIZE(utf8),
+                                               "strict");
+        if (!back_to_unicode) {
+            Py_CLEAR(utf8);
+            return -1;
+        }
+
+        utf32 = PyUnicode_AsUTF32String(back_to_unicode);
+        Py_CLEAR(back_to_unicode);
+        if (!utf32) {
+            Py_CLEAR(utf8);
+            return -1;
+        }
+
+        for (i = 0; i < PyBytes_GET_SIZE(utf32) / 4; i++) {
+            dbus_uint32_t c = ((dbus_uint32_t *) PyBytes_AS_STRING(utf32))[i];
+
+            if (/* noncharacters U+nFFFE, U+nFFFF (any plane n) */
+                (c & 0xFFFE) == 0xFFFE ||
+                /* noncharacters U+FDD0..U+FDEF */
+                (c >= 0xFDD0 && c <= 0xFDEF) ||
+                /* surrogates U+D800..U+DBFF (high), U+DC00..U+DFFF (low) */
+                (c >= 0xD800 && c <= 0xDFFF) ||
+                (c >= 0x110000)) {
+                Py_CLEAR(utf32);
+                Py_CLEAR(utf8);
+                PyErr_SetString(PyExc_UnicodeError, "String parameters "
+                                "to be sent over D-Bus must be valid UTF-8 "
+                                "with no noncharacter code points");
+                return -1;
+            }
+        }
+
+        Py_CLEAR(utf32);
+    }
+#endif
+
+    DBG("Performing actual append: string (from unicode) %s", s);
+    if (!dbus_message_iter_append_basic(appender, sig_type, &s)) {
+        Py_CLEAR(utf8);
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    Py_CLEAR(utf8);
     return 0;
 }
 
diff --git a/configure.ac b/configure.ac
index ea99996..ad644d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -158,9 +158,13 @@ PKG_CHECK_MODULES(DBUS, [dbus-1 >= 1.4])
 PKG_CHECK_MODULES(DBUS_GLIB, [dbus-glib-1 >= 0.70])
 
 dbuspy_save_CFLAGS="$CFLAGS"
+dbuspy_save_LIBS="$LIBS"
 CFLAGS="$CFLAGS $DBUS_CFLAGS"
+LIBS="$LIBS $DBUS_LIBS"
 AC_CHECK_TYPES([DBusBasicValue], [], [], [#include <dbus/dbus.h>])
+AC_CHECK_FUNCS([dbus_validate_utf8])
 CFLAGS="$dbuspy_save_CFLAGS"
+LIBS="$dbuspy_save_LIBS"
 
 TP_COMPILER_WARNINGS([CFLAGS_WARNINGS], [test] dbus_python_released [= 0],
   [all \
diff --git a/test/test-standalone.py b/test/test-standalone.py
index 6f403ee..584ba4f 100755
--- a/test/test-standalone.py
+++ b/test/test-standalone.py
@@ -423,6 +423,64 @@ class TestMessageMarshalling(unittest.TestCase):
             raise AssertionError('Appending too many things in a struct '
                                  'should fail')
 
+    def test_utf8(self):
+        # Strings containing surrogates or noncharacters must be refused,
+        # whether given as text or as already-encoded UTF-8 bytes.
+        from _dbus_bindings import SignalMessage
+
+        if is_py3:
+            uni = chr
+
+            def utf8(*xs):
+                return bytes(xs)
+        else:
+            uni = unichr
+
+            def utf8(*xs):
+                return str('').join(map(chr, xs))
+
+        # Pairs of (code point, bytes of its UTF-8 encoding).
+        rejected = [
+            # a surrogate
+            (0xD800, (0xed, 0xa0, 0x80)),
+            # noncharacters U+FDD0..U+FDEF
+            (0xFDD0, (0xef, 0xb7, 0x90)),
+            (0xFDD7, (0xef, 0xb7, 0x97)),
+            (0xFDEF, (0xef, 0xb7, 0xaf)),
+            # noncharacters U+nFFFE and U+nFFFF
+            (0xFFFE, (0xef, 0xbf, 0xbe)),
+            (0xFFFF, (0xef, 0xbf, 0xbf)),
+            (0x0001FFFE, (0xf0, 0x9f, 0xbf, 0xbe)),
+            (0x0001FFFF, (0xf0, 0x9f, 0xbf, 0xbf)),
+            (0x0007FFFE, (0xf1, 0xbf, 0xbf, 0xbe)),
+            (0x0007FFFF, (0xf1, 0xbf, 0xbf, 0xbf)),
+            (0x0010FFFE, (0xf4, 0x8f, 0xbf, 0xbe)),
+            (0x0010FFFF, (0xf4, 0x8f, 0xbf, 0xbf)),
+        ]
+        bad_values = []
+        for code_point, encoded in rejected:
+            bad_values.append(uni(code_point))
+            bad_values.append(utf8(*encoded))
+
+        # Appending any of them must raise UnicodeError.
+        for bad in bad_values:
+            message = SignalMessage('/', 'foo.bar', 'baz')
+            try:
+                message.append(bad, signature='s')
+            except UnicodeError:
+                continue
+            raise AssertionError('Appending %r should fail' % bad)
+
+        # Code points outside the rejected ranges are accepted in both forms.
+        accepted = [
+            0xfdcf, 0xfdf0, 0xfeff,
+            0x0001feff, 0x00020000, 0x0007feff,
+            0x00080000, 0x0010feff,
+        ]
+        for good in [uni(code_point) for code_point in accepted]:
+            message = SignalMessage('/', 'foo.bar', 'baz')
+            message.append(good, signature='s')
+            message.append(good.encode('utf-8'), signature='s')
 
 class TestMatching(unittest.TestCase):
     def setUp(self):
@@ -442,7 +500,6 @@ class TestMatching(unittest.TestCase):
         self._message.append('/', signature='o')
         self.assertFalse(self._match.maybe_handle_message(self._message))
 
-
 if __name__ == '__main__':
     # Python 2.6 doesn't accept a `verbosity` keyword.
     kwargs = {}
-- 
cgit v1.2.1