- rework Oracle to no longer do its own unicode conversion; this has been observed to be very slow. This now has the effect of producing "conditional" unicode conversion for the Oracle backend, as it still returns NVARCHAR etc. as unicode [ticket:2911]
- add new "conditional" functionality to unicode processors; the C-level function now uses PyUnicode_Check() as a fast alternative to the isinstance() check in Python
author: Mike Bayer <mike_mp@zzzcomputing.com> 2014-01-17 17:36:43 -0500
committer: Mike Bayer <mike_mp@zzzcomputing.com> 2014-01-17 17:36:43 -0500
commit: 882f615c68cd2d244a8d2cf480f3532a84bdb6fa (patch)
tree: 546c82bc04351bca317f570f1a696ebc3ae5674e
parent: 4765895d10ff4bc89f30c99fa709438fa9764b6c (diff)
download: sqlalchemy-882f615c68cd2d244a8d2cf480f3532a84bdb6fa.tar.gz
5 files changed, 104 insertions, 17 deletions
diff --git a/doc/build/changelog/changelog_09.rst b/doc/build/changelog/changelog_09.rst
index cfb4a5b24..369eb6c42 100644
--- a/doc/build/changelog/changelog_09.rst
+++ b/doc/build/changelog/changelog_09.rst
@@ -15,6 +15,43 @@
     :version: 0.9.2
 
     .. change::
+        :tags: bug, oracle
+        :tickets: 2911
+
+        It's been observed that the usage of a cx_Oracle "outputtypehandler"
+        in Python 2.xx in order to coerce string values to Unicode is inordinately
+        expensive; even though cx_Oracle is written in C, when you pass the
+        Python ``unicode`` primitive to cursor.var() and associate with an output
+        handler, the library counts every conversion as a Python function call
+        with all the requisite overhead being recorded; this is *despite* the fact
+        that when running in Python 3, all strings are also unconditionally coerced
+        to unicode but it does *not* incur this overhead,
+        meaning that cx_Oracle is failing to use performant techniques in Py2K.
+        As SQLAlchemy cannot easily select for this style of type handler on a
+        per-column basis, the handler was assembled unconditionally thereby
+        adding the overhead to all string access.
+
+        So this logic has been replaced with SQLAlchemy's own unicode
+        conversion system, which now
+        only takes effect in Py2K for columns that are requested as unicode.
+        When C extensions are used, SQLAlchemy's system appears to be 2-3x faster than
+        cx_Oracle's.  Additionally, SQLAlchemy's unicode conversion has been
+        enhanced such that when the "conditional" converter is required
+        (now needed for the Oracle backend), the check for "already unicode" is now
+        performed in C and no longer introduces significant overhead.
+
+        This change has two impacts on the cx_Oracle backend.  One is that
+        string values in Py2K which aren't specifically requested with the
+        Unicode type or convert_unicode=True will now come back as ``str``,
+        not ``unicode`` - this behavior is similar to a backend such as
+        MySQL.  Additionally, when unicode values are requested with the cx_Oracle
+        backend, if the C extensions are *not* used, there is now an additional
+        overhead of an isinstance() check per column.  This tradeoff has been
+        made as it can be worked around and no longer places a performance burden
+        on the likely majority of Oracle result columns that are non-unicode
+        strings.
+
+    .. change::
         :tags: bug, orm
         :tickets: 2908
 
diff --git a/lib/sqlalchemy/cextension/processors.c b/lib/sqlalchemy/cextension/processors.c
index c1e68fe0f..d56817763 100644
--- a/lib/sqlalchemy/cextension/processors.c
+++ b/lib/sqlalchemy/cextension/processors.c
@@ -409,6 +409,45 @@ UnicodeResultProcessor_process(UnicodeResultProcessor *self, PyObject *value)
     return PyUnicode_Decode(str, len, encoding, errors);
 }
 
+static PyObject *
+UnicodeResultProcessor_conditional_process(UnicodeResultProcessor *self, PyObject *value)
+{
+    const char *encoding, *errors;
+    char *str;
+    Py_ssize_t len;
+
+    if (value == Py_None)
+        Py_RETURN_NONE;
+
+#if PY_MAJOR_VERSION >= 3
+    if (PyUnicode_Check(value) == 1) {
+        Py_INCREF(value);
+        return value;
+    }
+
+    if (PyBytes_AsStringAndSize(value, &str, &len))
+        return NULL;
+
+    encoding = PyBytes_AS_STRING(self->encoding);
+    errors = PyBytes_AS_STRING(self->errors);
+#else
+
+    if (PyUnicode_Check(value) == 1) {
+        Py_INCREF(value);
+        return value;
+    }
+
+    if (PyString_AsStringAndSize(value, &str, &len))
+        return NULL;
+
+
+    encoding = PyString_AS_STRING(self->encoding);
+    errors = PyString_AS_STRING(self->errors);
+#endif
+
+    return PyUnicode_Decode(str, len, encoding, errors);
+}
+
 static void
 UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
 {
@@ -424,6 +463,8 @@ UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
 static PyMethodDef UnicodeResultProcessor_methods[] = {
     {"process", (PyCFunction)UnicodeResultProcessor_process, METH_O,
      "The value processor itself."},
+    {"conditional_process", (PyCFunction)UnicodeResultProcessor_conditional_process, METH_O,
+     "Conditional version of the value processor."},
     {NULL}  /* Sentinel */
 };
 
diff --git a/lib/sqlalchemy/dialects/oracle/cx_oracle.py b/lib/sqlalchemy/dialects/oracle/cx_oracle.py
index c427e4bca..599eb21a3 100644
--- a/lib/sqlalchemy/dialects/oracle/cx_oracle.py
+++ b/lib/sqlalchemy/dialects/oracle/cx_oracle.py
@@ -748,9 +748,6 @@ class OracleDialect_cx_oracle(OracleDialect):
                             255,
                             outconverter=self._detect_decimal,
                             arraysize=cursor.arraysize)
-            # allow all strings to come back natively as Unicode
-            elif defaultType in (cx_Oracle.STRING, cx_Oracle.FIXED_CHAR):
-                return cursor.var(util.text_type, size, cursor.arraysize)
 
         def on_connect(conn):
             conn.outputtypehandler = output_type_handler
diff --git a/lib/sqlalchemy/processors.py b/lib/sqlalchemy/processors.py
index 0abf063b3..d0f52e42b 100644
--- a/lib/sqlalchemy/processors.py
+++ b/lib/sqlalchemy/processors.py
@@ -15,6 +15,7 @@ They all share one common characteristic: None is passed through unchanged.
 import codecs
 import re
 import datetime
+from . import util
 
 
 def str_to_datetime_processor_factory(regexp, type_):
@@ -66,6 +67,21 @@ def py_fallback():
                 return decoder(value, errors)[0]
         return process
 
+    def to_conditional_unicode_processor_factory(encoding, errors=None):
+        decoder = codecs.getdecoder(encoding)
+
+        def process(value):
+            if value is None:
+                return None
+            elif isinstance(value, util.text_type):
+                return value
+            else:
+                # decoder returns a tuple: (value, len). Simply dropping the
+                # len part is safe: it is done that way in the normal
+                # 'xx'.decode(encoding) code path.
+                return decoder(value, errors)[0]
+        return process
+
     def to_decimal_processor_factory(target_class, scale):
         fstring = "%%.%df" % scale
 
@@ -113,12 +129,17 @@ try:
                                        str_to_date
 
     def to_unicode_processor_factory(encoding, errors=None):
-        # this is cumbersome but it would be even more so on the C side
         if errors is not None:
             return UnicodeResultProcessor(encoding, errors).process
         else:
             return UnicodeResultProcessor(encoding).process
 
+    def to_conditional_unicode_processor_factory(encoding, errors=None):
+        if errors is not None:
+            return UnicodeResultProcessor(encoding, errors).conditional_process
+        else:
+            return UnicodeResultProcessor(encoding).conditional_process
+
     def to_decimal_processor_factory(target_class, scale):
         # Note that the scale argument is not taken into account for integer
         # values in the C implementation while it is in the Python one.
diff --git a/lib/sqlalchemy/sql/sqltypes.py b/lib/sqlalchemy/sql/sqltypes.py
index 702e77360..0cc90f26b 100644
--- a/lib/sqlalchemy/sql/sqltypes.py
+++ b/lib/sqlalchemy/sql/sqltypes.py
@@ -204,20 +204,11 @@ class String(Concatenable, TypeEngine):
                                     dialect.encoding, self.unicode_error)
 
             if needs_isinstance:
-                # we wouldn't be here unless convert_unicode='force'
-                # was specified, or the driver has erratic unicode-returning
-                # habits.  since we will be getting back unicode
-                # in most cases, we check for it (decode will fail).
-                def process(value):
-                    if isinstance(value, util.text_type):
-                        return value
-                    else:
-                        return to_unicode(value)
-                return process
+                return processors.to_conditional_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
             else:
-                # here, we assume that the object is not unicode,
-                # avoiding expensive isinstance() check.
-                return to_unicode
+                return processors.to_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
         else:
             return None
author	Mike Bayer <mike_mp@zzzcomputing.com>	2014-01-17 17:36:43 -0500
committer	Mike Bayer <mike_mp@zzzcomputing.com>	2014-01-17 17:36:43 -0500
commit	882f615c68cd2d244a8d2cf480f3532a84bdb6fa (patch)
tree	546c82bc04351bca317f570f1a696ebc3ae5674e
parent	4765895d10ff4bc89f30c99fa709438fa9764b6c (diff)
download	sqlalchemy-882f615c68cd2d244a8d2cf480f3532a84bdb6fa.tar.gz