diff options
author | Mike Bayer <mike_mp@zzzcomputing.com> | 2014-01-17 17:36:43 -0500 |
---|---|---|
committer | Mike Bayer <mike_mp@zzzcomputing.com> | 2014-01-17 17:36:43 -0500 |
commit | 882f615c68cd2d244a8d2cf480f3532a84bdb6fa (patch) | |
tree | 546c82bc04351bca317f570f1a696ebc3ae5674e | |
parent | 4765895d10ff4bc89f30c99fa709438fa9764b6c (diff) | |
download | sqlalchemy-882f615c68cd2d244a8d2cf480f3532a84bdb6fa.tar.gz |
- Rework the Oracle dialect to no longer do its own unicode conversion, as this has
been observed to be very slow. The Oracle backend now gets "conditional" unicode
conversion instead, because the driver still returns NVARCHAR etc. as unicode
[ticket:2911]
- add new "conditional" functionality to unicode processors; the C-level
function now uses PyUnicode_Check() as a fast alternative to the isinstance()
check in Python
-rw-r--r-- | doc/build/changelog/changelog_09.rst | 37 | ||||
-rw-r--r-- | lib/sqlalchemy/cextension/processors.c | 41 | ||||
-rw-r--r-- | lib/sqlalchemy/dialects/oracle/cx_oracle.py | 3 | ||||
-rw-r--r-- | lib/sqlalchemy/processors.py | 23 | ||||
-rw-r--r-- | lib/sqlalchemy/sql/sqltypes.py | 17 |
5 files changed, 104 insertions, 17 deletions
diff --git a/doc/build/changelog/changelog_09.rst b/doc/build/changelog/changelog_09.rst index cfb4a5b24..369eb6c42 100644 --- a/doc/build/changelog/changelog_09.rst +++ b/doc/build/changelog/changelog_09.rst @@ -15,6 +15,43 @@ :version: 0.9.2 .. change:: + :tags: bug, oracle + :tickets: 2911 + + It's been observed that the usage of a cx_Oracle "outputtypehandler" + in Python 2.xx in order to coerce string values to Unicode is inordinately + expensive; even though cx_Oracle is written in C, when you pass the + Python ``unicode`` primitive to cursor.var() and associate with an output + handler, the library counts every conversion as a Python function call + with all the requisite overhead being recorded; this *despite* the fact + when running in Python 3, all strings are also unconditionally coerced + to unicode but it does *not* incur this overhead, + meaning that cx_Oracle is failing to use performant techniques in Py2K. + As SQLAlchemy cannot easily select for this style of type handler on a + per-column basis, the handler was assembled unconditionally thereby + adding the overhead to all string access. + + So this logic has been replaced with SQLAlchemy's own unicode + conversion system, which now + only takes effect in Py2K for columns that are requested as unicode. + When C extensions are used, SQLAlchemy's system appears to be 2-3x faster than + cx_Oracle's. Additionally, SQLAlchemy's unicode conversion has been + enhanced such that when the "conditional" converter is required + (now needed for the Oracle backend), the check for "already unicode" is now + performed in C and no longer introduces significant overhead. + + This change has two impacts on the cx_Oracle backend. One is that + string values in Py2K which aren't specifically requested with the + Unicode type or convert_unicode=True will now come back as ``str``, + not ``unicode`` - this behavior is similar to a backend such as + MySQL. 
Additionally, when unicode values are requested with the cx_Oracle + backend, if the C extensions are *not* used, there is now an additional + overhead of an isinstance() check per column. This tradeoff has been + made as it can be worked around and no longer places a performance burden + on the likely majority of Oracle result columns that are non-unicode + strings. + + .. change:: :tags: bug, orm :tickets: 2908 diff --git a/lib/sqlalchemy/cextension/processors.c b/lib/sqlalchemy/cextension/processors.c index c1e68fe0f..d56817763 100644 --- a/lib/sqlalchemy/cextension/processors.c +++ b/lib/sqlalchemy/cextension/processors.c @@ -409,6 +409,45 @@ UnicodeResultProcessor_process(UnicodeResultProcessor *self, PyObject *value) return PyUnicode_Decode(str, len, encoding, errors); } +static PyObject * +UnicodeResultProcessor_conditional_process(UnicodeResultProcessor *self, PyObject *value) +{ + const char *encoding, *errors; + char *str; + Py_ssize_t len; + + if (value == Py_None) + Py_RETURN_NONE; + +#if PY_MAJOR_VERSION >= 3 + if (PyUnicode_Check(value) == 1) { + Py_INCREF(value); + return value; + } + + if (PyBytes_AsStringAndSize(value, &str, &len)) + return NULL; + + encoding = PyBytes_AS_STRING(self->encoding); + errors = PyBytes_AS_STRING(self->errors); +#else + + if (PyUnicode_Check(value) == 1) { + Py_INCREF(value); + return value; + } + + if (PyString_AsStringAndSize(value, &str, &len)) + return NULL; + + + encoding = PyString_AS_STRING(self->encoding); + errors = PyString_AS_STRING(self->errors); +#endif + + return PyUnicode_Decode(str, len, encoding, errors); +} + static void UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self) { @@ -424,6 +463,8 @@ UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self) static PyMethodDef UnicodeResultProcessor_methods[] = { {"process", (PyCFunction)UnicodeResultProcessor_process, METH_O, "The value processor itself."}, + {"conditional_process", (PyCFunction)UnicodeResultProcessor_conditional_process, METH_O, + 
"Conditional version of the value processor."}, {NULL} /* Sentinel */ }; diff --git a/lib/sqlalchemy/dialects/oracle/cx_oracle.py b/lib/sqlalchemy/dialects/oracle/cx_oracle.py index c427e4bca..599eb21a3 100644 --- a/lib/sqlalchemy/dialects/oracle/cx_oracle.py +++ b/lib/sqlalchemy/dialects/oracle/cx_oracle.py @@ -748,9 +748,6 @@ class OracleDialect_cx_oracle(OracleDialect): 255, outconverter=self._detect_decimal, arraysize=cursor.arraysize) - # allow all strings to come back natively as Unicode - elif defaultType in (cx_Oracle.STRING, cx_Oracle.FIXED_CHAR): - return cursor.var(util.text_type, size, cursor.arraysize) def on_connect(conn): conn.outputtypehandler = output_type_handler diff --git a/lib/sqlalchemy/processors.py b/lib/sqlalchemy/processors.py index 0abf063b3..d0f52e42b 100644 --- a/lib/sqlalchemy/processors.py +++ b/lib/sqlalchemy/processors.py @@ -15,6 +15,7 @@ They all share one common characteristic: None is passed through unchanged. import codecs import re import datetime +from . import util def str_to_datetime_processor_factory(regexp, type_): @@ -66,6 +67,21 @@ def py_fallback(): return decoder(value, errors)[0] return process + def to_conditional_unicode_processor_factory(encoding, errors=None): + decoder = codecs.getdecoder(encoding) + + def process(value): + if value is None: + return None + elif isinstance(value, util.text_type): + return value + else: + # decoder returns a tuple: (value, len). Simply dropping the + # len part is safe: it is done that way in the normal + # 'xx'.decode(encoding) code path. 
+ return decoder(value, errors)[0] + return process + def to_decimal_processor_factory(target_class, scale): fstring = "%%.%df" % scale @@ -113,12 +129,17 @@ try: str_to_date def to_unicode_processor_factory(encoding, errors=None): - # this is cumbersome but it would be even more so on the C side if errors is not None: return UnicodeResultProcessor(encoding, errors).process else: return UnicodeResultProcessor(encoding).process + def to_conditional_unicode_processor_factory(encoding, errors=None): + if errors is not None: + return UnicodeResultProcessor(encoding, errors).conditional_process + else: + return UnicodeResultProcessor(encoding).conditional_process + def to_decimal_processor_factory(target_class, scale): # Note that the scale argument is not taken into account for integer # values in the C implementation while it is in the Python one. diff --git a/lib/sqlalchemy/sql/sqltypes.py b/lib/sqlalchemy/sql/sqltypes.py index 702e77360..0cc90f26b 100644 --- a/lib/sqlalchemy/sql/sqltypes.py +++ b/lib/sqlalchemy/sql/sqltypes.py @@ -204,20 +204,11 @@ class String(Concatenable, TypeEngine): dialect.encoding, self.unicode_error) if needs_isinstance: - # we wouldn't be here unless convert_unicode='force' - # was specified, or the driver has erratic unicode-returning - # habits. since we will be getting back unicode - # in most cases, we check for it (decode will fail). - def process(value): - if isinstance(value, util.text_type): - return value - else: - return to_unicode(value) - return process + return processors.to_conditional_unicode_processor_factory( + dialect.encoding, self.unicode_error) else: - # here, we assume that the object is not unicode, - # avoiding expensive isinstance() check. - return to_unicode + return processors.to_unicode_processor_factory( + dialect.encoding, self.unicode_error) else: return None |