- A refinement to the logic which adds columns to the resulting SQL when

:meth:`.Query.distinct` is combined with :meth:`.Query.order_by` such that columns which are already present will not be added a second time, even if they are labeled with a different name. Regardless of this change, the extra columns added to the SQL have never been returned in the final result, so this change only impacts the string form of the statement as well as its behavior when used in a Core execution context. Additionally, columns are no longer added when the DISTINCT ON format is used, provided the query is not wrapped inside a subquery due to joined eager loading. fixes #3641
author: Mike Bayer <mike_mp@zzzcomputing.com> 2016-02-09 17:49:38 -0500
committer: Mike Bayer <mike_mp@zzzcomputing.com> 2016-02-09 17:49:38 -0500
commit: ff3be95620b6505943b2d7e4688abc29dca3e493 (patch)
tree: 1d90206b004c30bc296d709d5d169bf8a1f2a16a
parent: 7d2bed69abb6ab545cfa5ca967141338387417c2 (diff)
download: sqlalchemy-ff3be95620b6505943b2d7e4688abc29dca3e493.tar.gz
5 files changed, 229 insertions, 23 deletions
diff --git a/doc/build/changelog/changelog_11.rst b/doc/build/changelog/changelog_11.rst
index 5c3ad7163..2473a02a2 100644
--- a/doc/build/changelog/changelog_11.rst
+++ b/doc/build/changelog/changelog_11.rst
@@ -22,6 +22,25 @@
     :version: 1.1.0b1
 
     .. change::
+        :tags: bug, orm
+        :tickets: 3641
+
+        A refinement to the logic which adds columns to the resulting SQL when
+        :meth:`.Query.distinct` is combined with :meth:`.Query.order_by` such
+        that columns which are already present will not be added
+        a second time, even if they are labeled with a different name.
+        Regardless of this change, the extra columns added to the SQL have
+        never been returned in the final result, so this change only impacts
+        the string form of the statement as well as its behavior when used in
+        a Core execution context.   Additionally, columns are no longer added
+        when the DISTINCT ON format is used, provided the query is not
+        wrapped inside a subquery due to joined eager loading.
+
+        .. seealso::
+
+            :ref:`change_3641`
+
+    .. change::
         :tags: feature, sql
         :tickets: 3292, 3095
 
diff --git a/doc/build/changelog/migration_11.rst b/doc/build/changelog/migration_11.rst
index 07223be34..3d65ede80 100644
--- a/doc/build/changelog/migration_11.rst
+++ b/doc/build/changelog/migration_11.rst
@@ -16,7 +16,7 @@ What's New in SQLAlchemy 1.1?
     some issues may be moved to later milestones in order to allow
     for a timely release.
 
-    Document last updated: January 29, 2016
+    Document last updated: Feburary 9, 2016
 
 Introduction
 ============
@@ -487,6 +487,45 @@ associated with any bound :class:`.Engine`, then the fallback to the
 
 :ticket:`3081`
 
+.. _change_3641:
+
+Columns no longer added redundantly with DISTINCT + ORDER BY
+------------------------------------------------------------
+
+A query such as the following will now augment only those columns
+that are missing from the SELECT list, without duplicates::
+
+    q = session.query(User.id, User.name.label('name')).\
+        distinct().\
+        order_by(User.id, User.name, User.fullname)
+
+Produces::
+
+    SELECT DISTINCT user.id AS a_id, user.name AS name,
+     user.fullname AS a_fullname
+    FROM a ORDER BY user.id, user.name, user.fullname
+
+Previously, it would produce::
+
+    SELECT DISTINCT user.id AS a_id, user.name AS name, user.name AS a_name,
+      user.fullname AS a_fullname
+    FROM a ORDER BY user.id, user.name, user.fullname
+
+Where above, the ``user.name`` column is added unnecessarily.  The results
+would not be affected, as the additional columns are not included in the
+result in any case, but the columns are unnecessary.
+
+Additionally, when the Postgresql DISTINCT ON format is used by passing
+expressions to :meth:`.Query.distinct`, the above "column adding" logic
+is disabled entirely.
+
+When the query is being bundled into a subquery for the purposes of
+joined eager loading, the "augment column list" rules are are necessarily
+more aggressive so that the ORDER BY can still be satisifed, so this case
+remains unchanged.
+
+:ticket:`3641`
+
 New Features and Improvements - Core
 ====================================
 
diff --git a/lib/sqlalchemy/orm/query.py b/lib/sqlalchemy/orm/query.py
index 08600c357..8a25f570a 100644
--- a/lib/sqlalchemy/orm/query.py
+++ b/lib/sqlalchemy/orm/query.py
@@ -3254,12 +3254,11 @@ class Query(object):
         # then append eager joins onto that
 
         if context.order_by:
-            order_by_col_expr = list(
-                chain(*[
-                    sql_util.unwrap_order_by(o)
-                    for o in context.order_by
-                ])
-            )
+            order_by_col_expr = \
+                sql_util.expand_column_list_from_order_by(
+                    context.primary_columns,
+                    context.order_by
+                )
         else:
             context.order_by = None
             order_by_col_expr = []
@@ -3319,15 +3318,12 @@ class Query(object):
         if not context.order_by:
             context.order_by = None
 
-        if self._distinct and context.order_by:
-            order_by_col_expr = list(
-                chain(*[
-                    sql_util.unwrap_order_by(o)
-                    for o in context.order_by
-                ])
-            )
-            context.primary_columns += order_by_col_expr
-
+        if self._distinct is True and context.order_by:
+            context.primary_columns += \
+                sql_util.expand_column_list_from_order_by(
+                    context.primary_columns,
+                    context.order_by
+                )
         context.froms += tuple(context.eager_joins.values())
 
         statement = sql.select(
diff --git a/lib/sqlalchemy/sql/util.py b/lib/sqlalchemy/sql/util.py
index 1dcf0ee66..7e294d85f 100644
--- a/lib/sqlalchemy/sql/util.py
+++ b/lib/sqlalchemy/sql/util.py
@@ -176,6 +176,28 @@ def unwrap_order_by(clause):
     return result
 
 
+def expand_column_list_from_order_by(collist, order_by):
+    """Given the columns clause and ORDER BY of a selectable,
+    return a list of column expressions that can be added to the collist
+    corresponding to the ORDER BY, without repeating those already
+    in the collist.
+
+    """
+    cols_already_present = set([
+        col.element if col._order_by_label_element is not None
+        else col for col in collist
+    ])
+
+    return [
+        col for col in
+        chain(*[
+            unwrap_order_by(o)
+            for o in order_by
+        ])
+        if col not in cols_already_present
+    ]
+
+
 def clause_is_present(clause, search):
     """Given a target clause and a second to search within, return True
     if the target is plainly present in the search without any
diff --git a/test/orm/test_query.py b/test/orm/test_query.py
index 6445ffefd..c4c62c319 100644
--- a/test/orm/test_query.py
+++ b/test/orm/test_query.py
@@ -2695,7 +2695,9 @@ class CountTest(QueryTest):
         eq_(q.distinct().count(), 3)
 
 
-class DistinctTest(QueryTest):
+class DistinctTest(QueryTest, AssertsCompiledSQL):
+    __dialect__ = 'default'
+
     def test_basic(self):
         User = self.classes.User
 
@@ -2709,19 +2711,22 @@ class DistinctTest(QueryTest):
             order_by(desc(User.name)).all()
         )
 
-    def test_joined(self):
-        """test that orderbys from a joined table get placed into the columns
-        clause when DISTINCT is used"""
-
+    def test_columns_augmented_roundtrip_one(self):
         User, Address = self.classes.User, self.classes.Address
 
         sess = create_session()
         q = sess.query(User).join('addresses').distinct(). \
             order_by(desc(Address.email_address))
 
-        assert [User(id=7), User(id=9), User(id=8)] == q.all()
+        eq_(
+            [User(id=7), User(id=9), User(id=8)],
+            q.all()
+        )
+
+    def test_columns_augmented_roundtrip_two(self):
+        User, Address = self.classes.User, self.classes.Address
 
-        sess.expunge_all()
+        sess = create_session()
 
         # test that it works on embedded joinedload/LIMIT subquery
         q = sess.query(User).join('addresses').distinct(). \
@@ -2739,6 +2744,131 @@ class DistinctTest(QueryTest):
             ] == q.all()
         self.assert_sql_count(testing.db, go, 1)
 
+    def test_columns_augmented_roundtrip_three(self):
+        User, Address = self.classes.User, self.classes.Address
+
+        sess = create_session()
+
+        q = sess.query(User.id, User.name.label('foo'), Address.id).\
+            filter(User.name == 'jack').\
+            distinct().\
+            order_by(User.id, User.name, Address.email_address)
+
+        # even though columns are added, they aren't in the result
+        eq_(
+            q.all(),
+            [(7, 'jack', 3), (7, 'jack', 4), (7, 'jack', 2),
+             (7, 'jack', 5), (7, 'jack', 1)]
+        )
+        for row in q:
+            eq_(row.keys(), ['id', 'foo', 'id'])
+
+    def test_columns_augmented_sql_one(self):
+        User, Address = self.classes.User, self.classes.Address
+
+        sess = create_session()
+
+        q = sess.query(User.id, User.name.label('foo'), Address.id).\
+            distinct().\
+            order_by(User.id, User.name, Address.email_address)
+
+        # Address.email_address is added because of DISTINCT,
+        # however User.id, User.name are not b.c. they're already there,
+        # even though User.name is labeled
+        self.assert_compile(
+            q,
+            "SELECT DISTINCT users.id AS users_id, users.name AS foo, "
+            "addresses.id AS addresses_id, "
+            "addresses.email_address AS addresses_email_address FROM users, "
+            "addresses ORDER BY users.id, users.name, addresses.email_address"
+        )
+
+    def test_columns_augmented_sql_two(self):
+        User, Address = self.classes.User, self.classes.Address
+
+        sess = create_session()
+
+        q = sess.query(User).\
+            options(joinedload(User.addresses)).\
+            distinct().\
+            order_by(User.name, Address.email_address).\
+            limit(5)
+
+        # addresses.email_address is added to inner query so that
+        # it is available in ORDER BY
+        self.assert_compile(
+            q,
+            "SELECT anon_1.users_id AS anon_1_users_id, "
+            "anon_1.users_name AS anon_1_users_name, "
+            "anon_1.addresses_email_address AS "
+            "anon_1_addresses_email_address, "
+            "addresses_1.id AS addresses_1_id, "
+            "addresses_1.user_id AS addresses_1_user_id, "
+            "addresses_1.email_address AS addresses_1_email_address "
+            "FROM (SELECT DISTINCT users.id AS users_id, "
+            "users.name AS users_name, "
+            "addresses.email_address AS addresses_email_address "
+            "FROM users, addresses "
+            "ORDER BY users.name, addresses.email_address "
+            "LIMIT :param_1) AS anon_1 LEFT OUTER JOIN "
+            "addresses AS addresses_1 "
+            "ON anon_1.users_id = addresses_1.user_id "
+            "ORDER BY anon_1.users_name, "
+            "anon_1.addresses_email_address, addresses_1.id"
+        )
+
+    def test_columns_augmented_sql_three(self):
+        User, Address = self.classes.User, self.classes.Address
+
+        sess = create_session()
+
+        q = sess.query(User.id, User.name.label('foo'), Address.id).\
+            distinct(User.name).\
+            order_by(User.id, User.name, Address.email_address)
+
+        # no columns are added when DISTINCT ON is used
+        self.assert_compile(
+            q,
+            "SELECT DISTINCT ON (users.name) users.id AS users_id, "
+            "users.name AS foo, addresses.id AS addresses_id FROM users, "
+            "addresses ORDER BY users.id, users.name, addresses.email_address",
+            dialect='postgresql'
+        )
+
+    def test_columns_augmented_sql_four(self):
+        User, Address = self.classes.User, self.classes.Address
+
+        sess = create_session()
+
+        q = sess.query(User).join('addresses').\
+            distinct(Address.email_address). \
+            options(joinedload('addresses')).\
+            order_by(desc(Address.email_address)).limit(2)
+
+        # but for the subquery / eager load case, we still need to make
+        # the inner columns available for the ORDER BY even though its
+        # a DISTINCT ON
+        self.assert_compile(
+            q,
+            "SELECT anon_1.users_id AS anon_1_users_id, "
+            "anon_1.users_name AS anon_1_users_name, "
+            "anon_1.addresses_email_address AS "
+            "anon_1_addresses_email_address, "
+            "addresses_1.id AS addresses_1_id, "
+            "addresses_1.user_id AS addresses_1_user_id, "
+            "addresses_1.email_address AS addresses_1_email_address "
+            "FROM (SELECT DISTINCT ON (addresses.email_address) "
+            "users.id AS users_id, users.name AS users_name, "
+            "addresses.email_address AS addresses_email_address "
+            "FROM users JOIN addresses ON users.id = addresses.user_id "
+            "ORDER BY addresses.email_address DESC  "
+            "LIMIT %(param_1)s) AS anon_1 "
+            "LEFT OUTER JOIN addresses AS addresses_1 "
+            "ON anon_1.users_id = addresses_1.user_id "
+            "ORDER BY anon_1.addresses_email_address DESC, addresses_1.id",
+            dialect='postgresql'
+        )
+
 
 class PrefixWithTest(QueryTest, AssertsCompiledSQL):
author	Mike Bayer <mike_mp@zzzcomputing.com>	2016-02-09 17:49:38 -0500
committer	Mike Bayer <mike_mp@zzzcomputing.com>	2016-02-09 17:49:38 -0500
commit	ff3be95620b6505943b2d7e4688abc29dca3e493 (patch)
tree	1d90206b004c30bc296d709d5d169bf8a1f2a16a
parent	7d2bed69abb6ab545cfa5ca967141338387417c2 (diff)
download	sqlalchemy-ff3be95620b6505943b2d7e4688abc29dca3e493.tar.gz