use tree for count(distinct) when possible

author: sasha@mysql.sashanet.com <> 2001-05-11 15:07:34 -0600
committer: sasha@mysql.sashanet.com <> 2001-05-11 15:07:34 -0600
commit: c706bf40f3723e4224844a52f632529fda3412ca (patch)
tree: c1011c976487f613ded4fa7db34dbff547084d18
parent: a12117f0364761f89d6fdb015ece9ada2d1ce494 (diff)
download: mariadb-git-c706bf40f3723e4224844a52f632529fda3412ca.tar.gz
4 files changed, 117 insertions, 5 deletions
diff --git a/include/my_tree.h b/include/my_tree.h
index c0950409588..d4d28644a39 100644
--- a/include/my_tree.h
+++ b/include/my_tree.h
@@ -57,7 +57,7 @@ typedef struct st_tree {
   void (*free)(void *);
 } TREE;
 
-	/* Functions on hole tree */
+	/* Functions on whole tree */
 void init_tree(TREE *tree,uint default_alloc_size, int element_size,
 	       qsort_cmp2 compare, my_bool with_delete,
 	       void (*free_element)(void*));
diff --git a/mysys/tree.c b/mysys/tree.c
index db2b3989b81..e46ff00adad 100644
--- a/mysys/tree.c
+++ b/mysys/tree.c
@@ -84,7 +84,7 @@ void init_tree(TREE *tree, uint default_alloc_size, int size,
       ((uint) size <= sizeof(void*) || ((uint) size & (sizeof(void*)-1))))
   {
     tree->offset_to_key=sizeof(TREE_ELEMENT); /* Put key after element */
-    /* Fix allocation size so that we don't loose any memory */
+    /* Fix allocation size so that we don't lose any memory */
     default_alloc_size/=(sizeof(TREE_ELEMENT)+size);
     if (!default_alloc_size)
       default_alloc_size=1;
diff --git a/sql/item_sum.cc b/sql/item_sum.cc
index 375ba081f80..da0bc71ca1f 100644
--- a/sql/item_sum.cc
+++ b/sql/item_sum.cc
@@ -788,11 +788,56 @@ String *Item_std_field::val_str(String *str)
 
 #include "sql_select.h"
 
+static int simple_raw_key_cmp(void* arg, byte* key1, byte* key2)
+{
+  return memcmp(key1, key2, (int)arg);
+}
+
+static int simple_str_key_cmp(void* arg, byte* key1, byte* key2)
+{
+  return my_sortcmp(key1, key2, (int)arg);
+}
+
+// did not make this one static - at least gcc gets confused when
+// I try to declare a static function as a friend. If you can figure
+// out the syntax to make a static function a friend, make this one
+// static
+int composite_key_cmp(void* arg, byte* key1, byte* key2)
+{
+  Item_sum_count_distinct* item = (Item_sum_count_distinct*)arg;
+  Field** field = item->table->field, **field_end;
+  field_end = field + item->table->fields;
+  for(; field < field_end; ++field)
+    {
+      int res;
+      int len = (*field)->field_length;
+      switch((*field)->type())
+	{
+	case FIELD_TYPE_STRING:
+	case FIELD_TYPE_VAR_STRING:
+	  res = my_sortcmp(key1, key2, len);
+	  break;
+	default:
+	  res = memcmp(key1, key2, len);
+	  break;
+	}
+      if(res)
+	return res;
+      key1 += len;
+      key2 += len;
+    }
+  return 0;
+}
+
+
+
 Item_sum_count_distinct::~Item_sum_count_distinct()
 {
   if (table)
     free_tmp_table(current_thd, table);
   delete tmp_table_param;
+  if(use_tree)
+    delete_tree(&tree);
 }
 
 
@@ -821,6 +866,53 @@ bool Item_sum_count_distinct::setup(THD *thd)
 			       0, 0, current_lex->options | thd->options)))
     return 1;
   table->file->extra(HA_EXTRA_NO_ROWS);		// Don't update rows
+
+  if(table->db_type == DB_TYPE_HEAP) // no blobs, otherwise it would be
+    // MyISAM
+    {
+      qsort_cmp2 compare_key;
+      void* cmp_arg;
+      int key_len;
+      
+      if(table->fields == 1) // if we have only one field, which is
+	// the most common use of count(distinct), it is much faster
+	// to use a simpler key compare method that can take advantage
+	// of not having to worry about other fields
+	{
+	  switch(table->field[0]->type())
+	    {
+	      // if we have a string, we must take care of charsets
+	      // and case sensitivity
+	    case FIELD_TYPE_STRING:
+	    case FIELD_TYPE_VAR_STRING:
+	      compare_key = (qsort_cmp2)simple_str_key_cmp;
+	      break;
+	    default: // since at this point we cannot have blobs
+	      // anything else can be compared with memcmp
+	      compare_key = (qsort_cmp2)simple_raw_key_cmp;
+	      break;
+	    }
+	  cmp_arg = (void*)(key_len = table->field[0]->field_length);
+	  rec_offset = 1;
+	}
+      else // too bad, cannot cheat - there is more than one field
+	{
+	  cmp_arg = (void*)this;
+	  compare_key = (qsort_cmp2)composite_key_cmp;
+	  Field** field, **field_end;
+	  field_end = (field = table->field) + table->fields;
+	  for(key_len = 0; field < field_end; ++field)
+	    {
+	      key_len += (*field)->field_length;
+	    }
+	  rec_offset = table->reclength - key_len;
+	}
+
+      init_tree(&tree, 0, key_len, compare_key, 0, 0);
+      tree.cmp_arg = cmp_arg;
+      use_tree = 1;
+    }
+  
   return 0;
 }
 
@@ -830,6 +922,8 @@ void Item_sum_count_distinct::reset()
   table->file->extra(HA_EXTRA_NO_CACHE);
   table->file->delete_all_rows();
   table->file->extra(HA_EXTRA_WRITE_CACHE);
+  if(use_tree)
+    delete_tree(&tree);
   (void) add();
 }
 
@@ -843,7 +937,12 @@ bool Item_sum_count_distinct::add()
     if ((*field)->is_real_null(0))
       return 0;					// Don't count NULL
 
-  if ((error=table->file->write_row(table->record[0])))
+  if(use_tree)
+    {
+      if(!tree_insert(&tree, table->record[0] + rec_offset, 0))
+	return 1;
+    }
+  else if ((error=table->file->write_row(table->record[0])))
   {
     if (error != HA_ERR_FOUND_DUPP_KEY &&
 	error != HA_ERR_FOUND_DUPP_UNIQUE)
@@ -859,6 +958,8 @@ longlong Item_sum_count_distinct::val_int()
 {
   if (!table)					// Empty query
     return LL(0);
+  if(use_tree)
+    return tree.elements_in_tree;
   table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
   return table->file->records;
 }
diff --git a/sql/item_sum.h b/sql/item_sum.h
index 8cb09c85623..863623c3b1a 100644
--- a/sql/item_sum.h
+++ b/sql/item_sum.h
@@ -21,6 +21,8 @@
 #pragma interface			/* gcc class implementation */
 #endif
 
+#include <my_tree.h>
+
 class Item_sum :public Item_result_field
 {
 public:
@@ -145,11 +147,20 @@ class Item_sum_count_distinct :public Item_sum_int
   table_map used_table_cache;
   bool fix_fields(THD *thd,TABLE_LIST *tables);
   TMP_TABLE_PARAM *tmp_table_param;
-
+  TREE tree;
+  bool use_tree; // If there are no blobs, we can use a tree, which
+  // is faster than heap table. In that case, we still use the table
+  // to help get things set up, but we insert nothing in it
+  int rec_offset; // the first few bytes of record ( at least one)
+  // are just markers for deleted and NULLs. We want to skip them since
+  // they will just bloat the tree without providing any valuable info
+
+  friend int composite_key_cmp(void* arg, byte* key1, byte* key2);
+  
   public:
   Item_sum_count_distinct(List<Item> &list)
     :Item_sum_int(list),table(0),used_table_cache(~(table_map) 0),
-    tmp_table_param(0)
+     tmp_table_param(0),use_tree(0)
   { quick_group=0; }
   ~Item_sum_count_distinct();
   table_map used_tables() const { return used_table_cache; }
author	sasha@mysql.sashanet.com <>	2001-05-11 15:07:34 -0600
committer	sasha@mysql.sashanet.com <>	2001-05-11 15:07:34 -0600
commit	c706bf40f3723e4224844a52f632529fda3412ca (patch)
tree	c1011c976487f613ded4fa7db34dbff547084d18
parent	a12117f0364761f89d6fdb015ece9ada2d1ce494 (diff)
download	mariadb-git-c706bf40f3723e4224844a52f632529fda3412ca.tar.gz