diff options
author | unknown <mronstrom@mysql.com> | 2005-07-18 13:31:02 +0200 |
---|---|---|
committer | unknown <mronstrom@mysql.com> | 2005-07-18 13:31:02 +0200 |
commit | cd483c5520949ee9840628b68cd78b9a8c88e6b5 (patch) | |
tree | 49a4797f25aaf50e6e6c5ab9d193608d969a612e /sql | |
parent | 22545f477752987c8f70c0bc4740d2e8b67a6578 (diff) | |
download | mariadb-git-cd483c5520949ee9840628b68cd78b9a8c88e6b5.tar.gz |
Patch for push of wl1354 Partitioning
Diffstat (limited to 'sql')
44 files changed, 9106 insertions, 314 deletions
diff --git a/sql/Makefile.am b/sql/Makefile.am index b1b14db4cb7..2eab9052ba7 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -63,7 +63,7 @@ noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ parse_file.h sql_view.h sql_trigger.h \ examples/ha_example.h examples/ha_archive.h \ examples/ha_tina.h ha_blackhole.h \ - ha_federated.h + ha_federated.h ha_partition.h mysqld_SOURCES = sql_lex.cc sql_handler.cc \ item.cc item_sum.cc item_buff.cc item_func.cc \ item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \ @@ -100,6 +100,7 @@ mysqld_SOURCES = sql_lex.cc sql_handler.cc \ sp_cache.cc parse_file.cc sql_trigger.cc \ examples/ha_example.cc examples/ha_archive.cc \ examples/ha_tina.cc ha_blackhole.cc \ + ha_partition.cc sql_partition.cc \ ha_federated.cc gen_lex_hash_SOURCES = gen_lex_hash.cc diff --git a/sql/field.cc b/sql/field.cc index 1a3b0f70498..0a8f0a39262 100644 --- a/sql/field.cc +++ b/sql/field.cc @@ -6311,7 +6311,8 @@ my_decimal *Field_varstring::val_decimal(my_decimal *decimal_value) } -int Field_varstring::cmp(const char *a_ptr, const char *b_ptr) +int Field_varstring::cmp_max(const char *a_ptr, const char *b_ptr, + uint max_len) { uint a_length, b_length; int diff; @@ -6326,6 +6327,8 @@ int Field_varstring::cmp(const char *a_ptr, const char *b_ptr) a_length= uint2korr(a_ptr); b_length= uint2korr(b_ptr); } + set_if_smaller(a_length, max_len); + set_if_smaller(b_length, max_len); diff= field_charset->coll->strnncollsp(field_charset, (const uchar*) a_ptr+ length_bytes, @@ -6956,13 +6959,16 @@ int Field_blob::cmp(const char *a,uint32 a_length, const char *b, } -int Field_blob::cmp(const char *a_ptr, const char *b_ptr) +int Field_blob::cmp_max(const char *a_ptr, const char *b_ptr, + uint max_length) { char *blob1,*blob2; memcpy_fixed(&blob1,a_ptr+packlength,sizeof(char*)); memcpy_fixed(&blob2,b_ptr+packlength,sizeof(char*)); - return Field_blob::cmp(blob1,get_length(a_ptr), - blob2,get_length(b_ptr)); + uint a_len= get_length(a_ptr), b_len= 
get_length(b_ptr); + set_if_smaller(a_len, max_length); + set_if_smaller(b_len, max_length); + return Field_blob::cmp(blob1,a_len,blob2,b_len); } @@ -7979,6 +7985,35 @@ my_decimal *Field_bit::val_decimal(my_decimal *deciaml_value) } +/* + Compare two bit fields using pointers within the record. + SYNOPSIS + cmp_max() + a Pointer to field->ptr in first record + b Pointer to field->ptr in second record + max_len Maximum length used in index + DESCRIPTION + This method is used from key_rec_cmp used by merge sorts used + by partitioned index read and later other similar places. + The a and b pointer must be pointers to the field in a record + (not the table->record[0] necessarily) +*/ +int Field_bit::cmp_max(const char *a, const char *b, uint max_len) +{ + my_ptrdiff_t a_diff= a - ptr; + my_ptrdiff_t b_diff= b - ptr; + if (bit_len) + { + int flag; + uchar bits_a= get_rec_bits(bit_ptr+a_diff, bit_ofs, bit_len); + uchar bits_b= get_rec_bits(bit_ptr+b_diff, bit_ofs, bit_len); + if ((flag= (int) (bits_a - bits_b))) + return flag; + } + return memcmp(a, b, field_length); +} + + int Field_bit::key_cmp(const byte *str, uint length) { if (bit_len) diff --git a/sql/field.h b/sql/field.h index 3f6b88198db..9b6df35de43 100644 --- a/sql/field.h +++ b/sql/field.h @@ -87,7 +87,7 @@ public: utype unireg_check; uint32 field_length; // Length of field uint field_index; // field number in fields array - uint16 flags; + uint32 flags; /* fieldnr is the id of the field (first field = 1) as is also used in key_part. 
*/ @@ -154,6 +154,8 @@ public: virtual enum_field_types type() const =0; virtual enum_field_types real_type() const { return type(); } inline int cmp(const char *str) { return cmp(ptr,str); } + virtual int cmp_max(const char *a, const char *b, uint max_len) + { return cmp(a, b); } virtual int cmp(const char *,const char *)=0; virtual int cmp_binary(const char *a,const char *b, uint32 max_length=~0L) { return memcmp(a,b,pack_length()); } @@ -1059,7 +1061,11 @@ public: longlong val_int(void); String *val_str(String*,String *); my_decimal *val_decimal(my_decimal *); - int cmp(const char *,const char*); + int cmp_max(const char *, const char *, uint max_length); + int cmp(const char *a,const char*b) + { + return cmp_max(a, b, ~0); + } void sort_string(char *buff,uint length); void get_key_image(char *buff,uint length, imagetype type); void set_key_image(char *buff,uint length); @@ -1115,7 +1121,9 @@ public: longlong val_int(void); String *val_str(String*,String *); my_decimal *val_decimal(my_decimal *); - int cmp(const char *,const char*); + int cmp_max(const char *, const char *, uint max_length); + int cmp(const char *a,const char*b) + { return cmp_max(a, b, ~0); } int cmp(const char *a, uint32 a_length, const char *b, uint32 b_length); int cmp_binary(const char *a,const char *b, uint32 max_length=~0L); int key_cmp(const byte *,const byte*); @@ -1139,6 +1147,10 @@ public: { memcpy_fixed(str,ptr+packlength,sizeof(char*)); } + inline void get_ptr(char **str, uint row_offset) + { + memcpy_fixed(str,ptr+packlength+row_offset,sizeof(char*)); + } inline void set_ptr(char *length,char *data) { memcpy(ptr,length,packlength); @@ -1307,6 +1319,7 @@ public: my_decimal *val_decimal(my_decimal *); int cmp(const char *a, const char *b) { return cmp_binary(a, b); } + int cmp_max(const char *a, const char *b, uint max_length); int key_cmp(const byte *a, const byte *b) { return cmp_binary((char *) a, (char *) b); } int key_cmp(const byte *str, uint length); diff --git 
a/sql/ha_berkeley.cc b/sql/ha_berkeley.cc index 568fb727e63..6b283079072 100644 --- a/sql/ha_berkeley.cc +++ b/sql/ha_berkeley.cc @@ -1360,7 +1360,7 @@ int ha_berkeley::delete_row(const byte * record) } -int ha_berkeley::index_init(uint keynr) +int ha_berkeley::index_init(uint keynr, bool sorted) { int error; DBUG_ENTER("ha_berkeley::index_init"); @@ -1638,7 +1638,7 @@ int ha_berkeley::rnd_init(bool scan) { DBUG_ENTER("rnd_init"); current_row.flags=DB_DBT_REALLOC; - DBUG_RETURN(index_init(primary_key)); + DBUG_RETURN(index_init(primary_key, 0)); } int ha_berkeley::rnd_end() @@ -2146,7 +2146,7 @@ ulonglong ha_berkeley::get_auto_increment() (void) ha_berkeley::extra(HA_EXTRA_KEYREAD); /* Set 'active_index' */ - ha_berkeley::index_init(table->s->next_number_index); + ha_berkeley::index_init(table->s->next_number_index, 0); if (!table->s->next_number_key_offset) { // Autoincrement at key-start @@ -2485,7 +2485,7 @@ void ha_berkeley::get_status() if (!(share->status & STATUS_PRIMARY_KEY_INIT)) { (void) extra(HA_EXTRA_KEYREAD); - index_init(primary_key); + index_init(primary_key, 0); if (!index_last(table->record[1])) share->auto_ident=uint5korr(current_ident); index_end(); diff --git a/sql/ha_berkeley.h b/sql/ha_berkeley.h index f6376939445..596a59b4d43 100644 --- a/sql/ha_berkeley.h +++ b/sql/ha_berkeley.h @@ -98,7 +98,7 @@ class ha_berkeley: public handler const char **bas_ext() const; ulong table_flags(void) const { return int_table_flags; } uint max_supported_keys() const { return MAX_KEY-1; } - uint extra_rec_buf_length() { return BDB_HIDDEN_PRIMARY_KEY_LENGTH; } + uint extra_rec_buf_length() const { return BDB_HIDDEN_PRIMARY_KEY_LENGTH; } ha_rows estimate_rows_upper_bound(); const key_map *keys_to_use_for_scanning() { return &key_map_full; } bool has_transactions() { return 1;} @@ -109,7 +109,7 @@ class ha_berkeley: public handler int write_row(byte * buf); int update_row(const byte * old_data, byte * new_data); int delete_row(const byte * buf); - int 
index_init(uint index); + int index_init(uint index, bool sorted); int index_end(); int index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag); diff --git a/sql/ha_federated.cc b/sql/ha_federated.cc index bbcae0613e7..1cec6faea04 100644 --- a/sql/ha_federated.cc +++ b/sql/ha_federated.cc @@ -1512,7 +1512,7 @@ int ha_federated::index_read_idx(byte *buf, uint index, const byte *key, } /* Initialized at each key walk (called multiple times unlike rnd_init()) */ -int ha_federated::index_init(uint keynr) +int ha_federated::index_init(uint keynr, bool sorted) { DBUG_ENTER("ha_federated::index_init"); DBUG_PRINT("info", diff --git a/sql/ha_federated.h b/sql/ha_federated.h index f084976718c..5908fe5afba 100644 --- a/sql/ha_federated.h +++ b/sql/ha_federated.h @@ -154,7 +154,7 @@ public: int write_row(byte * buf); int update_row(const byte * old_data, byte * new_data); int delete_row(const byte * buf); - int index_init(uint keynr); + int index_init(uint keynr, bool sorted); int index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag); int index_read_idx(byte * buf, uint idx, const byte * key, diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index 4167b7c2dde..1dfc64c9137 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -3595,7 +3595,8 @@ int ha_innobase::index_init( /*====================*/ /* out: 0 or error number */ - uint keynr) /* in: key (index) number */ + uint keynr, /* in: key (index) number */ + bool sorted) /* in: 1 if result MUST be sorted according to index */ { int error = 0; DBUG_ENTER("index_init"); @@ -6646,7 +6647,7 @@ ha_innobase::innobase_read_and_init_auto_inc( } (void) extra(HA_EXTRA_KEYREAD); - index_init(table->s->next_number_index); + index_init(table->s->next_number_index, 1); /* Starting from 5.0.9, we use a consistent read to read the auto-inc column maximum value. 
This eliminates the spurious deadlocks caused diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h index 98496e748b4..150eae63730 100644 --- a/sql/ha_innodb.h +++ b/sql/ha_innodb.h @@ -136,7 +136,7 @@ class ha_innobase: public handler int delete_row(const byte * buf); void unlock_row(); - int index_init(uint index); + int index_init(uint index, bool sorted); int index_end(); int index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag); diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc index 7d182d07087..1f830342b73 100644 --- a/sql/ha_ndbcluster.cc +++ b/sql/ha_ndbcluster.cc @@ -34,6 +34,7 @@ // options from from mysqld.cc extern my_bool opt_ndb_optimized_node_selection; +extern my_bool opt_ndb_linear_hash; extern const char *opt_ndbcluster_connectstring; // Default value for parallelism @@ -99,6 +100,7 @@ static HASH ndbcluster_open_tables; static byte *ndbcluster_get_key(NDB_SHARE *share,uint *length, my_bool not_used __attribute__((unused))); +static void ndb_set_fragmentation(NDBTAB & tab, TABLE *table, uint pk_len); static NDB_SHARE *get_share(const char *table_name); static void free_share(NDB_SHARE *share); @@ -861,11 +863,9 @@ bool ha_ndbcluster::uses_blob_value() { uint no_fields= table->s->fields; int i; - THD *thd= current_thd; // They always put blobs at the end.. 
for (i= no_fields - 1; i >= 0; i--) { - Field *field= table->field[i]; if ((m_write_op && ha_get_bit_in_write_set(i+1)) || (!m_write_op && ha_get_bit_in_read_set(i+1))) { @@ -1292,8 +1292,6 @@ inline int ha_ndbcluster::define_read_attrs(byte* buf, NdbOperation* op) { uint i; - THD *thd= current_thd; - DBUG_ENTER("define_read_attrs"); // Define attributes to read @@ -1333,7 +1331,8 @@ int ha_ndbcluster::define_read_attrs(byte* buf, NdbOperation* op) Read one record from NDB using primary key */ -int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) +int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf, + uint32 part_id) { uint no_fields= table->s->fields; NdbConnection *trans= m_active_trans; @@ -1351,6 +1350,8 @@ int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) op->readTuple(lm) != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + op->setPartitionId(part_id); if (table->s->primary_key == MAX_KEY) { // This table has no primary key, use "hidden" primary key @@ -1388,12 +1389,12 @@ int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) Read one complementing record from NDB using primary key from old_data */ -int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data) +int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data, + uint32 old_part_id) { uint no_fields= table->s->fields, i; NdbTransaction *trans= m_active_trans; NdbOperation *op; - THD *thd= current_thd; DBUG_ENTER("complemented_pk_read"); m_write_op= FALSE; @@ -1411,6 +1412,10 @@ int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data) int res; if ((res= set_primary_key_from_record(op, old_data))) ERR_RETURN(trans->getNdbError()); + + if (m_use_partition_function) + op->setPartitionId(old_part_id); + // Read all unreferenced non-key field(s) for (i= 0; i < no_fields; i++) { @@ -1469,6 +1474,17 @@ int ha_ndbcluster::peek_row(const byte *record) if ((res= 
set_primary_key_from_record(op, record))) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + { + uint32 part_id; + int error; + if ((error= m_part_info->get_partition_id(m_part_info, &part_id))) + { + DBUG_RETURN(error); + } + op->setPartitionId(part_id); + } + if (execute_no_commit_ie(this,trans) != 0) { table->status= STATUS_NOT_FOUND; @@ -1807,7 +1823,8 @@ int ha_ndbcluster::set_bounds(NdbIndexScanOperation *op, int ha_ndbcluster::ordered_index_scan(const key_range *start_key, const key_range *end_key, - bool sorted, bool descending, byte* buf) + bool sorted, bool descending, + byte* buf, part_id_range *part_spec) { int res; bool restart; @@ -1833,11 +1850,17 @@ int ha_ndbcluster::ordered_index_scan(const key_range *start_key, (const NDBTAB *) m_table)) || op->readTuples(lm, 0, parallelism, sorted, descending)) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function && part_spec != NULL && + part_spec->start_part == part_spec->end_part) + op->setPartitionId(part_spec->start_part); m_active_cursor= op; } else { restart= TRUE; op= (NdbIndexScanOperation*)m_active_cursor; + if (m_use_partition_function && part_spec != NULL && + part_spec->start_part == part_spec->end_part) + op->setPartitionId(part_spec->start_part); DBUG_ASSERT(op->getSorted() == sorted); DBUG_ASSERT(op->getLockMode() == (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type)); @@ -1937,6 +1960,17 @@ int ha_ndbcluster::write_row(byte *record) if (res != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + { + uint32 part_id; + int error; + if ((error= m_part_info->get_partition_id(m_part_info, &part_id))) + { + DBUG_RETURN(error); + } + op->setPartitionId(part_id); + } + if (table->s->primary_key == MAX_KEY) { // Table has hidden primary key @@ -2094,6 +2128,8 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) NdbScanOperation* cursor= m_active_cursor; NdbOperation *op; uint i; + uint32 old_part_id= 0, new_part_id= 0; + int error; 
DBUG_ENTER("update_row"); m_write_op= TRUE; @@ -2104,15 +2140,23 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) ha_set_bit_in_write_set(table->timestamp_field->fieldnr); } + if (m_use_partition_function && + (error= get_parts_for_update(old_data, new_data, table->record[0], + m_part_info, &old_part_id, &new_part_id))) + { + DBUG_RETURN(error); + } + /* Check for update of primary key for special handling */ if ((table->s->primary_key != MAX_KEY) && - (key_cmp(table->s->primary_key, old_data, new_data))) + (key_cmp(table->s->primary_key, old_data, new_data)) || + (old_part_id != new_part_id)) { int read_res, insert_res, delete_res, undo_res; DBUG_PRINT("info", ("primary key update, doing pk read+delete+insert")); // Get all old fields, since we optimize away fields not in query - read_res= complemented_pk_read(old_data, new_data); + read_res= complemented_pk_read(old_data, new_data, old_part_id); if (read_res) { DBUG_PRINT("info", ("pk read failed")); @@ -2168,6 +2212,8 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) m_ops_pending++; if (uses_blob_value()) m_blobs_pending= TRUE; + if (m_use_partition_function) + cursor->setPartitionId(new_part_id); } else { @@ -2175,6 +2221,8 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) op->updateTuple() != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + op->setPartitionId(new_part_id); if (table->s->primary_key == MAX_KEY) { // This table has no primary key, use "hidden" primary key @@ -2230,12 +2278,21 @@ int ha_ndbcluster::delete_row(const byte *record) NdbTransaction *trans= m_active_trans; NdbScanOperation* cursor= m_active_cursor; NdbOperation *op; + uint32 part_id; + int error; DBUG_ENTER("delete_row"); m_write_op= TRUE; statistic_increment(thd->status_var.ha_delete_count,&LOCK_status); m_rows_changed++; + if (m_use_partition_function && + (error= get_part_for_delete(record, table->record[0], m_part_info, + &part_id))) + { + 
DBUG_RETURN(error); + } + if (cursor) { /* @@ -2250,6 +2307,9 @@ int ha_ndbcluster::delete_row(const byte *record) ERR_RETURN(trans->getNdbError()); m_ops_pending++; + if (m_use_partition_function) + cursor->setPartitionId(part_id); + no_uncommitted_rows_update(-1); if (!m_primary_key_update) @@ -2263,6 +2323,9 @@ int ha_ndbcluster::delete_row(const byte *record) op->deleteTuple() != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + op->setPartitionId(part_id); + no_uncommitted_rows_update(-1); if (table->s->primary_key == MAX_KEY) @@ -2388,8 +2451,6 @@ void ha_ndbcluster::print_results() DBUG_ENTER("print_results"); #ifndef DBUG_OFF - const NDBTAB *tab= (const NDBTAB*) m_table; - if (!_db_on_) DBUG_VOID_RETURN; @@ -2444,11 +2505,13 @@ print_value: } -int ha_ndbcluster::index_init(uint index) +int ha_ndbcluster::index_init(uint index, bool sorted) { DBUG_ENTER("ha_ndbcluster::index_init"); - DBUG_PRINT("enter", ("index: %u", index)); - DBUG_RETURN(handler::index_init(index)); + DBUG_PRINT("enter", ("index: %u sorted: %d", index, sorted)); + active_index= index; + m_sorted= sorted; + DBUG_RETURN(0); } @@ -2485,56 +2548,16 @@ int ha_ndbcluster::index_read(byte *buf, const byte *key, uint key_len, enum ha_rkey_function find_flag) { + key_range start_key; + bool descending= FALSE; DBUG_ENTER("ha_ndbcluster::index_read"); DBUG_PRINT("enter", ("active_index: %u, key_len: %u, find_flag: %d", active_index, key_len, find_flag)); - int error; - ndb_index_type type= get_index_type(active_index); - const KEY* key_info= table->key_info+active_index; - m_write_op= FALSE; - switch (type){ - case PRIMARY_KEY_ORDERED_INDEX: - case PRIMARY_KEY_INDEX: - if (find_flag == HA_READ_KEY_EXACT && key_info->key_length == key_len) - { - if (m_active_cursor && (error= close_scan())) - DBUG_RETURN(error); - DBUG_RETURN(pk_read(key, key_len, buf)); - } - else if (type == PRIMARY_KEY_INDEX) - { - DBUG_RETURN(1); - } - break; - case UNIQUE_ORDERED_INDEX: - case UNIQUE_INDEX: 
- if (find_flag == HA_READ_KEY_EXACT && key_info->key_length == key_len && - !check_null_in_key(key_info, key, key_len)) - { - if (m_active_cursor && (error= close_scan())) - DBUG_RETURN(error); - DBUG_RETURN(unique_index_read(key, key_len, buf)); - } - else if (type == UNIQUE_INDEX) - { - DBUG_RETURN(1); - } - break; - case ORDERED_INDEX: - break; - default: - case UNDEFINED_INDEX: - DBUG_ASSERT(FALSE); - DBUG_RETURN(1); - break; - } - - key_range start_key; start_key.key= key; start_key.length= key_len; start_key.flag= find_flag; - bool descending= FALSE; + descending= FALSE; switch (find_flag) { case HA_READ_KEY_OR_PREV: case HA_READ_BEFORE_KEY: @@ -2545,8 +2568,8 @@ int ha_ndbcluster::index_read(byte *buf, default: break; } - error= ordered_index_scan(&start_key, 0, TRUE, descending, buf); - DBUG_RETURN(error == HA_ERR_END_OF_FILE ? HA_ERR_KEY_NOT_FOUND : error); + DBUG_RETURN(read_range_first_to_buf(&start_key, 0, descending, + m_sorted, buf)); } @@ -2557,7 +2580,7 @@ int ha_ndbcluster::index_read_idx(byte *buf, uint index_no, statistic_increment(current_thd->status_var.ha_read_key_count, &LOCK_status); DBUG_ENTER("ha_ndbcluster::index_read_idx"); DBUG_PRINT("enter", ("index_no: %u, key_len: %u", index_no, key_len)); - index_init(index_no); + index_init(index_no, 0); DBUG_RETURN(index_read(buf, key, key_len, find_flag)); } @@ -2588,7 +2611,7 @@ int ha_ndbcluster::index_first(byte *buf) // Start the ordered index scan and fetch the first row // Only HA_READ_ORDER indexes get called by index_first - DBUG_RETURN(ordered_index_scan(0, 0, TRUE, FALSE, buf)); + DBUG_RETURN(ordered_index_scan(0, 0, TRUE, FALSE, buf, NULL)); } @@ -2596,7 +2619,7 @@ int ha_ndbcluster::index_last(byte *buf) { DBUG_ENTER("ha_ndbcluster::index_last"); statistic_increment(current_thd->status_var.ha_read_last_count,&LOCK_status); - DBUG_RETURN(ordered_index_scan(0, 0, TRUE, TRUE, buf)); + DBUG_RETURN(ordered_index_scan(0, 0, TRUE, TRUE, buf, NULL)); } int ha_ndbcluster::index_read_last(byte 
* buf, const byte * key, uint key_len) @@ -2605,67 +2628,76 @@ int ha_ndbcluster::index_read_last(byte * buf, const byte * key, uint key_len) DBUG_RETURN(index_read(buf, key, key_len, HA_READ_PREFIX_LAST)); } -inline int ha_ndbcluster::read_range_first_to_buf(const key_range *start_key, const key_range *end_key, - bool eq_r, bool sorted, + bool desc, bool sorted, byte* buf) { - KEY* key_info; - int error= 1; + part_id_range part_spec; + ndb_index_type type= get_index_type(active_index); + const KEY* key_info= table->key_info+active_index; + int error; DBUG_ENTER("ha_ndbcluster::read_range_first_to_buf"); - DBUG_PRINT("info", ("eq_r: %d, sorted: %d", eq_r, sorted)); + DBUG_PRINT("info", ("desc: %d, sorted: %d", desc, sorted)); - switch (get_index_type(active_index)){ + if (m_use_partition_function) + { + get_partition_set(table, buf, active_index, start_key, &part_spec); + if (part_spec.start_part > part_spec.end_part) + { + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + else if (part_spec.start_part == part_spec.end_part) + { + /* + Only one partition is required to scan, if sorted is required we + don't need it any more since output from one ordered partitioned + index is always sorted. + */ + sorted= FALSE; + } + } + m_write_op= FALSE; + switch (type){ case PRIMARY_KEY_ORDERED_INDEX: case PRIMARY_KEY_INDEX: - key_info= table->key_info + active_index; if (start_key && start_key->length == key_info->key_length && start_key->flag == HA_READ_KEY_EXACT) { if (m_active_cursor && (error= close_scan())) DBUG_RETURN(error); - error= pk_read(start_key->key, start_key->length, buf); - DBUG_RETURN(error == HA_ERR_KEY_NOT_FOUND ? 
HA_ERR_END_OF_FILE : error); + DBUG_RETURN(pk_read(start_key->key, start_key->length, buf, + part_spec.start_part)); } break; case UNIQUE_ORDERED_INDEX: case UNIQUE_INDEX: - key_info= table->key_info + active_index; if (start_key && start_key->length == key_info->key_length && start_key->flag == HA_READ_KEY_EXACT && !check_null_in_key(key_info, start_key->key, start_key->length)) { if (m_active_cursor && (error= close_scan())) DBUG_RETURN(error); - error= unique_index_read(start_key->key, start_key->length, buf); - DBUG_RETURN(error == HA_ERR_KEY_NOT_FOUND ? HA_ERR_END_OF_FILE : error); + DBUG_RETURN(unique_index_read(start_key->key, start_key->length, buf)); } break; default: break; } - // Start the ordered index scan and fetch the first row - error= ordered_index_scan(start_key, end_key, sorted, FALSE, buf); - DBUG_RETURN(error); + DBUG_RETURN(ordered_index_scan(start_key, end_key, sorted, desc, buf, + &part_spec)); } - int ha_ndbcluster::read_range_first(const key_range *start_key, const key_range *end_key, bool eq_r, bool sorted) { byte* buf= table->record[0]; DBUG_ENTER("ha_ndbcluster::read_range_first"); - m_write_op= FALSE; - - DBUG_RETURN(read_range_first_to_buf(start_key, - end_key, - eq_r, - sorted, - buf)); + DBUG_RETURN(read_range_first_to_buf(start_key, end_key, FALSE, + sorted, buf)); } int ha_ndbcluster::read_range_next() @@ -2691,7 +2723,7 @@ int ha_ndbcluster::rnd_init(bool scan) DBUG_RETURN(-1); } } - index_init(table->s->primary_key); + index_init(table->s->primary_key, 0); DBUG_RETURN(0); } @@ -2758,7 +2790,20 @@ int ha_ndbcluster::rnd_pos(byte *buf, byte *pos) &LOCK_status); // The primary key for the record is stored in pos // Perform a pk_read using primary key "index" - DBUG_RETURN(pk_read(pos, ref_length, buf)); + { + part_id_range part_spec; + if (m_use_partition_function) + { + key_range key_spec; + KEY *key_info= table->key_info + active_index; + key_spec.key= pos; + key_spec.length= ref_length; + key_spec.flag= HA_READ_KEY_EXACT; + 
get_full_part_id_from_key(table, buf, key_info, &key_spec, &part_spec); + DBUG_ASSERT(part_spec.start_part == part_spec.end_part); + } + DBUG_RETURN(pk_read(pos, ref_length, buf, part_spec.start_part)); + } } @@ -2904,6 +2949,8 @@ int ha_ndbcluster::extra(enum ha_extra_function operation) m_use_write= FALSE; m_ignore_dup_key= FALSE; break; + default: + break; } DBUG_RETURN(0); @@ -3691,56 +3738,6 @@ static int create_ndb_column(NDBCOL &col, return 0; } -/* - Create a table in NDB Cluster - */ - -static void ndb_set_fragmentation(NDBTAB &tab, TABLE *form, uint pk_length) -{ - if (form->s->max_rows == (ha_rows) 0) /* default setting, don't set fragmentation */ - return; - /** - * get the number of fragments right - */ - uint no_fragments; - { -#if MYSQL_VERSION_ID >= 50000 - uint acc_row_size= 25 + /*safety margin*/ 2; -#else - uint acc_row_size= pk_length*4; - /* add acc overhead */ - if (pk_length <= 8) /* main page will set the limit */ - acc_row_size+= 25 + /*safety margin*/ 2; - else /* overflow page will set the limit */ - acc_row_size+= 4 + /*safety margin*/ 4; -#endif - ulonglong acc_fragment_size= 512*1024*1024; - ulonglong max_rows= form->s->max_rows; -#if MYSQL_VERSION_ID >= 50100 - no_fragments= (max_rows*acc_row_size)/acc_fragment_size+1; -#else - no_fragments= ((max_rows*acc_row_size)/acc_fragment_size+1 - +1/*correct rounding*/)/2; -#endif - } - { - uint no_nodes= g_ndb_cluster_connection->no_db_nodes(); - NDBTAB::FragmentType ftype; - if (no_fragments > 2*no_nodes) - { - ftype= NDBTAB::FragAllLarge; - if (no_fragments > 4*no_nodes) - push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, - "Ndb might have problems storing the max amount of rows specified"); - } - else if (no_fragments > no_nodes) - ftype= NDBTAB::FragAllMedium; - else - ftype= NDBTAB::FragAllSmall; - tab.setFragmentType(ftype); - } -} - int ha_ndbcluster::create(const char *name, TABLE *form, HA_CREATE_INFO *info) @@ -3843,7 +3840,22 @@ int 
ha_ndbcluster::create(const char *name, } } - ndb_set_fragmentation(tab, form, pk_length); + // Check partition info + partition_info *part_info= form->s->part_info; + if (part_info) + { + int error; + if ((error= set_up_partition_info(part_info, form, (void*)&tab))) + { + DBUG_RETURN(error); + } + } + else + { + ndb_set_fragmentation(tab, form, pk_length); + } + + if ((my_errno= check_ndb_connection())) DBUG_RETURN(my_errno); @@ -4092,6 +4104,9 @@ ha_ndbcluster::ha_ndbcluster(TABLE *table_arg): HA_NEED_READ_RANGE_BUFFER | HA_CAN_BIT_FIELD), m_share(0), + m_part_info(NULL), + m_use_partition_function(FALSE), + m_sorted(FALSE), m_use_write(FALSE), m_ignore_dup_key(FALSE), m_primary_key_update(FALSE), @@ -4206,6 +4221,15 @@ int ha_ndbcluster::open(const char *name, int mode, uint test_if_locked) if (!res) info(HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (table->s->part_info) + { + m_part_info= table->s->part_info; + if (!(m_part_info->part_type == HASH_PARTITION && + m_part_info->list_of_part_fields && + !is_sub_partitioned(m_part_info))) + m_use_partition_function= TRUE; + } + DBUG_RETURN(res); } @@ -5478,12 +5502,29 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, for (; multi_range_curr<multi_range_end && curr+reclength <= end_of_buffer; multi_range_curr++) { - switch (index_type){ + part_id_range part_spec; + if (m_use_partition_function) + { + get_partition_set(table, curr, active_index, + &multi_range_curr->start_key, + &part_spec); + if (part_spec.start_part > part_spec.end_part) + { + /* + We can skip this partition since the key won't fit into any + partition + */ + curr += reclength; + multi_range_curr->range_flag |= SKIP_RANGE; + continue; + } + } + switch(index_type){ case PRIMARY_KEY_ORDERED_INDEX: if (!(multi_range_curr->start_key.length == key_info->key_length && - multi_range_curr->start_key.flag == HA_READ_KEY_EXACT)) - goto range; - /* fall through */ + multi_range_curr->start_key.flag == HA_READ_KEY_EXACT)) + goto range; + 
// else fall through case PRIMARY_KEY_INDEX: { multi_range_curr->range_flag |= UNIQUE_RANGE; @@ -5491,7 +5532,9 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, !op->readTuple(lm) && !set_primary_key(op, multi_range_curr->start_key.key) && !define_read_attrs(curr, op) && - (op->setAbortOption(AO_IgnoreError), TRUE)) + (op->setAbortOption(AO_IgnoreError), TRUE) && + (!m_use_partition_function || + (op->setPartitionId(part_spec.start_part), true))) curr += reclength; else ERR_RETURN(op ? op->getNdbError() : m_active_trans->getNdbError()); @@ -5500,11 +5543,11 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, break; case UNIQUE_ORDERED_INDEX: if (!(multi_range_curr->start_key.length == key_info->key_length && - multi_range_curr->start_key.flag == HA_READ_KEY_EXACT && - !check_null_in_key(key_info, multi_range_curr->start_key.key, - multi_range_curr->start_key.length))) - goto range; - /* fall through */ + multi_range_curr->start_key.flag == HA_READ_KEY_EXACT && + !check_null_in_key(key_info, multi_range_curr->start_key.key, + multi_range_curr->start_key.length))) + goto range; + // else fall through case UNIQUE_INDEX: { multi_range_curr->range_flag |= UNIQUE_RANGE; @@ -5518,8 +5561,7 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, ERR_RETURN(op ? 
op->getNdbError() : m_active_trans->getNdbError()); break; } - case ORDERED_INDEX: - { + case ORDERED_INDEX: { range: multi_range_curr->range_flag &= ~(uint)UNIQUE_RANGE; if (scanOp == 0) @@ -5594,7 +5636,7 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, } #if 0 -#define DBUG_MULTI_RANGE(x) printf("read_multi_range_next: case %d\n", x); +#define DBUG_MULTI_RANGE(x) DBUG_PRINT("info", ("read_multi_range_next: case %d\n", x)); #else #define DBUG_MULTI_RANGE(x) #endif @@ -5605,6 +5647,7 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) DBUG_ENTER("ha_ndbcluster::read_multi_range_next"); if (m_disable_multi_read) { + DBUG_MULTI_RANGE(11); DBUG_RETURN(handler::read_multi_range_next(multi_range_found_p)); } @@ -5614,10 +5657,16 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) const NdbOperation* op= m_current_multi_operation; for (;multi_range_curr < m_multi_range_defined; multi_range_curr++) { + DBUG_MULTI_RANGE(12); + if (multi_range_curr->range_flag & SKIP_RANGE) + continue; if (multi_range_curr->range_flag & UNIQUE_RANGE) { if (op->getNdbError().code == 0) + { + DBUG_MULTI_RANGE(13); goto found_next; + } op= m_active_trans->getNextCompletedOperation(op); m_multi_range_result_ptr += reclength; @@ -5634,6 +5683,7 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) } else { + DBUG_MULTI_RANGE(14); goto close_scan; } } @@ -5667,6 +5717,7 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) DBUG_ASSERT(range_no == -1); if ((res= m_multi_cursor->nextResult(true))) { + DBUG_MULTI_RANGE(15); goto close_scan; } multi_range_curr--; // Will be increased in for-loop @@ -5694,12 +5745,16 @@ close_scan: } else { + DBUG_MULTI_RANGE(9); DBUG_RETURN(ndb_err(m_active_trans)); } } if (multi_range_curr == multi_range_end) + { + DBUG_MULTI_RANGE(16); DBUG_RETURN(HA_ERR_END_OF_FILE); + } /** * Read remaining ranges @@ -6916,6 +6971,8 @@ 
ha_ndbcluster::build_scan_filter_predicate(Ndb_cond * &cond, : NULL; break; default: + field= NULL; //Keep compiler happy + DBUG_ASSERT(0); break; } switch ((negated) ? @@ -7263,4 +7320,178 @@ ha_ndbcluster::generate_scan_filter(Ndb_cond_stack *ndb_cond_stack, DBUG_RETURN(0); } + +/* + Create a table in NDB Cluster + */ +static uint get_no_fragments(ulonglong max_rows) +{ +#if MYSQL_VERSION_ID >= 50000 + uint acc_row_size= 25 + /*safety margin*/ 2; +#else + uint acc_row_size= pk_length*4; + /* add acc overhead */ + if (pk_length <= 8) /* main page will set the limit */ + acc_row_size+= 25 + /*safety margin*/ 2; + else /* overflow page will set the limit */ + acc_row_size+= 4 + /*safety margin*/ 4; +#endif + ulonglong acc_fragment_size= 512*1024*1024; +#if MYSQL_VERSION_ID >= 50100 + return (max_rows*acc_row_size)/acc_fragment_size+1; +#else + return ((max_rows*acc_row_size)/acc_fragment_size+1 + +1/*correct rounding*/)/2; +#endif +} + + +/* + Routine to adjust default number of partitions to always be a multiple + of number of nodes and never more than 4 times the number of nodes. + +*/ +static bool adjusted_frag_count(uint no_fragments, uint no_nodes, + uint &reported_frags) +{ + uint i= 0; + reported_frags= no_nodes; + while (reported_frags < no_fragments && ++i < 4 && + (reported_frags + no_nodes) < MAX_PARTITIONS) + reported_frags+= no_nodes; + return (reported_frags < no_fragments); +} + +int ha_ndbcluster::get_default_no_partitions(ulonglong max_rows) +{ + uint reported_frags; + uint no_fragments= get_no_fragments(max_rows); + uint no_nodes= g_ndb_cluster_connection->no_db_nodes(); + adjusted_frag_count(no_fragments, no_nodes, reported_frags); + return (int)reported_frags; +} + + +/* + User defined partitioning set-up. We need to check how many fragments the + user wants defined and which node groups to put those into. Later we also + want to attach those partitions to a tablespace. 
+ + All the functionality of the partition function, partition limits and so + forth are entirely handled by the MySQL Server. There is one exception to + this rule for PARTITION BY KEY where NDB handles the hash function and + this type can thus be handled transparently also by NDB API program. + For RANGE, HASH and LIST and subpartitioning the NDB API programs must + implement the function to map to a partition. +*/ + +uint ha_ndbcluster::set_up_partition_info(partition_info *part_info, + TABLE *table, + void *tab_par) +{ + DBUG_ENTER("ha_ndbcluster::set_up_partition_info"); + ushort node_group[MAX_PARTITIONS]; + ulong ng_index= 0, i, j; + NDBTAB *tab= (NDBTAB*)tab_par; + NDBTAB::FragmentType ftype= NDBTAB::UserDefined; + partition_element *part_elem; + + if (part_info->part_type == HASH_PARTITION && + part_info->list_of_part_fields == TRUE) + { + Field **fields= part_info->part_field_array; + + if (part_info->linear_hash_ind) + ftype= NDBTAB::DistrKeyLin; + else + ftype= NDBTAB::DistrKeyHash; + + for (i= 0; i < part_info->part_field_list.elements; i++) + { + NDBCOL *col= tab->getColumn(fields[i]->fieldnr - 1); + DBUG_PRINT("info",("setting dist key on %s", col->getName())); + col->setPartitionKey(TRUE); + } + } + List_iterator<partition_element> part_it(part_info->partitions); + for (i= 0; i < part_info->no_parts; i++) + { + part_elem= part_it++; + if (!is_sub_partitioned(part_info)) + { + node_group[ng_index++]= part_elem->nodegroup_id; + //Here we should insert tablespace id based on tablespace name + } + else + { + List_iterator<partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < part_info->no_subparts; j++) + { + part_elem= sub_it++; + node_group[ng_index++]= part_elem->nodegroup_id; + //Here we should insert tablespace id based on tablespace name + } + } + } + { + uint no_nodes= g_ndb_cluster_connection->no_db_nodes(); + if (ng_index > 4 * no_nodes) + { + DBUG_RETURN(1300); + } + } + tab->setNodeGroupIds(&node_group, ng_index); + 
tab->setFragmentType(ftype); + DBUG_RETURN(0); +} + + +/* + This routine is used to set-up fragmentation when the user has only specified + ENGINE = NDB and no user defined partitioning what so ever. Thus all values + will be based on default values. We will choose Linear Hash or Hash with + perfect spread dependent on a session variable defined in MySQL. +*/ + +static void ndb_set_fragmentation(NDBTAB &tab, TABLE *form, uint pk_length) +{ + NDBTAB::FragmentType ftype; + ushort node_group[MAX_PARTITIONS]; + uint no_nodes= g_ndb_cluster_connection->no_db_nodes(), no_fragments, i; + DBUG_ENTER("ndb_set_fragmentation"); + + if (form->s->max_rows == (ha_rows) 0) + { + no_fragments= no_nodes; + } + else + { + /* + Ensure that we get enough fragments to handle all rows and ensure that + the table is fully distributed by keeping the number of fragments a + multiple of the number of nodes. + */ + uint fragments= get_no_fragments(form->s->max_rows); + if (adjusted_frag_count(fragments, no_nodes, no_fragments)) + { + push_warning(current_thd, + MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, + "Ndb might have problems storing the max amount of rows specified"); + } + } + /* + Always start with node group 0 and continue with next node group from + there + */ + node_group[0]= 0; + for (i= 1; i < no_fragments; i++) + node_group[i]= UNDEF_NODEGROUP; + if (opt_ndb_linear_hash) + ftype= NDBTAB::DistrKeyLin; + else + ftype= NDBTAB::DistrKeyHash; + tab.setFragmentType(ftype); + tab.setNodeGroupIds(&node_group, no_fragments); + DBUG_VOID_RETURN; +} #endif /* HAVE_NDBCLUSTER_DB */ diff --git a/sql/ha_ndbcluster.h b/sql/ha_ndbcluster.h index 6efc18f2f6a..f85b0fa8a04 100644 --- a/sql/ha_ndbcluster.h +++ b/sql/ha_ndbcluster.h @@ -420,7 +420,7 @@ class ha_ndbcluster: public handler int write_row(byte *buf); int update_row(const byte *old_data, byte *new_data); int delete_row(const byte *buf); - int index_init(uint index); + int index_init(uint index, bool sorted); int index_end(); int 
index_read(byte *buf, const byte *key, uint key_len, enum ha_rkey_function find_flag); @@ -462,6 +462,11 @@ class ha_ndbcluster: public handler const char * table_type() const; const char ** bas_ext() const; ulong table_flags(void) const; + ulong partition_flags(void) const + { + return (HA_CAN_PARTITION | HA_CAN_UPDATE_PARTITION_KEY | + HA_CAN_PARTITION_UNIQUE); + } ulong index_flags(uint idx, uint part, bool all_parts) const; uint max_supported_record_length() const; uint max_supported_keys() const; @@ -471,6 +476,7 @@ class ha_ndbcluster: public handler int rename_table(const char *from, const char *to); int delete_table(const char *name); int create(const char *name, TABLE *form, HA_CREATE_INFO *info); + int get_default_no_partitions(ulonglong max_rows); THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type); @@ -549,15 +555,21 @@ private: NDB_INDEX_TYPE get_index_type_from_table(uint index_no) const; int check_index_fields_not_null(uint index_no); - int pk_read(const byte *key, uint key_len, byte *buf); - int complemented_pk_read(const byte *old_data, byte *new_data); - int peek_row(const byte *record); - int unique_index_read(const byte *key, uint key_len, - byte *buf); + uint set_up_partition_info(partition_info *part_info, + TABLE *table, + void *tab); + int complemented_pk_read(const byte *old_data, byte *new_data, + uint32 old_part_id); + int pk_read(const byte *key, uint key_len, byte *buf, uint32 part_id); int ordered_index_scan(const key_range *start_key, const key_range *end_key, - bool sorted, bool descending, byte* buf); + bool sorted, bool descending, byte* buf, + part_id_range *part_spec); int full_table_scan(byte * buf); + + int peek_row(const byte *record); + int unique_index_read(const byte *key, uint key_len, + byte *buf); int fetch_next(NdbScanOperation* op); int next_result(byte *buf); int define_read_attrs(byte* buf, NdbOperation* op); @@ -637,6 +649,11 @@ private: // NdbRecAttr has no reference to blob 
typedef union { const NdbRecAttr *rec; NdbBlob *blob; void *ptr; } NdbValue; NdbValue m_value[NDB_MAX_ATTRIBUTES_IN_TABLE]; + partition_info *m_part_info; + byte *m_rec0; + Field **m_part_field_array; + bool m_use_partition_function; + bool m_sorted; bool m_use_write; bool m_ignore_dup_key; bool m_primary_key_update; diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc new file mode 100644 index 00000000000..30dd79551b4 --- /dev/null +++ b/sql/ha_partition.cc @@ -0,0 +1,3162 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + This handler was developed by Mikael Ronström for version 5.1 of MySQL. + It is an abstraction layer on top of other handlers such as MyISAM, + InnoDB, Federated, Berkeley DB and so forth. Partitioned tables can also + be handled by a storage engine. The current example of this is NDB + Cluster that has internally handled partitioning. This have benefits in + that many loops needed in the partition handler can be avoided. + + Partitioning has an inherent feature which in some cases is positive and + in some cases is negative. It splits the data into chunks. This makes + the data more manageable, queries can easily be parallelised towards the + parts and indexes are split such that there are less levels in the + index trees. 
The inherent disadvantage is that to use a split index + one has to scan all index parts which is ok for large queries but for + small queries it can be a disadvantage. + + Partitioning lays the foundation for more manageable databases that are + extremely large. It does also lay the foundation for more parallelism + in the execution of queries. This functionality will grow with later + versions of MySQL. + + You can enable it in your build by doing the following during your build + process: + ./configure --with-partition + + The partition is setup to use table locks. It implements a partition "SHARE" + that is inserted into a hash by table name. You can use this to store + information of state that any partition handler object will be able to see + if it is using the same table. + + Please read the object definition in ha_partition.h before reading the rest + of this file. +*/ + +#ifdef __GNUC__ +#pragma implementation // gcc: Class implementation +#endif + +#include <mysql_priv.h> + +#ifdef HAVE_PARTITION_DB +#include "ha_partition.h" + +static const char *ha_par_ext= ".par"; +#ifdef NOT_USED +static int free_share(PARTITION_SHARE * share); +static PARTITION_SHARE *get_share(const char *table_name, TABLE * table); +#endif + +/**************************************************************************** + MODULE create/delete handler object +****************************************************************************/ + +ha_partition::ha_partition(TABLE *table) + :handler(table), m_part_info(NULL), m_create_handler(FALSE), + m_is_sub_partitioned(0) +{ + DBUG_ENTER("ha_partition::ha_partition(table)"); + init_handler_variables(); + if (table) + { + if (table->s->part_info) + { + m_part_info= table->s->part_info; + m_is_sub_partitioned= is_sub_partitioned(m_part_info); + } + } + DBUG_VOID_RETURN; +} + + +ha_partition::ha_partition(partition_info *part_info) + :handler(NULL), m_part_info(part_info), m_create_handler(TRUE), +
m_is_sub_partitioned(is_sub_partitioned(m_part_info)) + +{ + DBUG_ENTER("ha_partition::ha_partition(part_info)"); + init_handler_variables(); + DBUG_ASSERT(m_part_info); + DBUG_VOID_RETURN; +} + + +void ha_partition::init_handler_variables() +{ + active_index= MAX_KEY; + m_file_buffer= NULL; + m_name_buffer_ptr= NULL; + m_engine_array= NULL; + m_file= NULL; + m_tot_parts= 0; + m_has_transactions= 0; + m_pkey_is_clustered= 0; + m_lock_type= F_UNLCK; + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_scan_value= 2; + m_ref_length= 0; + m_part_spec.end_part= NO_CURRENT_PART_ID; + m_index_scan_type= partition_no_index_scan; + m_start_key.key= NULL; + m_start_key.length= 0; + m_myisam= FALSE; + m_innodb= FALSE; + m_extra_cache= FALSE; + m_extra_cache_size= 0; + m_table_flags= HA_FILE_BASED | HA_REC_NOT_IN_SEQ; + m_low_byte_first= 1; + m_part_field_array= NULL; + m_ordered_rec_buffer= NULL; + m_top_entry= NO_CURRENT_PART_ID; + m_rec_length= 0; + m_last_part= 0; + m_rec0= 0; + m_curr_key_info= 0; + +#ifdef DONT_HAVE_TO_BE_INITALIZED + m_start_key.flag= 0; + m_ordered= TRUE; +#endif +} + + +ha_partition::~ha_partition() +{ + DBUG_ENTER("ha_partition::~ha_partition()"); + if (m_file != NULL) + { + uint i; + for (i= 0; i < m_tot_parts; i++) + delete m_file[i]; + } + my_free((char*) m_ordered_rec_buffer, MYF(MY_ALLOW_ZERO_PTR)); + + clear_handler_file(); + DBUG_VOID_RETURN; +} + + +/* + The partition handler is only a layer on top of other engines. Thus it + can't really perform anything without the underlying handlers. Thus we + add this method as part of the allocation of a handler object. + + 1) Allocation of underlying handlers + If we have access to the partition info we will allocate one handler + instance for each partition. + 2) Allocation without partition info + The cases where we don't have access to this information is when called + in preparation for delete_table and rename_table and in that case we + only need to set HA_FILE_BASED. 
In that case we will use the .par file + that contains information about the partitions and their engines and + the names of each partition. + 3) Table flags initialisation + We need also to set table flags for the partition handler. This is not + static since it depends on what storage engines are used as underlying + handlers. + The table flags is set in this routine to simulate the behaviour of a + normal storage engine + The flag HA_FILE_BASED will be set independent of the underlying handlers + 4) Index flags initialisation + When knowledge exists on the indexes it is also possible to initialise the + index flags. Again the index flags must be initialised by using the under- + lying handlers since this is storage engine dependent. + The flag HA_READ_ORDER will be reset for the time being to indicate no + ordered output is available from partition handler indexes. Later a merge + sort will be performed using the underlying handlers. + 5) primary_key_is_clustered, has_transactions and low_byte_first is + calculated here. +*/ + +int ha_partition::ha_initialise() +{ + handler **file_array, *file; + DBUG_ENTER("ha_partition::ha_initialise"); + + if (m_part_info) + { + m_tot_parts= get_tot_partitions(m_part_info); + DBUG_ASSERT(m_tot_parts > 0); + if (m_create_handler) + { + if (new_handlers_from_part_info()) + DBUG_RETURN(1); + } + else if (get_from_handler_file(table->s->path)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 129); //Temporary fix TODO print_error + DBUG_RETURN(1); + } + /* + We create all underlying table handlers here. We only do it if we have + access to the partition info. We do it in this special method to be + able to report allocation errors. + */ + /* + Set up table_flags, low_byte_first, primary_key_is_clustered and + has_transactions since they are called often in all kinds of places, + other parameters are calculated on demand.
+ HA_FILE_BASED is always set for partition handler since we use a + special file for handling names of partitions, engine types. + HA_CAN_GEOMETRY, HA_CAN_FULLTEXT, HA_CAN_SQL_HANDLER, + HA_CAN_INSERT_DELAYED is disabled until further investigated. + */ + m_table_flags= m_file[0]->table_flags(); + m_low_byte_first= m_file[0]->low_byte_first(); + m_has_transactions= TRUE; + m_pkey_is_clustered= TRUE; + file_array= m_file; + do + { + file= *file_array; + if (m_low_byte_first != file->low_byte_first()) + { + // Cannot have handlers with different endian + my_error(ER_MIX_HANDLER_ERROR, MYF(0)); + DBUG_RETURN(1); + } + if (!file->has_transactions()) + m_has_transactions= FALSE; + if (!file->primary_key_is_clustered()) + m_pkey_is_clustered= FALSE; + m_table_flags&= file->table_flags(); + } while (*(++file_array)); + /* + The mask must be the OR of the flags to clear; the AND of distinct + flag bits is zero and would leave m_table_flags unchanged. + */ + m_table_flags&= ~(HA_CAN_GEOMETRY | HA_CAN_FULLTEXT | + HA_CAN_SQL_HANDLER | HA_CAN_INSERT_DELAYED); + /* + TODO RONM: + Make sure that the tree works without partition defined, compiles + and goes through mysql-test-run. + */ + } + m_table_flags|= HA_FILE_BASED | HA_REC_NOT_IN_SEQ; + DBUG_RETURN(0); +} + +/**************************************************************************** + MODULE meta data changes +****************************************************************************/ +/* + Used to delete a table. By the time delete_table() has been called all + opened references to this table will have been closed (and your globally + shared references released). The variable name will just be the name of + the table. You will need to remove any files you have created at this + point. + + If you do not implement this, the default delete_table() is called from + handler.cc and it will delete all files with the file extensions returned + by bas_ext(). + + Called from handler.cc by delete_table and ha_create_table(). Only used + during create if the table_flag HA_DROP_BEFORE_CREATE was specified for + the storage engine.
+*/ + +int ha_partition::delete_table(const char *name) +{ + int error; + DBUG_ENTER("ha_partition::delete_table"); + if ((error= del_ren_cre_table(name, NULL, NULL, NULL))) + DBUG_RETURN(error); + DBUG_RETURN(handler::delete_table(name)); +} + + +/* + Renames a table from one name to another from alter table call. + + If you do not implement this, the default rename_table() is called from + handler.cc and it will rename all files with the file extensions returned + by bas_ext(). + + Called from sql_table.cc by mysql_rename_table(). +*/ + +int ha_partition::rename_table(const char *from, const char *to) +{ + int error; + DBUG_ENTER("ha_partition::rename_table"); + if ((error= del_ren_cre_table(from, to, NULL, NULL))) + DBUG_RETURN(error); + DBUG_RETURN(handler::rename_table(from, to)); +} + + +/* + create_handler_files is called to create any handler specific files + before opening the file with openfrm to later call ::create on the + file object. + In the partition handler this is used to store the names of partitions + and types of engines in the partitions. +*/ + +int ha_partition::create_handler_files(const char *name) +{ + DBUG_ENTER("ha_partition::create_handler_files()"); + if (create_handler_file(name)) + { + my_error(ER_CANT_CREATE_HANDLER_FILE, MYF(0)); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/* + create() is called to create a table. The variable name will have the name + of the table. When create() is called you do not need to worry about + opening the table. Also, the FRM file will have already been created so + adjusting create_info will not do you any good. You can overwrite the frm + file at this point if you wish to change the table definition, but there + are no methods currently provided for doing that. + + Called from handler.cc by ha_create_table().
+*/ + +int ha_partition::create(const char *name, TABLE *table_arg, + HA_CREATE_INFO *create_info) +{ + char t_name[FN_REFLEN]; + DBUG_ENTER("ha_partition::create"); + + strmov(t_name, name); + *fn_ext(t_name)= 0; + if (del_ren_cre_table(t_name, NULL, table_arg, create_info)) + { + handler::delete_table(t_name); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +void ha_partition::update_create_info(HA_CREATE_INFO *create_info) +{ + return; +} + + +char *ha_partition::update_table_comment(const char *comment) +{ + return (char*) comment; // Nothing to change +} + + +/* + This method is used to calculate the partition name, service routine to + the del_ren_cre_table method. +*/ + +static void create_partition_name(char *out, const char *in1, const char *in2) +{ + strxmov(out, in1, "_", in2, NullS); +} + + +/* + Common routine to handle delete_table and rename_table. + The routine uses the partition handler file to get the + names of the partition instances. Both these routines + are called after creating the handler without table + object and thus the file is needed to discover the + names of the partitions and the underlying storage engines. 
+*/ + +uint ha_partition::del_ren_cre_table(const char *from, + const char *to, + TABLE *table_arg, + HA_CREATE_INFO *create_info) +{ + int save_error= 0, error; + char from_buff[FN_REFLEN], to_buff[FN_REFLEN]; + char *name_buffer_ptr; + uint i; + handler **file; + DBUG_ENTER("del_ren_cre_table()"); + + if (get_from_handler_file(from)) + DBUG_RETURN(TRUE); + DBUG_ASSERT(m_file_buffer); + name_buffer_ptr= m_name_buffer_ptr; + file= m_file; + i= 0; + do + { + create_partition_name(from_buff, from, name_buffer_ptr); + if (to != NULL) + { // Rename branch + create_partition_name(to_buff, to, name_buffer_ptr); + error= (*file)->rename_table((const char*) from_buff, + (const char*) to_buff); + } + else if (table_arg == NULL) // delete branch + error= (*file)->delete_table((const char*) from_buff); + else + { + set_up_table_before_create(table_arg, create_info, i); + error= (*file)->create(from_buff, table_arg, create_info); + } + name_buffer_ptr= strend(name_buffer_ptr) + 1; + if (error) + save_error= error; + i++; + } while (*(++file)); + DBUG_RETURN(save_error); +} + + +partition_element *ha_partition::find_partition_element(uint part_id) +{ + uint i; + uint curr_part_id= 0; + List_iterator_fast < partition_element > part_it(m_part_info->partitions); + + for (i= 0; i < m_part_info->no_parts; i++) + { + partition_element *part_elem; + part_elem= part_it++; + if (m_is_sub_partitioned) + { + uint j; + List_iterator_fast <partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < m_part_info->no_subparts; j++) + { + part_elem= sub_it++; + if (part_id == curr_part_id++) + return part_elem; + } + } + else if (part_id == curr_part_id++) + return part_elem; + } + DBUG_ASSERT(0); + current_thd->fatal_error(); // Abort + return NULL; +} + + +void ha_partition::set_up_table_before_create(TABLE *table, + HA_CREATE_INFO *info, + uint part_id) +{ + /* + Set up + 1) Comment on partition + 2) MAX_ROWS, MIN_ROWS on partition + 3) Index file name on partition + 4) Data file 
name on partition + */ + partition_element *part_elem= find_partition_element(part_id); + if (!part_elem) + return; // Fatal error + table->s->max_rows= part_elem->part_max_rows; + table->s->min_rows= part_elem->part_min_rows; + info->index_file_name= part_elem->index_file_name; + info->data_file_name= part_elem->data_file_name; +} + + +/* + Routine used to add two names with '_' in between then. Service routine + to create_handler_file + Include the NULL in the count of characters since it is needed as separator + between the partition names. +*/ + +static uint name_add(char *dest, const char *first_name, const char *sec_name) +{ + return (uint) (strxmov(dest, first_name, "_", sec_name, NullS) -dest) + 1; +} + + +/* + Method used to create handler file with names of partitions, their + engine types and the number of partitions. +*/ + +bool ha_partition::create_handler_file(const char *name) +{ + partition_element *part_elem, *subpart_elem; + uint i, j, part_name_len, subpart_name_len; + uint tot_partition_words, tot_name_len; + uint tot_len_words, tot_len_byte, chksum, tot_name_words; + char *name_buffer_ptr; + uchar *file_buffer, *engine_array; + bool result= TRUE; + char file_name[FN_REFLEN]; + File file; + List_iterator_fast < partition_element > part_it(m_part_info->partitions); + DBUG_ENTER("create_handler_file"); + + DBUG_PRINT("info", ("table name = %s", name)); + tot_name_len= 0; + for (i= 0; i < m_part_info->no_parts; i++) + { + part_elem= part_it++; + part_name_len= strlen(part_elem->partition_name); + if (!m_is_sub_partitioned) + tot_name_len+= part_name_len + 1; + else + { + List_iterator_fast<partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < m_part_info->no_subparts; j++) + { + subpart_elem= sub_it++; + subpart_name_len= strlen(subpart_elem->partition_name); + tot_name_len+= part_name_len + subpart_name_len + 2; + } + } + } + /* + File format: + Length in words 4 byte + Checksum 4 byte + Total number of partitions 4 byte + Array of 
engine types n * 4 bytes where + n = (m_tot_parts + 3)/4 + Length of name part in bytes 4 bytes + Name part m * 4 bytes where + m = ((length_name_part + 3)/4)*4 + + All padding bytes are zeroed + */ + tot_partition_words= (m_tot_parts + 3) / 4; + tot_name_words= (tot_name_len + 3) / 4; + tot_len_words= 4 + tot_partition_words + tot_name_words; + tot_len_byte= 4 * tot_len_words; + if (!(file_buffer= (uchar *) my_malloc(tot_len_byte, MYF(MY_ZEROFILL)))) + DBUG_RETURN(TRUE); + engine_array= (file_buffer + 12); + name_buffer_ptr= (char*) (file_buffer + ((4 + tot_partition_words) * 4)); + part_it.rewind(); + for (i= 0; i < m_part_info->no_parts; i++) + { + part_elem= part_it++; + if (!m_is_sub_partitioned) + { + name_buffer_ptr= strmov(name_buffer_ptr, part_elem->partition_name)+1; + *engine_array= (uchar) part_elem->engine_type; + DBUG_PRINT("info", ("engine: %u", *engine_array)); + engine_array++; + } + else + { + List_iterator_fast<partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < m_part_info->no_subparts; j++) + { + subpart_elem= sub_it++; + name_buffer_ptr+= name_add(name_buffer_ptr, + part_elem->partition_name, + subpart_elem->partition_name); + *engine_array= (uchar) part_elem->engine_type; + engine_array++; + } + } + } + chksum= 0; + int4store(file_buffer, tot_len_words); + int4store(file_buffer + 8, m_tot_parts); + int4store(file_buffer + 12 + (tot_partition_words * 4), tot_name_len); + for (i= 0; i < tot_len_words; i++) + chksum^= uint4korr(file_buffer + 4 * i); + int4store(file_buffer + 4, chksum); + /* + Remove .frm extension and replace with .par + Create and write and close file + to be used at open, delete_table and rename_table + */ + fn_format(file_name, name, "", ".par", MYF(MY_REPLACE_EXT)); + if ((file= my_create(file_name, CREATE_MODE, O_RDWR | O_TRUNC, + MYF(MY_WME))) >= 0) + { + result= my_write(file, (byte *) file_buffer, tot_len_byte, + MYF(MY_WME | MY_NABP)); + VOID(my_close(file, MYF(0))); + } + else + result= TRUE; + 
my_free((char*) file_buffer, MYF(0)); + DBUG_RETURN(result); +} + + +void ha_partition::clear_handler_file() +{ + my_free((char*) m_file_buffer, MYF(MY_ALLOW_ZERO_PTR)); + m_file_buffer= NULL; + m_name_buffer_ptr= NULL; + m_engine_array= NULL; +} + + +bool ha_partition::create_handlers() +{ + uint i; + uint alloc_len= (m_tot_parts + 1) * sizeof(handler*); + DBUG_ENTER("create_handlers"); + + if (!(m_file= (handler **) sql_alloc(alloc_len))) + DBUG_RETURN(TRUE); + bzero(m_file, alloc_len); + for (i= 0; i < m_tot_parts; i++) + { + if (!(m_file[i]= get_new_handler(table, (enum db_type) m_engine_array[i]))) + DBUG_RETURN(TRUE); + DBUG_PRINT("info", ("engine_type: %u", m_engine_array[i])); + } + m_file[m_tot_parts]= 0; + /* For the moment we only support partition over the same table engine */ + if (m_engine_array[0] == (uchar) DB_TYPE_MYISAM) + { + DBUG_PRINT("info", ("MyISAM")); + m_myisam= TRUE; + } + else if (m_engine_array[0] == (uchar) DB_TYPE_INNODB) + { + DBUG_PRINT("info", ("InnoDB")); + m_innodb= TRUE; + } + DBUG_RETURN(FALSE); +} + + +bool ha_partition::new_handlers_from_part_info() +{ + uint i, j; + partition_element *part_elem; + uint alloc_len= (m_tot_parts + 1) * sizeof(handler*); + List_iterator_fast <partition_element> part_it(m_part_info->partitions); + DBUG_ENTER("ha_partition::new_handlers_from_part_info"); + + if (!(m_file= (handler **) sql_alloc(alloc_len))) + goto error; + bzero(m_file, alloc_len); + DBUG_ASSERT(m_part_info->no_parts > 0); + + i= 0; + /* + Don't know the size of the underlying storage engine, invent a number of + bytes allocated for error message if allocation fails + */ + alloc_len= 128; + do + { + part_elem= part_it++; + if (!(m_file[i]= get_new_handler(table, part_elem->engine_type))) + goto error; + DBUG_PRINT("info", ("engine_type: %u", (uint) part_elem->engine_type)); + if (m_is_sub_partitioned) + { + for (j= 0; j < m_part_info->no_subparts; j++) + { + if (!(m_file[i]= get_new_handler(table, part_elem->engine_type))) + goto 
error; + DBUG_PRINT("info", ("engine_type: %u", (uint) part_elem->engine_type)); + } + } + } while (++i < m_part_info->no_parts); + if (part_elem->engine_type == DB_TYPE_MYISAM) + { + DBUG_PRINT("info", ("MyISAM")); + m_myisam= TRUE; + } + DBUG_RETURN(FALSE); +error: + my_error(ER_OUTOFMEMORY, MYF(0), alloc_len); + DBUG_RETURN(TRUE); +} + + +/* + Open handler file to get partition names, engine types and number of + partitions. +*/ + +bool ha_partition::get_from_handler_file(const char *name) +{ + char buff[FN_REFLEN], *address_tot_name_len; + File file; + char *file_buffer, *name_buffer_ptr; + uchar *engine_array; + uint i, len_bytes, len_words, tot_partition_words, tot_name_words, chksum; + DBUG_ENTER("ha_partition::get_from_handler_file"); + DBUG_PRINT("enter", ("table name: '%s'", name)); + + if (m_file_buffer) + DBUG_RETURN(FALSE); + fn_format(buff, name, "", ha_par_ext, MYF(0)); + + /* Following could be done with my_stat to read in whole file */ + if ((file= my_open(buff, O_RDONLY | O_SHARE, MYF(0))) < 0) + DBUG_RETURN(TRUE); + if (my_read(file, (byte *) & buff[0], 8, MYF(MY_NABP))) + goto err1; + len_words= uint4korr(buff); + len_bytes= 4 * len_words; + if (!(file_buffer= my_malloc(len_bytes, MYF(0)))) + goto err1; + VOID(my_seek(file, 0, MY_SEEK_SET, MYF(0))); + if (my_read(file, (byte *) file_buffer, len_bytes, MYF(MY_NABP))) + goto err2; + + chksum= 0; + for (i= 0; i < len_words; i++) + chksum ^= uint4korr((file_buffer) + 4 * i); + if (chksum) + goto err2; + m_tot_parts= uint4korr((file_buffer) + 8); + tot_partition_words= (m_tot_parts + 3) / 4; + engine_array= (uchar *) ((file_buffer) + 12); + address_tot_name_len= file_buffer + 12 + 4 * tot_partition_words; + tot_name_words= (uint4korr(address_tot_name_len) + 3) / 4; + if (len_words != (tot_partition_words + tot_name_words + 4)) + goto err2; + name_buffer_ptr= file_buffer + 16 + 4 * tot_partition_words; + VOID(my_close(file, MYF(0))); + m_file_buffer= file_buffer; // Will be freed in 
clear_handler_file() + m_name_buffer_ptr= name_buffer_ptr; + m_engine_array= engine_array; + if (!m_file && create_handlers()) + { + clear_handler_file(); + DBUG_RETURN(TRUE); + } + DBUG_RETURN(FALSE); + +err2: + my_free(file_buffer, MYF(0)); +err1: + VOID(my_close(file, MYF(0))); + DBUG_RETURN(TRUE); +} + +/**************************************************************************** + MODULE open/close object +****************************************************************************/ +/* + Used for opening tables. The name will be the name of the file. + A table is opened when it needs to be opened. For instance + when a request comes in for a select on the table (tables are not + open and closed for each request, they are cached). + + Called from handler.cc by handler::ha_open(). The server opens all tables + by calling ha_open() which then calls the handler specific open(). +*/ + +int ha_partition::open(const char *name, int mode, uint test_if_locked) +{ + int error; + char name_buff[FN_REFLEN]; + char *name_buffer_ptr= m_name_buffer_ptr; + handler **file; + uint alloc_len; + DBUG_ENTER("ha_partition::open"); + + ref_length= 0; + m_part_field_array= m_part_info->full_part_field_array; + if (get_from_handler_file(name)) + DBUG_RETURN(1); + m_start_key.length= 0; + m_rec0= table->record[0]; + m_rec_length= table->s->reclength; + alloc_len= m_tot_parts * (m_rec_length + PARTITION_BYTES_IN_POS); + alloc_len+= table->s->max_key_length; + if (!m_ordered_rec_buffer) + { + if (!(m_ordered_rec_buffer= my_malloc(alloc_len, MYF(MY_WME)))) + { + DBUG_RETURN(1); + } + { + /* + We set-up one record per partition and each record has 2 bytes in + front where the partition id is written. This is used by ordered + index_read. + We also set-up a reference to the first record for temporary use in + setting up the scan. 
+ */ + char *ptr= m_ordered_rec_buffer; + uint i= 0; + do + { + int2store(ptr, i); + ptr+= m_rec_length + PARTITION_BYTES_IN_POS; + } while (++i < m_tot_parts); + m_start_key.key= ptr; + } + } + file= m_file; + do + { + create_partition_name(name_buff, name, name_buffer_ptr); + if ((error= (*file)->ha_open((const char*) name_buff, mode, + test_if_locked))) + goto err_handler; + name_buffer_ptr+= strlen(name_buffer_ptr) + 1; + set_if_bigger(ref_length, ((*file)->ref_length)); + } while (*(++file)); + /* + Add 2 bytes for partition id in position ref length. + ref_length=max_in_all_partitions(ref_length) + PARTITION_BYTES_IN_POS + */ + ref_length+= PARTITION_BYTES_IN_POS; + m_ref_length= ref_length; + /* + Release buffer read from .par file. It will not be reused again after + being opened once. + */ + clear_handler_file(); + /* + Initialise priority queue, initialised to reading forward. + */ + if ((error= init_queue(&queue, m_tot_parts, (uint) PARTITION_BYTES_IN_POS, + 0, key_rec_cmp, (void*)this))) + goto err_handler; + /* + Some handlers update statistics as part of the open call. This will in + some cases corrupt the statistics of the partition handler and thus + to ensure we have correct statistics we call info from open after + calling open on all individual handlers. + */ + info(HA_STATUS_VARIABLE | HA_STATUS_CONST); + DBUG_RETURN(0); + +err_handler: + while (file-- != m_file) + (*file)->close(); + DBUG_RETURN(error); +} + +/* + Closes a table. We call the free_share() function to free any resources + that we have allocated in the "shared" structure. + + Called from sql_base.cc, sql_select.cc, and table.cc. + In sql_select.cc it is only used to close up temporary tables or during + the process where a temporary table is converted over to being a + myisam table. + For sql_base.cc look at close_data_tables(). 
+*/ + +int ha_partition::close(void) +{ + handler **file; + DBUG_ENTER("ha_partition::close"); + file= m_file; + do + { + (*file)->close(); + } while (*(++file)); + DBUG_RETURN(0); +} + + +/**************************************************************************** + MODULE start/end statement +****************************************************************************/ +/* + A number of methods to define various constants for the handler. In + the case of the partition handler we need to use some max and min + of the underlying handlers in most cases. +*/ + +/* + First you should go read the section "locking functions for mysql" in + lock.cc to understand this. + This create a lock on the table. If you are implementing a storage engine + that can handle transactions look at ha_berkely.cc to see how you will + want to goo about doing this. Otherwise you should consider calling + flock() here. + Originally this method was used to set locks on file level to enable + several MySQL Servers to work on the same data. For transactional + engines it has been "abused" to also mean start and end of statements + to enable proper rollback of statements and transactions. When LOCK + TABLES has been issued the start_stmt method takes over the role of + indicating start of statement but in this case there is no end of + statement indicator(?). + + Called from lock.cc by lock_external() and unlock_external(). Also called + from sql_table.cc by copy_data_between_tables(). +*/ + +int ha_partition::external_lock(THD *thd, int lock_type) +{ + uint error; + handler **file; + DBUG_ENTER("ha_partition::external_lock"); + file= m_file; + do + { + if ((error= (*file)->external_lock(thd, lock_type))) + { + if (lock_type != F_UNLCK) + goto err_handler; + } + } while (*(++file)); + m_lock_type= lock_type; // For the future (2009?) 
+ DBUG_RETURN(0); + +err_handler: + while (file-- != m_file) + (*file)->external_lock(thd, F_UNLCK); + DBUG_RETURN(error); +} + + +/* + The idea with handler::store_lock() is the following: + + The statement decided which locks we should need for the table + for updates/deletes/inserts we get WRITE locks, for SELECT... we get + read locks. + + Before adding the lock into the table lock handler (see thr_lock.c) + mysqld calls store lock with the requested locks. Store lock can now + modify a write lock to a read lock (or some other lock), ignore the + lock (if we don't want to use MySQL table locks at all) or add locks + for many tables (like we do when we are using a MERGE handler). + + Berkeley DB for partition changes all WRITE locks to TL_WRITE_ALLOW_WRITE + (which signals that we are doing WRITES, but we are still allowing other + reader's and writer's. + + When releasing locks, store_lock() are also called. In this case one + usually doesn't have to do anything. + + store_lock is called when holding a global mutex to ensure that only + one thread at a time changes the locking information of tables. + + In some exceptional cases MySQL may send a request for a TL_IGNORE; + This means that we are requesting the same lock as last time and this + should also be ignored. (This may happen when someone does a flush + table when we have opened a part of the tables, in which case mysqld + closes and reopens the tables and tries to get the same locks at last + time). In the future we will probably try to remove this. + + Called from lock.cc by get_lock_data(). 
+*/ + +THR_LOCK_DATA **ha_partition::store_lock(THD *thd, + THR_LOCK_DATA **to, + enum thr_lock_type lock_type) +{ + handler **file; + DBUG_ENTER("ha_partition::store_lock"); + file= m_file; + do + { + to= (*file)->store_lock(thd, to, lock_type); + } while (*(++file)); + DBUG_RETURN(to); +} + + +int ha_partition::start_stmt(THD *thd) +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::start_stmt"); + file= m_file; + do + { + if ((error= (*file)->start_stmt(thd))) + break; + } while (*(++file)); + DBUG_RETURN(error); +} + + +/* + Returns the number of store locks needed in call to store lock. + We return number of partitions since we call store_lock on each + underlying handler. Assists the above functions in allocating + sufficient space for lock structures. +*/ + +uint ha_partition::lock_count() const +{ + DBUG_ENTER("ha_partition::lock_count"); + DBUG_RETURN(m_tot_parts); +} + + +/* + Record currently processed was not in the result set of the statement + and is thus unlocked. Used for UPDATE and DELETE queries. +*/ + +void ha_partition::unlock_row() +{ + m_file[m_last_part]->unlock_row(); + return; +} + + +/**************************************************************************** + MODULE change record +****************************************************************************/ + +/* + write_row() inserts a row. buf() is a byte array of data, normally record[0]. + + You can use the field information to extract the data from the native byte + array type. + + Example of this would be: + for (Field **field=table->field ; *field ; field++) + { + ... + } + + See ha_tina.cc for an partition of extracting all of the data as strings. + ha_berekly.cc has an partition of how to store it intact by "packing" it + for ha_berkeley's own native storage type. + + See the note for update_row() on auto_increments and timestamps. This + case also applied to write_row(). 
+ + Called from item_sum.cc, item_sum.cc, sql_acl.cc, sql_insert.cc, + sql_insert.cc, sql_select.cc, sql_table.cc, sql_udf.cc, and sql_update.cc. + + ADDITIONAL INFO: + + Most handlers set timestamp when calling write row if any such fields + exists. Since we are calling an underlying handler we assume the´ + underlying handler will assume this responsibility. + + Underlying handlers will also call update_auto_increment to calculate + the new auto increment value. We will catch the call to + get_auto_increment and ensure this increment value is maintained by + only one of the underlying handlers. +*/ + +int ha_partition::write_row(byte * buf) +{ + uint32 part_id; + int error; +#ifdef NOT_NEEDED + byte *rec0= m_rec0; +#endif + DBUG_ENTER("ha_partition::write_row"); + DBUG_ASSERT(buf == m_rec0); + +#ifdef NOT_NEEDED + if (likely(buf == rec0)) +#endif + error= m_part_info->get_partition_id(m_part_info, &part_id); +#ifdef NOT_NEEDED + else + { + set_field_ptr(m_part_field_array, buf, rec0); + error= m_part_info->get_partition_id(m_part_info, &part_id); + set_field_ptr(m_part_field_array, rec0, buf); + } +#endif + if (unlikely(error)) + DBUG_RETURN(error); + m_last_part= part_id; + DBUG_PRINT("info", ("Insert in partition %d", part_id)); + DBUG_RETURN(m_file[part_id]->write_row(buf)); +} + + +/* + Yes, update_row() does what you expect, it updates a row. old_data will + have the previous row record in it, while new_data will have the newest + data in it. + Keep in mind that the server can do updates based on ordering if an + ORDER BY clause was used. Consecutive ordering is not guarenteed. + + Currently new_data will not have an updated auto_increament record, or + and updated timestamp field. 
You can do these for partition by doing these: + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) + table->timestamp_field->set_time(); + if (table->next_number_field && record == table->record[0]) + update_auto_increment(); + + Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc. + new_data is always record[0] + old_data is normally record[1] but may be anything + +*/ + +int ha_partition::update_row(const byte *old_data, byte *new_data) +{ + uint32 new_part_id, old_part_id; + int error; + DBUG_ENTER("ha_partition::update_row"); + + if ((error= get_parts_for_update(old_data, new_data, table->record[0], + m_part_info, &old_part_id, &new_part_id))) + { + DBUG_RETURN(error); + } + + /* + TODO: + set_internal_auto_increment= + max(set_internal_auto_increment, new_data->auto_increment) + */ + m_last_part= new_part_id; + if (new_part_id == old_part_id) + { + DBUG_PRINT("info", ("Update in partition %d", new_part_id)); + DBUG_RETURN(m_file[new_part_id]->update_row(old_data, new_data)); + } + else + { + DBUG_PRINT("info", ("Update from partition %d to partition %d", + old_part_id, new_part_id)); + if ((error= m_file[new_part_id]->write_row(new_data))) + DBUG_RETURN(error); + if ((error= m_file[old_part_id]->delete_row(old_data))) + { +#ifdef IN_THE_FUTURE + (void) m_file[new_part_id]->delete_last_inserted_row(new_data); +#endif + DBUG_RETURN(error); + } + } + DBUG_RETURN(0); +} + + +/* + This will delete a row. buf will contain a copy of the row to be deleted. + The server will call this right after the current row has been read + (from either a previous rnd_xxx() or index_xxx() call). + If you keep a pointer to the last row or can access a primary key it will + make doing the deletion quite a bit easier. + Keep in mind that the server does no guarentee consecutive deletions. + ORDER BY clauses can be used. + + Called in sql_acl.cc and sql_udf.cc to manage internal table information. 
+ Called in sql_delete.cc, sql_insert.cc, and sql_select.cc. In sql_select + it is used for removing duplicates while in insert it is used for REPLACE + calls. + + buf is either record[0] or record[1] + +*/ + +int ha_partition::delete_row(const byte *buf) +{ + uint32 part_id; + int error; + DBUG_ENTER("ha_partition::delete_row"); + + if ((error= get_part_for_delete(buf, m_rec0, m_part_info, &part_id))) + { + DBUG_RETURN(error); + } + m_last_part= part_id; + DBUG_RETURN(m_file[part_id]->delete_row(buf)); +} + + +/* + Used to delete all rows in a table. Both for cases of truncate and + for cases where the optimizer realizes that all rows will be + removed as a result of a SQL statement. + + Called from item_sum.cc by Item_func_group_concat::clear(), + Item_sum_count_distinct::clear(), and Item_func_group_concat::clear(). + Called from sql_delete.cc by mysql_delete(). + Called from sql_select.cc by JOIN::reinit(). + Called from sql_union.cc by st_select_lex_unit::exec(). +*/ + +int ha_partition::delete_all_rows() +{ + int error; + handler **file; + DBUG_ENTER("ha_partition::delete_all_rows"); + file= m_file; + do + { + if ((error= (*file)->delete_all_rows())) + DBUG_RETURN(error); + } while (*(++file)); + DBUG_RETURN(0); +} + +/* + rows == 0 means we will probably insert many rows +*/ + +void ha_partition::start_bulk_insert(ha_rows rows) +{ + handler **file; + DBUG_ENTER("ha_partition::start_bulk_insert"); + if (!rows) + { + /* Avoid allocation big caches in all underlaying handlers */ + DBUG_VOID_RETURN; + } + rows= rows/m_tot_parts + 1; + file= m_file; + do + { + (*file)->start_bulk_insert(rows); + } while (*(++file)); + DBUG_VOID_RETURN; +} + + +int ha_partition::end_bulk_insert() +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::end_bulk_insert"); + + file= m_file; + do + { + int tmp; + /* We want to execute end_bulk_insert() on all handlers */ + if ((tmp= (*file)->end_bulk_insert())) + error= tmp; + } while (*(++file)); + DBUG_RETURN(error); +} + 
+/**************************************************************************** + MODULE full table scan +****************************************************************************/ +/* + Initialize engine for random reads + + SYNOPSIS + ha_partition::rnd_init() + scan 0 Initialize for random reads through rnd_pos() + 1 Initialize for random scan through rnd_next() + + NOTES + rnd_init() is called when the server wants the storage engine to do a + table scan or when the server wants to access data through rnd_pos. + + When scan is used we will scan one handler partition at a time. + When preparing for rnd_pos we will init all handler partitions. + No extra cache handling is needed when scannning is not performed. + + Before initialising we will call rnd_end to ensure that we clean up from + any previous incarnation of a table scan. + Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc, + sql_table.cc, and sql_update.cc. +*/ + +int ha_partition::rnd_init(bool scan) +{ + int error; + handler **file; + DBUG_ENTER("ha_partition::rnd_init"); + + include_partition_fields_in_used_fields(); + if (scan) + { + /* + rnd_end() is needed for partitioning to reset internal data if scan + is already in use + */ + + rnd_end(); + if (partition_scan_set_up(rec_buf(0), FALSE)) + { + /* + The set of partitions to scan is empty. We return success and return + end of file on first rnd_next. + */ + DBUG_RETURN(0); + } + /* + We will use the partition set in our scan, using the start and stop + partition and checking each scan before start dependent on bittfields. 
+ */ + late_extra_cache(m_part_spec.start_part); + DBUG_PRINT("info", ("rnd_init on partition %d",m_part_spec.start_part)); + error= m_file[m_part_spec.start_part]->ha_rnd_init(1); + m_scan_value= 1; // Scan active + if (error) + m_scan_value= 2; // No scan active + DBUG_RETURN(error); + } + file= m_file; + do + { + if ((error= (*file)->ha_rnd_init(0))) + goto err; + } while (*(++file)); + m_scan_value= 0; + DBUG_RETURN(0); + +err: + while (file--) + (*file)->ha_rnd_end(); + DBUG_RETURN(error); +} + + +int ha_partition::rnd_end() +{ + handler **file; + DBUG_ENTER("ha_partition::rnd_end"); + switch (m_scan_value) { + case 2: // Error + break; + case 1: // Table scan + if (m_part_spec.start_part != NO_CURRENT_PART_ID) + { + late_extra_no_cache(m_part_spec.start_part); + m_file[m_part_spec.start_part]->ha_rnd_end(); + } + break; + case 0: + file= m_file; + do + { + (*file)->ha_rnd_end(); + } while (*(++file)); + break; + } + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_scan_value= 2; + DBUG_RETURN(0); +} + + +/* + read next row during full table scan (scan in random row order) + + SYNOPSIS + rnd_next() + buf buffer that should be filled with data + + This is called for each row of the table scan. When you run out of records + you should return HA_ERR_END_OF_FILE. + The Field structure for the table is the key to getting data into buf + in a manner that will allow the server to understand it. + + Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc, + sql_table.cc, and sql_update.cc. +*/ + +int ha_partition::rnd_next(byte *buf) +{ + DBUG_ASSERT(m_scan_value); + uint part_id= m_part_spec.start_part; // Cache of this variable + handler *file= m_file[part_id]; + int result= HA_ERR_END_OF_FILE; + DBUG_ENTER("ha_partition::rnd_next"); + + DBUG_ASSERT(m_scan_value == 1); + + if (part_id > m_part_spec.end_part) + { + /* + The original set of partitions to scan was empty and thus we report + the result here. 
+ */ + goto end; + } + while (TRUE) + { + if ((result= file->rnd_next(buf))) + { + if (result == HA_ERR_RECORD_DELETED) + continue; // Probably MyISAM + + if (result != HA_ERR_END_OF_FILE) + break; // Return error + + /* End current partition */ + late_extra_no_cache(part_id); + DBUG_PRINT("info", ("rnd_end on partition %d", part_id)); + if ((result= file->ha_rnd_end())) + break; + /* Shift to next partition */ + if (++part_id > m_part_spec.end_part) + { + result= HA_ERR_END_OF_FILE; + break; + } + file= m_file[part_id]; + DBUG_PRINT("info", ("rnd_init on partition %d", part_id)); + if ((result= file->ha_rnd_init(1))) + break; + late_extra_cache(part_id); + } + else + { + m_part_spec.start_part= part_id; + m_last_part= part_id; + table->status= 0; + DBUG_RETURN(0); + } + } + +end: + m_part_spec.start_part= NO_CURRENT_PART_ID; + table->status= STATUS_NOT_FOUND; + DBUG_RETURN(result); +} + + +inline void store_part_id_in_pos(byte *pos, uint part_id) +{ + int2store(pos, part_id); +} + +inline uint get_part_id_from_pos(const byte *pos) +{ + return uint2korr(pos); +} + +/* + position() is called after each call to rnd_next() if the data needs + to be ordered. You can do something like the following to store + the position: + ha_store_ptr(ref, ref_length, current_position); + + The server uses ref to store data. ref_length in the above case is + the size needed to store current_position. ref is just a byte array + that the server will maintain. If you are using offsets to mark rows, then + current_position should be the offset. If it is a primary key like in + BDB, then it needs to be a primary key. + + Called from filesort.cc, sql_select.cc, sql_delete.cc and sql_update.cc. 
+*/ + +void ha_partition::position(const byte *record) +{ + handler *file= m_file[m_last_part]; + DBUG_ENTER("ha_partition::position"); + file->position(record); + store_part_id_in_pos(ref, m_last_part); + memcpy((ref + PARTITION_BYTES_IN_POS), file->ref, + (ref_length - PARTITION_BYTES_IN_POS)); + +#ifdef SUPPORTING_PARTITION_OVER_DIFFERENT_ENGINES +#ifdef HAVE_purify + bzero(ref + PARTITION_BYTES_IN_POS + ref_length, max_ref_length-ref_length); +#endif /* HAVE_purify */ +#endif + DBUG_VOID_RETURN; +} + +/* + This is like rnd_next, but you are given a position to use + to determine the row. The position will be of the type that you stored in + ref. You can use ha_get_ptr(pos,ref_length) to retrieve whatever key + or position you saved when position() was called. + Called from filesort.cc records.cc sql_insert.cc sql_select.cc + sql_update.cc. +*/ + +int ha_partition::rnd_pos(byte * buf, byte *pos) +{ + uint part_id; + handler *file; + DBUG_ENTER("ha_partition::rnd_pos"); + + part_id= get_part_id_from_pos((const byte *) pos); + DBUG_ASSERT(part_id < m_tot_parts); + file= m_file[part_id]; + m_last_part= part_id; + DBUG_RETURN(file->rnd_pos(buf, (pos + PARTITION_BYTES_IN_POS))); +} + + +/**************************************************************************** + MODULE index scan +****************************************************************************/ +/* + Positions an index cursor to the index specified in the handle. Fetches the + row if available. If the key value is null, begin at the first key of the + index. + + There are loads of optimisations possible here for the partition handler. + The same optimisations can also be checked for full table scan although + only through conditions and not from index ranges. + Phase one optimisations: + Check if the fields of the partition function are bound. If so only use + the single partition it becomes bound to. 
+ Phase two optimisations: + If it can be deducted through range or list partitioning that only a + subset of the partitions are used, then only use those partitions. +*/ + +/* + index_init is always called before starting index scans (except when + starting through index_read_idx and using read_range variants). +*/ + +int ha_partition::index_init(uint inx, bool sorted) +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::index_init"); + + active_index= inx; + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_start_key.length= 0; + m_ordered= sorted; + m_curr_key_info= table->key_info+inx; + include_partition_fields_in_used_fields(); + + file= m_file; + do + { + /* TODO RONM: Change to index_init() when code is stable */ + if ((error= (*file)->ha_index_init(inx, sorted))) + { + DBUG_ASSERT(0); // Should never happen + break; + } + } while (*(++file)); + DBUG_RETURN(error); +} + + +/* + index_end is called at the end of an index scan to clean up any + things needed to clean up. +*/ + +int ha_partition::index_end() +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::index_end"); + + active_index= MAX_KEY; + m_part_spec.start_part= NO_CURRENT_PART_ID; + file= m_file; + do + { + int tmp; + /* We want to execute index_end() on all handlers */ + /* TODO RONM: Change to index_end() when code is stable */ + if ((tmp= (*file)->ha_index_end())) + error= tmp; + } while (*(++file)); + DBUG_RETURN(error); +} + + +/* + index_read starts a new index scan using a start key. The MySQL Server + will check the end key on its own. Thus to function properly the + partitioned handler need to ensure that it delivers records in the sort + order of the MySQL Server. + index_read can be restarted without calling index_end on the previous + index scan and without calling index_init. In this case the index_read + is on the same index as the previous index_scan. This is particularly + used in conjuntion with multi read ranges. 
+*/ + +int ha_partition::index_read(byte * buf, const byte * key, + uint key_len, enum ha_rkey_function find_flag) +{ + DBUG_ENTER("ha_partition::index_read"); + end_range= 0; + DBUG_RETURN(common_index_read(buf, key, key_len, find_flag)); +} + + +int ha_partition::common_index_read(byte *buf, const byte *key, uint key_len, + enum ha_rkey_function find_flag) +{ + int error; + DBUG_ENTER("ha_partition::common_index_read"); + + memcpy((void*)m_start_key.key, key, key_len); + m_start_key.length= key_len; + m_start_key.flag= find_flag; + m_index_scan_type= partition_index_read; + + if ((error= partition_scan_set_up(buf, TRUE))) + { + DBUG_RETURN(error); + } + + if (!m_ordered_scan_ongoing || + (find_flag == HA_READ_KEY_EXACT && + (key_len >= m_curr_key_info->key_length || + key_len == 0))) + { + /* + We use unordered index scan either when read_range is used and flag + is set to not use ordered or when an exact key is used and in this + case all records will be sorted equal and thus the sort order of the + resulting records doesn't matter. + We also use an unordered index scan when the number of partitions to + scan is only one. + The unordered index scan will use the partition set created. + Need to set unordered scan ongoing since we can come here even when + it isn't set. + */ + m_ordered_scan_ongoing= FALSE; + error= handle_unordered_scan_next_partition(buf); + } + else + { + /* + In all other cases we will use the ordered index scan. This will use + the partition set created by the get_partition_set method. + */ + error= handle_ordered_index_scan(buf); + } + DBUG_RETURN(error); +} + + +/* + index_first() asks for the first key in the index. + This is similar to index_read except that there is no start key since + the scan starts from the leftmost entry and proceeds forward with + index_next. + + Called from opt_range.cc, opt_sum.cc, sql_handler.cc, + and sql_select.cc. 
+*/ + +int ha_partition::index_first(byte * buf) +{ + DBUG_ENTER("ha_partition::index_first"); + end_range= 0; + m_index_scan_type= partition_index_first; + DBUG_RETURN(common_first_last(buf)); +} + + +/* + index_last() asks for the last key in the index. + This is similar to index_read except that there is no start key since + the scan starts from the rightmost entry and proceeds forward with + index_prev. + + Called from opt_range.cc, opt_sum.cc, sql_handler.cc, + and sql_select.cc. +*/ + +int ha_partition::index_last(byte * buf) +{ + DBUG_ENTER("ha_partition::index_last"); + m_index_scan_type= partition_index_last; + DBUG_RETURN(common_first_last(buf)); +} + +int ha_partition::common_first_last(byte *buf) +{ + int error; + if ((error= partition_scan_set_up(buf, FALSE))) + return error; + if (!m_ordered_scan_ongoing) + return handle_unordered_scan_next_partition(buf); + return handle_ordered_index_scan(buf); +} + +/* + Positions an index cursor to the index specified in key. Fetches the + row if any. This is only used to read whole keys. + TODO: Optimise this code to avoid index_init and index_end +*/ + +int ha_partition::index_read_idx(byte * buf, uint index, const byte * key, + uint key_len, + enum ha_rkey_function find_flag) +{ + int res; + DBUG_ENTER("ha_partition::index_read_idx"); + index_init(index, 0); + res= index_read(buf, key, key_len, find_flag); + index_end(); + DBUG_RETURN(res); +} + +/* + This is used in join_read_last_key to optimise away an ORDER BY. + Can only be used on indexes supporting HA_READ_ORDER +*/ + +int ha_partition::index_read_last(byte *buf, const byte *key, uint keylen) +{ + DBUG_ENTER("ha_partition::index_read_last"); + m_ordered= TRUE; // Safety measure + DBUG_RETURN(index_read(buf, key, keylen, HA_READ_PREFIX_LAST)); +} + + +/* + Used to read forward through the index. 
+*/ + +int ha_partition::index_next(byte * buf) +{ + DBUG_ENTER("ha_partition::index_next"); + /* + TODO(low priority): + If we want partition to work with the HANDLER commands, we + must be able to do index_last() -> index_prev() -> index_next() + */ + DBUG_ASSERT(m_index_scan_type != partition_index_last); + if (!m_ordered_scan_ongoing) + { + DBUG_RETURN(handle_unordered_next(buf, FALSE)); + } + DBUG_RETURN(handle_ordered_next(buf, FALSE)); +} + + +/* + This routine is used to read the next but only if the key is the same + as supplied in the call. +*/ + +int ha_partition::index_next_same(byte *buf, const byte *key, uint keylen) +{ + DBUG_ENTER("ha_partition::index_next_same"); + DBUG_ASSERT(keylen == m_start_key.length); + DBUG_ASSERT(m_index_scan_type != partition_index_last); + if (!m_ordered_scan_ongoing) + DBUG_RETURN(handle_unordered_next(buf, TRUE)); + DBUG_RETURN(handle_ordered_next(buf, TRUE)); +} + +/* + Used to read backwards through the index. +*/ + +int ha_partition::index_prev(byte * buf) +{ + DBUG_ENTER("ha_partition::index_prev"); + /* TODO: read comment in index_next */ + DBUG_ASSERT(m_index_scan_type != partition_index_first); + DBUG_RETURN(handle_ordered_prev(buf)); +} + + +/* + We reimplement read_range_first since we don't want the compare_key + check at the end. This is already performed in the partition handler. + read_range_next is very much different due to that we need to scan + all underlying handlers. +*/ + +int ha_partition::read_range_first(const key_range *start_key, + const key_range *end_key, + bool eq_range_arg, bool sorted) +{ + int error; + DBUG_ENTER("ha_partition::read_range_first"); + m_ordered= sorted; + eq_range= eq_range_arg; + end_range= 0; + if (end_key) + { + end_range= &save_end_range; + save_end_range= *end_key; + key_compare_result_on_equal= + ((end_key->flag == HA_READ_BEFORE_KEY) ? 1 : + (end_key->flag == HA_READ_AFTER_KEY) ? 
-1 : 0); + } + range_key_part= m_curr_key_info->key_part; + + if (!start_key) // Read first record + { + m_index_scan_type= partition_index_first; + error= common_first_last(m_rec0); + } + else + { + error= common_index_read(m_rec0, + start_key->key, + start_key->length, start_key->flag); + } + DBUG_RETURN(error); +} + + +int ha_partition::read_range_next() +{ + DBUG_ENTER("ha_partition::read_range_next"); + if (m_ordered) + { + DBUG_RETURN(handler::read_range_next()); + } + DBUG_RETURN(handle_unordered_next(m_rec0, eq_range)); +} + + +int ha_partition::partition_scan_set_up(byte * buf, bool idx_read_flag) +{ + DBUG_ENTER("ha_partition::partition_scan_set_up"); + + if (idx_read_flag) + get_partition_set(table,buf,active_index,&m_start_key,&m_part_spec); + else + get_partition_set(table, buf, MAX_KEY, 0, &m_part_spec); + if (m_part_spec.start_part > m_part_spec.end_part) + { + /* + We discovered a partition set but the set was empty so we report + key not found. + */ + DBUG_PRINT("info", ("scan with no partition to scan")); + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + if (m_part_spec.start_part == m_part_spec.end_part) + { + /* + We discovered a single partition to scan, this never needs to be + performed using the ordered index scan. + */ + DBUG_PRINT("info", ("index scan using the single partition %d", + m_part_spec.start_part)); + m_ordered_scan_ongoing= FALSE; + } + else + { + /* + Set m_ordered_scan_ongoing according how the scan should be done + */ + m_ordered_scan_ongoing= m_ordered; + } + DBUG_ASSERT(m_part_spec.start_part < m_tot_parts && + m_part_spec.end_part < m_tot_parts); + DBUG_RETURN(0); +} + + +/**************************************************************************** + Unordered Index Scan Routines +****************************************************************************/ +/* + These routines are used to scan partitions without considering order. + This is performed in two situations. 
+ 1) In read_multi_range this is the normal case + 2) When performing any type of index_read, index_first, index_last where + all fields in the partition function is bound. In this case the index + scan is performed on only one partition and thus it isn't necessary to + perform any sort. +*/ + +int ha_partition::handle_unordered_next(byte *buf, bool next_same) +{ + handler *file= file= m_file[m_part_spec.start_part]; + int error; + DBUG_ENTER("ha_partition::handle_unordered_next"); + + /* + We should consider if this should be split into two functions as + next_same is alwas a local constant + */ + if (next_same) + { + if (!(error= file->index_next_same(buf, m_start_key.key, + m_start_key.length))) + { + m_last_part= m_part_spec.start_part; + DBUG_RETURN(0); + } + } + else if (!(error= file->index_next(buf))) + { + if (compare_key(end_range) <= 0) + { + m_last_part= m_part_spec.start_part; + DBUG_RETURN(0); // Row was in range + } + error= HA_ERR_END_OF_FILE; + } + + if (error == HA_ERR_END_OF_FILE) + { + m_part_spec.start_part++; // Start using next part + error= handle_unordered_scan_next_partition(buf); + } + DBUG_RETURN(error); +} + + +/* + This routine is used to start the index scan on the next partition. + Both initial start and after completing scan on one partition. 
+*/ + +int ha_partition::handle_unordered_scan_next_partition(byte * buf) +{ + uint i; + DBUG_ENTER("ha_partition::handle_unordered_scan_next_partition"); + + for (i= m_part_spec.start_part; i <= m_part_spec.end_part; i++) + { + int error; + handler *file= m_file[i]; + + m_part_spec.start_part= i; + switch (m_index_scan_type) { + case partition_index_read: + DBUG_PRINT("info", ("index_read on partition %d", i)); + error= file->index_read(buf, m_start_key.key, + m_start_key.length, + m_start_key.flag); + break; + case partition_index_first: + DBUG_PRINT("info", ("index_first on partition %d", i)); + error= file->index_first(buf); + break; + default: + DBUG_ASSERT(FALSE); + DBUG_RETURN(1); + } + if (!error) + { + if (compare_key(end_range) <= 0) + { + m_last_part= i; + DBUG_RETURN(0); + } + error= HA_ERR_END_OF_FILE; + } + if ((error != HA_ERR_END_OF_FILE) && (error != HA_ERR_KEY_NOT_FOUND)) + DBUG_RETURN(error); + DBUG_PRINT("info", ("HA_ERR_END_OF_FILE on partition %d", i)); + } + m_part_spec.start_part= NO_CURRENT_PART_ID; + DBUG_RETURN(HA_ERR_END_OF_FILE); +} + + +/* + This part contains the logic to handle index scans that require ordered + output. This includes all except those started by read_range_first with + the flag ordered set to FALSE. Thus most direct index_read and all + index_first and index_last. + + We implement ordering by keeping one record plus a key buffer for each + partition. Every time a new entry is requested we will fetch a new + entry from the partition that is currently not filled with an entry. + Then the entry is put into its proper sort position. + + Returning a record is done by getting the top record, copying the + record to the request buffer and setting the partition as empty on + entries. 
+*/ + +int ha_partition::handle_ordered_index_scan(byte *buf) +{ + uint i, j= 0; + bool found= FALSE; + bool reverse_order= FALSE; + DBUG_ENTER("ha_partition::handle_ordered_index_scan"); + + m_top_entry= NO_CURRENT_PART_ID; + queue_remove_all(&queue); + for (i= m_part_spec.start_part; i <= m_part_spec.end_part; i++) + { + int error; + byte *rec_buf_ptr= rec_buf(i); + handler *file= m_file[i]; + + switch (m_index_scan_type) { + case partition_index_read: + error= file->index_read(rec_buf_ptr, + m_start_key.key, + m_start_key.length, + m_start_key.flag); + reverse_order= FALSE; + break; + case partition_index_first: + error= file->index_first(rec_buf_ptr); + reverse_order= FALSE; + break; + case partition_index_last: + error= file->index_last(rec_buf_ptr); + reverse_order= TRUE; + break; + default: + DBUG_ASSERT(FALSE); + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + if (!error) + { + found= TRUE; + /* + Initialise queue without order first, simply insert + */ + queue_element(&queue, j++)= (byte*)queue_buf(i); + } + else if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) + { + DBUG_RETURN(error); + } + } + if (found) + { + /* + We found at least one partition with data, now sort all entries and + after that read the first entry and copy it to the buffer to return in. 
+ */ + queue_set_max_at_top(&queue, reverse_order); + queue_set_cmp_arg(&queue, (void*)m_curr_key_info); + queue.elements= j; + queue_fix(&queue); + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry)); + DBUG_RETURN(0); + } + DBUG_RETURN(HA_ERR_END_OF_FILE); +} + + +void ha_partition::return_top_record(byte *buf) +{ + uint part_id; + byte *key_buffer= queue_top(&queue); + byte *rec_buffer= key_buffer + PARTITION_BYTES_IN_POS; + part_id= uint2korr(key_buffer); + memcpy(buf, rec_buffer, m_rec_length); + m_last_part= part_id; + m_top_entry= part_id; +} + + +int ha_partition::handle_ordered_next(byte *buf, bool next_same) +{ + int error; + uint part_id= m_top_entry; + handler *file= m_file[part_id]; + DBUG_ENTER("ha_partition::handle_ordered_next"); + + if (!next_same) + error= file->index_next(rec_buf(part_id)); + else + error= file->index_next_same(rec_buf(part_id), m_start_key.key, + m_start_key.length); + if (error) + { + if (error == HA_ERR_END_OF_FILE) + { + /* Return next buffered row */ + queue_remove(&queue, (uint) 0); + if (queue.elements) + { + DBUG_PRINT("info", ("Record returned from partition %u (2)", + m_top_entry)); + return_top_record(buf); + error= 0; + } + } + DBUG_RETURN(error); + } + queue_replaced(&queue); + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry)); + DBUG_RETURN(0); +} + + +int ha_partition::handle_ordered_prev(byte *buf) +{ + int error; + uint part_id= m_top_entry; + handler *file= m_file[part_id]; + DBUG_ENTER("ha_partition::handle_ordered_prev"); + if ((error= file->index_prev(rec_buf(part_id)))) + { + if (error == HA_ERR_END_OF_FILE) + { + queue_remove(&queue, (uint) 0); + if (queue.elements) + { + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %d (2)", + m_top_entry)); + error= 0; + } + } + DBUG_RETURN(error); + } + queue_replaced(&queue); + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from 
partition %d", m_top_entry)); + DBUG_RETURN(0); +} + + +void ha_partition::include_partition_fields_in_used_fields() +{ + DBUG_ENTER("ha_partition::include_partition_fields_in_used_fields"); + Field **ptr= m_part_field_array; + do + { + ha_set_bit_in_read_set((*ptr)->fieldnr); + } while (*(++ptr)); + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + MODULE information calls +****************************************************************************/ + +/* + These are all first approximations of the extra, info, scan_time + and read_time calls +*/ + +/* + ::info() is used to return information to the optimizer. + Currently this table handler doesn't implement most of the fields + really needed. SHOW also makes use of this data + Another note, if your handler doesn't proved exact record count, + you will probably want to have the following in your code: + if (records < 2) + records = 2; + The reason is that the server will optimize for cases of only a single + record. If in a table scan you don't know the number of records + it will probably be better to set records to two so you can return + as many records as you need. + + Along with records a few more variables you may wish to set are: + records + deleted + data_file_length + index_file_length + delete_length + check_time + Take a look at the public variables in handler.h for more information. + + Called in: + filesort.cc + ha_heap.cc + item_sum.cc + opt_sum.cc + sql_delete.cc + sql_delete.cc + sql_derived.cc + sql_select.cc + sql_select.cc + sql_select.cc + sql_select.cc + sql_select.cc + sql_show.cc + sql_show.cc + sql_show.cc + sql_show.cc + sql_table.cc + sql_union.cc + sql_update.cc + + Some flags that are not implemented + HA_STATUS_POS: + This parameter is never used from the MySQL Server. It is checked in a + place in MyISAM so could potentially be used by MyISAM specific programs. + HA_STATUS_NO_LOCK: + This is declared and often used. 
It's only used by MyISAM. + It means that MySQL doesn't need the absolute latest statistics + information. This may save the handler from doing internal locks while + retrieving statistics data. +*/ + +void ha_partition::info(uint flag) +{ + handler *file, **file_array; + DBUG_ENTER("ha_partition:info"); + + if (flag & HA_STATUS_AUTO) + { + DBUG_PRINT("info", ("HA_STATUS_AUTO")); + /* + The auto increment value is only maintained by the first handler + so we will only call this. + */ + m_file[0]->info(HA_STATUS_AUTO); + } + if (flag & HA_STATUS_VARIABLE) + { + DBUG_PRINT("info", ("HA_STATUS_VARIABLE")); + /* + Calculates statistical variables + records: Estimate of number records in table + We report sum (always at least 2) + deleted: Estimate of number holes in the table due to + deletes + We report sum + data_file_length: Length of data file, in principle bytes in table + We report sum + index_file_length: Length of index file, in principle bytes in + indexes in the table + We report sum + mean_record_length:Mean record length in the table + We calculate this + check_time: Time of last check (only applicable to MyISAM) + We report last time of all underlying handlers + */ + records= 0; + deleted= 0; + data_file_length= 0; + index_file_length= 0; + check_time= 0; + file_array= m_file; + do + { + file= *file_array; + file->info(HA_STATUS_VARIABLE); + records+= file->records; + deleted+= file->deleted; + data_file_length+= file->data_file_length; + index_file_length+= file->index_file_length; + if (file->check_time > check_time) + check_time= file->check_time; + } while (*(++file_array)); + if (records < 2) + records= 2; + mean_rec_length= (ulong) (data_file_length / records); + } + if (flag & HA_STATUS_CONST) + { + DBUG_PRINT("info", ("HA_STATUS_CONST")); + /* + Recalculate loads of constant variables. MyISAM also sets things + directly on the table share object. 
+ + Check whether this should be fixed since handlers should not + change things directly on the table object. + + Monty comment: This should NOT be changed! It's the handlers + responsibility to correct table->s->keys_xxxx information if keys + have been disabled. + + The most important parameters set here is records per key on + all indexes. block_size and primar key ref_length. + + For each index there is an array of rec_per_key. + As an example if we have an index with three attributes a,b and c + we will have an array of 3 rec_per_key. + rec_per_key[0] is an estimate of number of records divided by + number of unique values of the field a. + rec_per_key[1] is an estimate of the number of records divided + by the number of unique combinations of the fields a and b. + rec_per_key[2] is an estimate of the number of records divided + by the number of unique combinations of the fields a,b and c. + + Many handlers only set the value of rec_per_key when all fields + are bound (rec_per_key[2] in the example above). + + If the handler doesn't support statistics, it should set all of the + above to 0. + + We will allow the first handler to set the rec_per_key and use + this as an estimate on the total table. + + max_data_file_length: Maximum data file length + We ignore it, is only used in + SHOW TABLE STATUS + max_index_file_length: Maximum index file length + We ignore it since it is never used + block_size: Block size used + We set it to the value of the first handler + sortkey: Never used at any place so ignored + ref_length: We set this to the value calculated + and stored in local object + raid_type: Set by first handler (MyISAM) + raid_chunks: Set by first handler (MyISAM) + raid_chunksize: Set by first handler (MyISAM) + create_time: Creation time of table + Set by first handler + + So we calculate these constants by using the variables on the first + handler. 
+ */ + + file= m_file[0]; + file->info(HA_STATUS_CONST); + create_time= file->create_time; + raid_type= file->raid_type; + raid_chunks= file->raid_chunks; + raid_chunksize= file->raid_chunksize; + ref_length= m_ref_length; + } + if (flag & HA_STATUS_ERRKEY) + { + handler *file= m_file[m_last_part]; + DBUG_PRINT("info", ("info: HA_STATUS_ERRKEY")); + /* + This flag is used to get index number of the unique index that + reported duplicate key + We will report the errkey on the last handler used and ignore the rest + */ + file->info(HA_STATUS_ERRKEY); + if (file->errkey != (uint) -1) + errkey= file->errkey; + } + if (flag & HA_STATUS_TIME) + { + DBUG_PRINT("info", ("info: HA_STATUS_TIME")); + /* + This flag is used to set the latest update time of the table. + Used by SHOW commands + We will report the maximum of these times + */ + update_time= 0; + file_array= m_file; + do + { + file= *file_array; + file->info(HA_STATUS_TIME); + if (file->update_time > update_time) + update_time= file->update_time; + } while (*(++file_array)); + } + DBUG_VOID_RETURN; +} + + +/* + extra() is called whenever the server wishes to send a hint to + the storage engine. The MyISAM engine implements the most hints. + + We divide the parameters into the following categories: + 1) Parameters used by most handlers + 2) Parameters used by some non-MyISAM handlers + 3) Parameters used only by MyISAM + 4) Parameters only used by temporary tables for query processing + 5) Parameters only used by MyISAM internally + 6) Parameters not used at all + + The partition handler need to handle category 1), 2) and 3). + + 1) Parameters used by most handlers + ----------------------------------- + HA_EXTRA_RESET: + This option is used by most handlers and it resets the handler state + to the same state as after an open call. This includes releasing + any READ CACHE or WRITE CACHE or other internal buffer used. + + It is called from the reset method in the handler interface. 
There are + three instances where this is called. + 1) After completing an INSERT ... SELECT ... query the handler for the + table inserted into is reset + 2) It is called from close_thread_table which in turn is called from + close_thread_tables except in the case where the tables are locked + in which case ha_commit_stmt is called instead. + It is only called from here if flush_version hasn't changed and the + table is not an old table when calling close_thread_table. + close_thread_tables is called from many places as a general clean up + function after completing a query. + 3) It is called when deleting the QUICK_RANGE_SELECT object if the + QUICK_RANGE_SELECT object had its own handler object. It is called + immediately before close of this local handler object. + HA_EXTRA_KEYREAD: + HA_EXTRA_NO_KEYREAD: + These parameters are used to provide an optimisation hint to the handler. + If HA_EXTRA_KEYREAD is set it is enough to read the index fields, for + many handlers this means that the index-only scans can be used and it + is not necessary to use the real records to satisfy this part of the + query. Index-only scans are a very important optimisation for disk-based + indexes. For main-memory indexes most indexes contain a reference to the + record and thus KEYREAD only says that it is enough to read key fields. + HA_EXTRA_NO_KEYREAD disables this for the handler, also HA_EXTRA_RESET + will disable this option. + The handler will set HA_KEYREAD_ONLY in its table flags to indicate this + feature is supported. + HA_EXTRA_FLUSH: + Indication to flush tables to disk, called at close_thread_table to + ensure disk based tables are flushed at end of query execution. + + 2) Parameters used by some non-MyISAM handlers + ---------------------------------------------- + HA_EXTRA_RETRIEVE_ALL_COLS: + Many handlers have implemented optimisations to avoid fetching all + fields when retrieving data.
In certain situations all fields need + to be retrieved even though the query_id is not set on all field + objects. + + It is called from copy_data_between_tables where all fields are + copied without setting query_id before calling the handlers. + It is called from UPDATE statements when the fields of the index + used are updated or ORDER BY is used with UPDATE. + And finally when calculating checksum of a table using the CHECKSUM + command. + HA_EXTRA_RETRIEVE_PRIMARY_KEY: + In some situations it is mandatory to retrieve primary key fields + independent of the query id's. This extra flag specifies that fetch + of primary key fields is mandatory. + HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + This is a strictly InnoDB feature that is more or less undocumented. + When it is activated InnoDB copies field by field from its fetch + cache instead of all fields in one memcpy. Have no idea what the + purpose of this is. + Cut from include/my_base.h: + When using HA_EXTRA_KEYREAD, overwrite only key member fields and keep + other fields intact. When this is off (by default) InnoDB will use memcpy + to overwrite entire row. + HA_EXTRA_IGNORE_DUP_KEY: + HA_EXTRA_NO_IGNORE_DUP_KEY: + Informs the handler that we will not stop the transaction if we get + duplicate key errors during insert/update. + Always called in pair, triggered by INSERT IGNORE and other similar + SQL constructs. + Not used by MyISAM. + + 3) Parameters used only by MyISAM + --------------------------------- + HA_EXTRA_NORMAL: + Only used in MyISAM to reset quick mode, not implemented by any other + handler. Quick mode is also reset in MyISAM by HA_EXTRA_RESET. + + It is called after completing a successful DELETE query if the QUICK + option is set. + + HA_EXTRA_QUICK: + When the user does DELETE QUICK FROM table where-clause; this extra + option is called before the delete query is performed and + HA_EXTRA_NORMAL is called after the delete query is completed.
+ Temporary tables used internally in MySQL always set this option + + The meaning of quick mode is that when deleting in a B-tree no merging + of leaves is performed. This is a common method and many large DBMS's + actually only support this quick mode since it is very difficult to + merge leaves in a tree used by many threads concurrently. + + HA_EXTRA_CACHE: + This flag is usually set with extra_opt along with a cache size. + The size of this buffer is set by the user variable + record_buffer_size. The value of this cache size is the amount of + data read from disk in each fetch when performing a table scan. + This means that before scanning a table it is normal to call + extra with HA_EXTRA_CACHE and when the scan is completed to call + HA_EXTRA_NO_CACHE to release the cache memory. + + Some special care is taken when using this extra parameter since there + could be a write ongoing on the table in the same statement. In this + case one has to take special care since there might be a WRITE CACHE as + well. HA_EXTRA_CACHE specifies using a READ CACHE and using + READ CACHE and WRITE CACHE at the same time is not possible. + + Only MyISAM currently uses this option. + + It is set when doing full table scans using rr_sequential and + reset when completing such a scan with end_read_record + (resetting means calling extra with HA_EXTRA_NO_CACHE). + + It is set in filesort.cc for MyISAM internal tables and it is set in + a multi-update where HA_EXTRA_CACHE is called on a temporary result + table and after that ha_rnd_init(0) on table to be updated + and immediately after that HA_EXTRA_NO_CACHE on table to be updated. + + Apart from that it is always used from init_read_record but not when + used from UPDATE statements. It is not used from DELETE statements + with ORDER BY and LIMIT but it is used in normal scan loop in DELETE + statements. The reason here is that DELETEs in MyISAM don't move + existing data rows.
+ + It is also set in copy_data_between_tables when scanning the old table + to copy over to the new table. + And it is set in join_init_read_record where quick objects are used + to perform a scan on the table. In this case the full table scan can + even be performed multiple times as part of the nested loop join. + + For purposes of the partition handler it is obviously necessary to have + special treatment of this extra call. If we would simply pass this + extra call down to each handler we would allocate + cache size * no of partitions amount of memory and this is not + necessary since we will only scan one partition at a time when doing + full table scans. + + Thus we treat it by first checking whether we have MyISAM handlers in + the table, if not we simply ignore the call and if we have we will + record the call but will not call any underlying handler yet. Then + when performing the sequential scan we will check this recorded value + and call extra_opt whenever we start scanning a new partition. + + monty: Needs to be fixed so that it's passed to all handlers when we + move to another partition during table scan. + + HA_EXTRA_NO_CACHE: + When performing a UNION SELECT HA_EXTRA_NO_CACHE is called from the + flush method in the select_union class. + It is used to some extent when insert delayed inserts. + See HA_EXTRA_RESET_STATE for use in conjunction with delete_all_rows(). + + It should be ok to call HA_EXTRA_NO_CACHE on all underlying handlers + if they are MyISAM handlers. Other handlers we can ignore the call + for. If no cache is in use they will quickly return after finding + this out. And we also ensure that all caches are disabled and no one + is left by mistake. + In the future this call will probably be deleted and we will instead call + ::reset(); + + HA_EXTRA_WRITE_CACHE: + See above, called from various places. It is mostly used when we + do INSERT ... SELECT + No special handling to save cache space is developed currently.
+ + HA_EXTRA_PREPARE_FOR_UPDATE: + This is called as part of a multi-table update. When the table to be + updated is also scanned then this informs MyISAM handler to drop any + caches if dynamic records are used (fixed size records do not care + about this call). We pass this along to all underlying MyISAM handlers + and ignore it for the rest. + + HA_EXTRA_PREPARE_FOR_DELETE: + Only used by MyISAM, called in preparation for a DROP TABLE. + It's used mostly by Windows that cannot handle dropping an open file. + On other platforms it has the same effect as HA_EXTRA_FORCE_REOPEN. + + HA_EXTRA_READCHECK: + HA_EXTRA_NO_READCHECK: + Only one call to HA_EXTRA_NO_READCHECK from ha_open where it says that + this is not needed in SQL. The reason for this call is that MyISAM sets + the READ_CHECK_USED in the open call so the call is needed for MyISAM + to reset this feature. + The idea with this parameter was to inform of doing/not doing a read + check before applying an update. Since SQL always performs a read before + applying the update No Read Check is needed in MyISAM as well. + + This is a cut from Docs/myisam.txt + Sometimes you might want to force an update without checking whether + another user has changed the record since you last read it. This is + somewhat dangerous, so it should ideally not be used. That can be + accomplished by wrapping the mi_update() call in two calls to mi_extra(), + using these functions: + HA_EXTRA_NO_READCHECK=5 No readcheck on update + HA_EXTRA_READCHECK=6 Use readcheck (def) + + HA_EXTRA_FORCE_REOPEN: + Only used by MyISAM, called when altering table, closing tables to + enforce a reopen of the table files. + + 4) Parameters only used by temporary tables for query processing + ---------------------------------------------------------------- + HA_EXTRA_RESET_STATE: + Same as HA_EXTRA_RESET except that buffers are not released. If there is + a READ CACHE it is reinit'ed. 
A cache is reinit'ed to restart reading + or to change type of cache between READ CACHE and WRITE CACHE. + + This extra function is always called immediately before calling + delete_all_rows on the handler for temporary tables. + There are cases however when HA_EXTRA_RESET_STATE isn't called in + a similar case for a temporary table in sql_union.cc and in two other + cases HA_EXTRA_NO_CACHE is called before and HA_EXTRA_WRITE_CACHE + called afterwards. + The case with HA_EXTRA_NO_CACHE and HA_EXTRA_WRITE_CACHE means + disable caching, delete all rows and enable WRITE CACHE. This is + used for temporary tables containing distinct sums and a + functional group. + + The only case that delete_all_rows is called on non-temporary tables + is in sql_delete.cc when DELETE FROM table; is called by a user. + In this case no special extra calls are performed before or after this + call. + + The partition handler should not need to bother about this one. It + should never be called. + + HA_EXTRA_NO_ROWS: + Don't insert rows indication to HEAP and MyISAM, only used by temporary + tables used in query processing. + Not handled by partition handler. + + 5) Parameters only used by MyISAM internally + -------------------------------------------- + HA_EXTRA_REINIT_CACHE: + This call reinitialises the READ CACHE described above if there is one + and otherwise the call is ignored. + + We can thus safely call it on all underlying handlers if they are + MyISAM handlers. It is however never called so we don't handle it at all. + HA_EXTRA_FLUSH_CACHE: + Flush WRITE CACHE in MyISAM. It is only called from one place in the code. + This is in sql_insert.cc where it is called if the table_flags doesn't + contain HA_DUPP_POS. The only handler having the HA_DUPP_POS set is the + MyISAM handler and so the only handler not receiving this call is MyISAM. + Thus in effect this call is called but never used.
Could be removed + from sql_insert.cc + HA_EXTRA_NO_USER_CHANGE: + Only used by MyISAM, never called. + Simulates lock_type as locked. + HA_EXTRA_WAIT_LOCK: + HA_EXTRA_WAIT_NOLOCK: + Only used by MyISAM, called from MyISAM handler but never from server + code on top of the handler. + Sets lock_wait on/off + HA_EXTRA_NO_KEYS: + Only used MyISAM, only used internally in MyISAM handler, never called + from server level. + HA_EXTRA_KEYREAD_CHANGE_POS: + HA_EXTRA_REMEMBER_POS: + HA_EXTRA_RESTORE_POS: + HA_EXTRA_PRELOAD_BUFFER_SIZE: + HA_EXTRA_CHANGE_KEY_TO_DUP: + HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + Only used by MyISAM, never called. + + 6) Parameters not used at all + ----------------------------- + HA_EXTRA_KEY_CACHE: + HA_EXTRA_NO_KEY_CACHE: + This parameters are no longer used and could be removed. +*/ + +int ha_partition::extra(enum ha_extra_function operation) +{ + DBUG_ENTER("ha_partition:extra"); + DBUG_PRINT("info", ("operation: %d", (int) operation)); + + switch (operation) { + /* Category 1), used by most handlers */ + case HA_EXTRA_KEYREAD: + case HA_EXTRA_NO_KEYREAD: + case HA_EXTRA_FLUSH: + DBUG_RETURN(loop_extra(operation)); + + /* Category 2), used by non-MyISAM handlers */ + case HA_EXTRA_IGNORE_DUP_KEY: + case HA_EXTRA_NO_IGNORE_DUP_KEY: + case HA_EXTRA_RETRIEVE_ALL_COLS: + case HA_EXTRA_RETRIEVE_PRIMARY_KEY: + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + { + if (!m_myisam) + DBUG_RETURN(loop_extra(operation)); + break; + } + + /* Category 3), used by MyISAM handlers */ + case HA_EXTRA_NORMAL: + case HA_EXTRA_QUICK: + case HA_EXTRA_NO_READCHECK: + case HA_EXTRA_PREPARE_FOR_UPDATE: + case HA_EXTRA_PREPARE_FOR_DELETE: + case HA_EXTRA_FORCE_REOPEN: + { + if (m_myisam) + DBUG_RETURN(loop_extra(operation)); + break; + } + case HA_EXTRA_CACHE: + { + prepare_extra_cache(0); + break; + } + case HA_EXTRA_NO_CACHE: + { + m_extra_cache= FALSE; + m_extra_cache_size= 0; + DBUG_RETURN(loop_extra(operation)); + } + default: + { + /* Temporary crash to discover what is wrong 
*/ + DBUG_ASSERT(0); + break; + } + } + DBUG_RETURN(0); +} + + +/* + This will in the future be called instead of extra(HA_EXTRA_RESET) as this + is such a common call +*/ + +int ha_partition::reset(void) +{ + int result= 0, tmp; + handler **file; + DBUG_ENTER("ha_partition::reset"); + file= m_file; + do + { + if ((tmp= (*file)->reset())) + result= tmp; + } while (*(++file)); + DBUG_RETURN(result); +} + + +int ha_partition::extra_opt(enum ha_extra_function operation, ulong cachesize) +{ + DBUG_ENTER("ha_partition::extra_opt()"); + DBUG_ASSERT(HA_EXTRA_CACHE == operation); + prepare_extra_cache(cachesize); + DBUG_RETURN(0); +} + + +void ha_partition::prepare_extra_cache(uint cachesize) +{ + DBUG_ENTER("ha_partition::prepare_extra_cache()"); + + m_extra_cache= TRUE; + m_extra_cache_size= cachesize; + if (m_part_spec.start_part != NO_CURRENT_PART_ID) + { + DBUG_ASSERT(m_part_spec.start_part == 0); + late_extra_cache(0); + } + DBUG_VOID_RETURN; +} + + +int ha_partition::loop_extra(enum ha_extra_function operation) +{ + int result= 0, tmp; + handler **file; + DBUG_ENTER("ha_partition::loop_extra()"); + for (file= m_file; *file; file++) + { + if ((tmp= (*file)->extra(operation))) + result= tmp; + } + DBUG_RETURN(result); +} + + +void ha_partition::late_extra_cache(uint partition_id) +{ + handler *file; + DBUG_ENTER("ha_partition::late_extra_cache"); + if (!m_extra_cache) + DBUG_VOID_RETURN; + file= m_file[partition_id]; + if (m_extra_cache_size == 0) + VOID(file->extra(HA_EXTRA_CACHE)); + else + VOID(file->extra_opt(HA_EXTRA_CACHE, m_extra_cache_size)); + DBUG_VOID_RETURN; +} + + +void ha_partition::late_extra_no_cache(uint partition_id) +{ + handler *file; + DBUG_ENTER("ha_partition::late_extra_no_cache"); + if (!m_extra_cache) + DBUG_VOID_RETURN; + file= m_file[partition_id]; + VOID(file->extra(HA_EXTRA_NO_CACHE)); + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + MODULE optimiser support 
+****************************************************************************/ + +const key_map *ha_partition::keys_to_use_for_scanning() +{ + DBUG_ENTER("ha_partition::keys_to_use_for_scanning"); + DBUG_RETURN(m_file[0]->keys_to_use_for_scanning()); +} + +double ha_partition::scan_time() +{ + double scan_time= 0; + handler **file; + DBUG_ENTER("ha_partition::scan_time"); + + for (file= m_file; *file; file++) + scan_time+= (*file)->scan_time(); + DBUG_RETURN(scan_time); +} + + +/* + This will be optimised later to include whether or not the index can + be used with partitioning. To achieve we need to add another parameter + that specifies how many of the index fields that are bound in the ranges. + Possibly added as a new call to handlers. +*/ + +double ha_partition::read_time(uint index, uint ranges, ha_rows rows) +{ + DBUG_ENTER("ha_partition::read_time"); + DBUG_RETURN(m_file[0]->read_time(index, ranges, rows)); +} + +/* + Given a starting key, and an ending key estimate the number of rows that + will exist between the two. end_key may be empty which in case determine + if start_key matches any rows. + + Called from opt_range.cc by check_quick_keys(). + + monty: MUST be called for each range and added. + Note that MySQL will assume that if this returns 0 there is no + matching rows for the range! 
+*/
+
+/*
+  Estimate the number of rows in a key range as the sum of the estimates
+  delivered by every partition.
+
+  HA_POS_ERROR from any partition is handed back unchanged, in the same
+  way as estimate_rows_upper_bound() does; it is a marker value and must
+  not be added into the running total.
+
+  The do/while loop assumes that m_file holds at least one handler and
+  that the array is terminated by a NULL entry.
+*/
+
+ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
+                                       key_range *max_key)
+{
+  ha_rows rows, in_range= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::records_in_range");
+
+  file= m_file;
+  do
+  {
+    rows= (*file)->records_in_range(inx, min_key, max_key);
+    if (rows == HA_POS_ERROR)
+      DBUG_RETURN(HA_POS_ERROR);
+    in_range+= rows;
+  } while (*(++file));
+  DBUG_RETURN(in_range);
+}
+
+
+/*
+  Upper bound of the number of rows in the table: the sum of the upper
+  bounds of all partitions, or HA_POS_ERROR as soon as one partition
+  reports HA_POS_ERROR.
+*/
+
+ha_rows ha_partition::estimate_rows_upper_bound()
+{
+  ha_rows rows, tot_rows= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
+
+  file= m_file;
+  do
+  {
+    rows= (*file)->estimate_rows_upper_bound();
+    if (rows == HA_POS_ERROR)
+      DBUG_RETURN(HA_POS_ERROR);
+    tot_rows+= rows;
+  } while (*(++file));
+  DBUG_RETURN(tot_rows);
+}
+
+
+/*
+  The answer is taken from the handler of the first partition only.
+*/
+
+uint8 ha_partition::table_cache_type()
+{
+  DBUG_ENTER("ha_partition::table_cache_type");
+  DBUG_RETURN(m_file[0]->table_cache_type());
+}
+
+
+/****************************************************************************
+                MODULE print messages
+****************************************************************************/
+
+/*
+  The index type is taken from the handler of the first partition only.
+*/
+
+const char *ha_partition::index_type(uint inx)
+{
+  DBUG_ENTER("ha_partition::index_type");
+  DBUG_RETURN(m_file[0]->index_type(inx));
+}
+
+
+/*
+  Error printing is currently forwarded to the handler of the first
+  partition (see the remarks inside the function).
+*/
+
+void ha_partition::print_error(int error, myf errflag)
+{
+  DBUG_ENTER("ha_partition::print_error");
+  /* Should probably look for my own errors first */
+  /* monty: needs to be called for the last used partition ! */
+  m_file[0]->print_error(error, errflag);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  The error message is currently fetched from the handler of the first
+  partition (see the remarks inside the function).
+*/
+
+bool ha_partition::get_error_message(int error, String *buf)
+{
+  DBUG_ENTER("ha_partition::get_error_message");
+  /* Should probably look for my own errors first */
+  /* monty: needs to be called for the last used partition ! */
+  DBUG_RETURN(m_file[0]->get_error_message(error, buf));
+}
+
+
+/****************************************************************************
+                MODULE handler characteristics
+****************************************************************************/
+/*
+  If frm_error() is called then we will use this to find out what file
+  extensions exist for the storage engine. This is also used by the default
+  rename_table and delete_table method in handler.cc.
+*/
+
+static const char *ha_partition_ext[]=
+{
+  ha_par_ext, NullS
+};
+
+const char **ha_partition::bas_ext() const
+{ return ha_partition_ext; }
+
+
+/*
+  Call the given const handler method on every partition and return the
+  smallest value delivered. Used for the max_supported_* limits below,
+  where the table as a whole can only support what every partition
+  supports.
+
+  Assumes m_file holds at least one handler and is NULL terminated.
+*/
+
+uint ha_partition::min_of_the_max_uint(uint (handler::*operator_func)(void) const) const
+{
+  handler **file;
+  uint min_of_the_max= ((*m_file)->*operator_func)();
+
+  for (file= m_file+1; *file; file++)
+  {
+    uint tmp= ((*file)->*operator_func)();
+    set_if_smaller(min_of_the_max, tmp);
+  }
+  return min_of_the_max;
+}
+
+
+uint ha_partition::max_supported_key_parts() const
+{
+  return min_of_the_max_uint(&handler::max_supported_key_parts);
+}
+
+
+uint ha_partition::max_supported_key_length() const
+{
+  return min_of_the_max_uint(&handler::max_supported_key_length);
+}
+
+
+uint ha_partition::max_supported_key_part_length() const
+{
+  return min_of_the_max_uint(&handler::max_supported_key_part_length);
+}
+
+
+uint ha_partition::max_supported_record_length() const
+{
+  return min_of_the_max_uint(&handler::max_supported_record_length);
+}
+
+
+uint ha_partition::max_supported_keys() const
+{
+  return min_of_the_max_uint(&handler::max_supported_keys);
+}
+
+
+/*
+  The extra record buffer length is the largest value needed by any of
+  the partitions.
+*/
+
+uint ha_partition::extra_rec_buf_length() const
+{
+  handler **file;
+  uint max= (*m_file)->extra_rec_buf_length();
+  for (file= m_file + 1; *file; file++)
+  {
+    /* Ask each handler once and remember the answer */
+    uint tmp= (*file)->extra_rec_buf_length();
+    if (max < tmp)
+      max= tmp;
+  }
+  return max;
+}
+
+
+/*
+  The minimum record length of the table is the largest minimum record
+  length reported by any of the partitions.
+*/
+
+uint ha_partition::min_record_length(uint options) const
+{
+  handler **file;
+  uint max= (*m_file)->min_record_length(options);
+  for (file= m_file + 1; *file; file++)
+  {
+    /* Ask each handler once and remember the answer */
+    uint tmp= (*file)->min_record_length(options);
+    if (max < tmp)
+      max= tmp;
+  }
+  return max;
+}
+
+
+/****************************************************************************
+                MODULE compare records
+****************************************************************************/
+/*
+  We get two references and need to check if those records are the same.
+  If they belong to different partitions we decide that they are not
+  the same record. Otherwise we use the particular handler to decide if
+  they are the same. Sort in partition id order if not equal.
+
+  The first PARTITION_BYTES_IN_POS bytes of a reference hold the partition
+  id, the rest is the reference of the underlying handler. When the
+  partition bytes differ, byte 1 is compared before byte 0 and the result
+  is -1 if ref1 holds the smaller value and +1 otherwise.
+
+  NOTE(review): the byte differences are computed directly on 'byte'
+  values; if 'byte' is a signed type, values above 127 will order as
+  negative numbers. Confirm against the definition of 'byte'.
+*/
+
+int ha_partition::cmp_ref(const byte *ref1, const byte *ref2)
+{
+  uint part_id;
+  my_ptrdiff_t diff1, diff2;
+  handler *file;
+  DBUG_ENTER("ha_partition::cmp_ref");
+  if ((ref1[0] == ref2[0]) && (ref1[1] == ref2[1]))
+  {
+    part_id= get_part_id_from_pos(ref1);
+    /* Check the partition id before it is used as an index into m_file */
+    DBUG_ASSERT(part_id < m_tot_parts);
+    file= m_file[part_id];
+    DBUG_RETURN(file->cmp_ref((ref1 + PARTITION_BYTES_IN_POS),
+                              (ref2 + PARTITION_BYTES_IN_POS)));
+  }
+  diff1= ref2[1] - ref1[1];
+  diff2= ref2[0] - ref1[0];
+  if (diff1 > 0)
+  {
+    DBUG_RETURN(-1);
+  }
+  if (diff1 < 0)
+  {
+    DBUG_RETURN(+1);
+  }
+  /* Byte 1 is equal, hence byte 0 differs and decides the order */
+  if (diff2 > 0)
+  {
+    DBUG_RETURN(-1);
+  }
+  DBUG_RETURN(+1);
+}
+
+
+/****************************************************************************
+                MODULE auto increment
+****************************************************************************/
+
+/*
+  Nothing is restored by the partition handler itself; the method only
+  enters and leaves the debug trace.
+*/
+
+void ha_partition::restore_auto_increment()
+{
+  DBUG_ENTER("ha_partition::restore_auto_increment");
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  This method is called by update_auto_increment which in turn is called
+  by the individual handlers as part of write_row. We will always let
+  the first handler keep track of the auto increment value for all
+  partitions.
+*/ + +ulonglong ha_partition::get_auto_increment() +{ + DBUG_ENTER("ha_partition::get_auto_increment"); + DBUG_RETURN(m_file[0]->get_auto_increment()); +} + + +/**************************************************************************** + MODULE initialise handler for HANDLER call +****************************************************************************/ + +void ha_partition::init_table_handle_for_HANDLER() +{ + return; +} + + +/**************************************************************************** + MODULE Partition Share +****************************************************************************/ +/* + Service routines for ... methods. +------------------------------------------------------------------------- + Variables for partition share methods. A hash used to track open tables. + A mutex for the hash table and an init variable to check if hash table + is initialised. + There is also a constant ending of the partition handler file name. +*/ + +#ifdef NOT_USED +static HASH partition_open_tables; +static pthread_mutex_t partition_mutex; +static int partition_init= 0; + + +/* + Function we use in the creation of our hash to get key. +*/ +static byte *partition_get_key(PARTITION_SHARE *share, uint *length, + my_bool not_used __attribute__ ((unused))) +{ + *length= share->table_name_length; + return (byte *) share->table_name; +} + +/* + Example of simple lock controls. The "share" it creates is structure we + will pass to each partition handler. Do you have to have one of these? + Well, you have pieces that are used for locking, and they are needed to + function. +*/ + + +static PARTITION_SHARE *get_share(const char *table_name, TABLE *table) +{ + PARTITION_SHARE *share; + uint length; + char *tmp_name; + + /* + So why does this exist? There is no way currently to init a storage + engine. + Innodb and BDB both have modifications to the server to allow them to + do this. Since you will not want to do this, this is probably the next + best method. 
+ */ + if (!partition_init) + { + /* Hijack a mutex for init'ing the storage engine */ + pthread_mutex_lock(&LOCK_mysql_create_db); + if (!partition_init) + { + partition_init++; + VOID(pthread_mutex_init(&partition_mutex, MY_MUTEX_INIT_FAST)); + (void) hash_init(&partition_open_tables, system_charset_info, 32, 0, 0, + (hash_get_key) partition_get_key, 0, 0); + } + pthread_mutex_unlock(&LOCK_mysql_create_db); + } + pthread_mutex_lock(&partition_mutex); + length= (uint) strlen(table_name); + + if (!(share= (PARTITION_SHARE *) hash_search(&partition_open_tables, + (byte *) table_name, length))) + { + if (!(share= (PARTITION_SHARE *) + my_multi_malloc(MYF(MY_WME | MY_ZEROFILL), + &share, sizeof(*share), + &tmp_name, length + 1, NullS))) + { + pthread_mutex_unlock(&partition_mutex); + return NULL; + } + + share->use_count= 0; + share->table_name_length= length; + share->table_name= tmp_name; + strmov(share->table_name, table_name); + if (my_hash_insert(&partition_open_tables, (byte *) share)) + goto error; + thr_lock_init(&share->lock); + pthread_mutex_init(&share->mutex, MY_MUTEX_INIT_FAST); + } + share->use_count++; + pthread_mutex_unlock(&partition_mutex); + + return share; + +error: + pthread_mutex_unlock(&partition_mutex); + my_free((gptr) share, MYF(0)); + + return NULL; +} + + +/* + Free lock controls. We call this whenever we close a table. If the table + had the last reference to the share then we free memory associated with + it. 
+*/ + +static int free_share(PARTITION_SHARE *share) +{ + pthread_mutex_lock(&partition_mutex); + if (!--share->use_count) + { + hash_delete(&partition_open_tables, (byte *) share); + thr_lock_delete(&share->lock); + pthread_mutex_destroy(&share->mutex); + my_free((gptr) share, MYF(0)); + } + pthread_mutex_unlock(&partition_mutex); + + return 0; +} +#endif /* NOT_USED */ +#endif /* HAVE_PARTITION_DB */ diff --git a/sql/ha_partition.h b/sql/ha_partition.h new file mode 100644 index 00000000000..e78cff4cdbb --- /dev/null +++ b/sql/ha_partition.h @@ -0,0 +1,916 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifdef __GNUC__ +#pragma interface /* gcc class implementation */ +#endif + +/* + PARTITION_SHARE is a structure that will be shared amoung all open handlers + The partition implements the minimum of what you will probably need. 
+*/ + +typedef struct st_partition_share +{ + char *table_name; + uint table_name_length, use_count; + pthread_mutex_t mutex; + THR_LOCK lock; +} PARTITION_SHARE; + + +#define PARTITION_BYTES_IN_POS 2 +class ha_partition :public handler +{ +private: + enum partition_index_scan_type + { + partition_index_read= 0, + partition_index_first= 1, + partition_index_last= 2, + partition_no_index_scan= 3 + }; + /* Data for the partition handler */ + char *m_file_buffer; // Buffer with names + char *m_name_buffer_ptr; // Pointer to first partition name + uchar *m_engine_array; // Array of types of the handlers + handler **m_file; // Array of references to handler inst. + partition_info *m_part_info; // local reference to partition + byte *m_start_key_ref; // Reference of start key in current + // index scan info + Field **m_part_field_array; // Part field array locally to save acc + byte *m_ordered_rec_buffer; // Row and key buffer for ord. idx scan + KEY *m_curr_key_info; // Current index + byte *m_rec0; // table->record[0] + QUEUE queue; // Prio queue used by sorted read + /* + Since the partition handler is a handler on top of other handlers, it + is necessary to keep information about what the underlying handler + characteristics is. It is not possible to keep any handler instances + for this since the MySQL Server sometimes allocating the handler object + without freeing them. 
+ */ + u_long m_table_flags; + u_long m_low_byte_first; + + uint m_tot_parts; // Total number of partitions; + uint m_last_part; // Last file that we update,write + int m_lock_type; // Remembers type of last + // external_lock + part_id_range m_part_spec; // Which parts to scan + uint m_scan_value; // Value passed in rnd_init + // call + uint m_ref_length; // Length of position in this + // handler object + key_range m_start_key; // index read key range + enum partition_index_scan_type m_index_scan_type;// What type of index + // scan + uint m_top_entry; // Which partition is to + // deliver next result + uint m_rec_length; // Local copy of record length + + bool m_ordered; // Ordered/Unordered index scan + bool m_has_transactions; // Can we support transactions + bool m_pkey_is_clustered; // Is primary key clustered + bool m_create_handler; // Handler used to create table + bool m_is_sub_partitioned; // Is subpartitioned + bool m_ordered_scan_ongoing; + bool m_use_bit_array; + + /* + We keep track if all underlying handlers are MyISAM since MyISAM has a + great number of extra flags not needed by other handlers. + */ + bool m_myisam; // Are all underlying handlers + // MyISAM + /* + We keep track of InnoDB handlers below since it requires proper setting + of query_id in fields at index_init and index_read calls. + */ + bool m_innodb; // Are all underlying handlers + // InnoDB + /* + When calling extra(HA_EXTRA_CACHE) we do not pass this to the underlying + handlers immediately. Instead we cache it and call the underlying + immediately before starting the scan on the partition. This is to + prevent allocating a READ CACHE for each partition in parallel when + performing a full table scan on MyISAM partitioned table. + This state is cleared by extra(HA_EXTRA_NO_CACHE). + */ + bool m_extra_cache; + uint m_extra_cache_size; + + void init_handler_variables(); + /* + Variables for lock structures. 
+ */ + THR_LOCK_DATA lock; /* MySQL lock */ + PARTITION_SHARE *share; /* Shared lock info */ + +public: + /* + ------------------------------------------------------------------------- + MODULE create/delete handler object + ------------------------------------------------------------------------- + Object create/delete methode. The normal called when a table object + exists. There is also a method to create the handler object with only + partition information. This is used from mysql_create_table when the + table is to be created and the engine type is deduced to be the + partition handler. + ------------------------------------------------------------------------- + */ + ha_partition(TABLE * table); + ha_partition(partition_info * part_info); + ~ha_partition(); + /* + A partition handler has no characteristics in itself. It only inherits + those from the underlying handlers. Here we set-up those constants to + enable later calls of the methods to retrieve constants from the under- + lying handlers. Returns false if not successful. + */ + int ha_initialise(); + + /* + ------------------------------------------------------------------------- + MODULE meta data changes + ------------------------------------------------------------------------- + Meta data routines to CREATE, DROP, RENAME table and often used at + ALTER TABLE (update_create_info used from ALTER TABLE and SHOW ..). + + update_table_comment is used in SHOW TABLE commands to provide a + chance for the handler to add any interesting comments to the table + comments not provided by the users comment. + + create_handler_files is called before opening a new handler object + with openfrm to call create. 
It is used to create any local handler + object needed in opening the object in openfrm + ------------------------------------------------------------------------- + */ + virtual int delete_table(const char *from); + virtual int rename_table(const char *from, const char *to); + virtual int create(const char *name, TABLE * form, + HA_CREATE_INFO * create_info); + virtual int create_handler_files(const char *name); + virtual void update_create_info(HA_CREATE_INFO * create_info); + virtual char *update_table_comment(const char *comment); +private: + /* + delete_table, rename_table and create uses very similar logic which + is packed into this routine. + */ + uint del_ren_cre_table(const char *from, + const char *to= NULL, + TABLE * table_arg= NULL, + HA_CREATE_INFO * create_info= NULL); + /* + One method to create the table_name.par file containing the names of the + underlying partitions, their engine and the number of partitions. + And one method to read it in. + */ + bool create_handler_file(const char *name); + bool get_from_handler_file(const char *name); + bool new_handlers_from_part_info(); + bool create_handlers(); + void clear_handler_file(); + void set_up_table_before_create(TABLE * table_arg, HA_CREATE_INFO * info, + uint part_id); + partition_element *find_partition_element(uint part_id); +public: + + /* + ------------------------------------------------------------------------- + MODULE open/close object + ------------------------------------------------------------------------- + Open and close handler object to ensure all underlying files and + objects allocated and deallocated for query handling is handled + properly. + ------------------------------------------------------------------------- + + A handler object is opened as part of its initialisation and before + being used for normal queries (not before meta-data changes always. + If the object was opened it will also be closed before being deleted. 
+ */ + virtual int open(const char *name, int mode, uint test_if_locked); + virtual int close(void); + + /* + ------------------------------------------------------------------------- + MODULE start/end statement + ------------------------------------------------------------------------- + This module contains methods that are used to understand start/end of + statements, transaction boundaries, and aid for proper concurrency + control. + The partition handler need not implement abort and commit since this + will be handled by any underlying handlers implementing transactions. + There is only one call to each handler type involved per transaction + and these go directly to the handlers supporting transactions + currently InnoDB, BDB and NDB). + ------------------------------------------------------------------------- + */ + virtual THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, + enum thr_lock_type lock_type); + virtual int external_lock(THD * thd, int lock_type); + /* + When table is locked a statement is started by calling start_stmt + instead of external_lock + */ + virtual int start_stmt(THD * thd); + /* + Lock count is number of locked underlying handlers (I assume) + */ + virtual uint lock_count(void) const; + /* + Call to unlock rows not to be updated in transaction + */ + virtual void unlock_row(); + + /* + ------------------------------------------------------------------------- + MODULE change record + ------------------------------------------------------------------------- + This part of the handler interface is used to change the records + after INSERT, DELETE, UPDATE, REPLACE method calls but also other + special meta-data operations as ALTER TABLE, LOAD DATA, TRUNCATE. + ------------------------------------------------------------------------- + + These methods are used for insert (write_row), update (update_row) + and delete (delete_row). All methods to change data always work on + one row at a time. 
update_row and delete_row also contains the old + row. + delete_all_rows will delete all rows in the table in one call as a + special optimisation for DELETE from table; + + Bulk inserts are supported if all underlying handlers support it. + start_bulk_insert and end_bulk_insert is called before and after a + number of calls to write_row. + Not yet though. + */ + virtual int write_row(byte * buf); + virtual int update_row(const byte * old_data, byte * new_data); + virtual int delete_row(const byte * buf); + virtual int delete_all_rows(void); + virtual void start_bulk_insert(ha_rows rows); + virtual int end_bulk_insert(); + + /* + ------------------------------------------------------------------------- + MODULE full table scan + ------------------------------------------------------------------------- + This module is used for the most basic access method for any table + handler. This is to fetch all data through a full table scan. No + indexes are needed to implement this part. + It contains one method to start the scan (rnd_init) that can also be + called multiple times (typical in a nested loop join). Then proceeding + to the next record (rnd_next) and closing the scan (rnd_end). + To remember a record for later access there is a method (position) + and there is a method used to retrieve the record based on the stored + position. + The position can be a file position, a primary key, a ROWID dependent + on the handler below. + ------------------------------------------------------------------------- + */ + /* + unlike index_init(), rnd_init() can be called two times + without rnd_end() in between (it only makes sense if scan=1). 
+ then the second call should prepare for the new table scan + (e.g if rnd_init allocates the cursor, second call should + position it to the start of the table, no need to deallocate + and allocate it again + */ + virtual int rnd_init(bool scan); + virtual int rnd_end(); + virtual int rnd_next(byte * buf); + virtual int rnd_pos(byte * buf, byte * pos); + virtual void position(const byte * record); + + /* + ------------------------------------------------------------------------- + MODULE index scan + ------------------------------------------------------------------------- + This part of the handler interface is used to perform access through + indexes. The interface is defined as a scan interface but the handler + can also use key lookup if the index is a unique index or a primary + key index. + Index scans are mostly useful for SELECT queries but are an important + part also of UPDATE, DELETE, REPLACE and CREATE TABLE table AS SELECT + and so forth. + Naturally an index is needed for an index scan and indexes can either + be ordered, hash based. Some ordered indexes can return data in order + but not necessarily all of them. + There are many flags that define the behavior of indexes in the + various handlers. These methods are found in the optimizer module. + ------------------------------------------------------------------------- + + index_read is called to start a scan of an index. The find_flag defines + the semantics of the scan. These flags are defined in + include/my_base.h + index_read_idx is the same but also initializes index before calling doing + the same thing as index_read. Thus it is similar to index_init followed + by index_read. This is also how we implement it. + + index_read/index_read_idx does also return the first row. Thus for + key lookups, the index_read will be the only call to the handler in + the index scan. + + index_init initializes an index before using it and index_end does + any end processing needed. 
+ */ + virtual int index_read(byte * buf, const byte * key, + uint key_len, enum ha_rkey_function find_flag); + virtual int index_read_idx(byte * buf, uint idx, const byte * key, + uint key_len, enum ha_rkey_function find_flag); + virtual int index_init(uint idx, bool sorted); + virtual int index_end(); + + /* + These methods are used to jump to next or previous entry in the index + scan. There are also methods to jump to first and last entry. + */ + virtual int index_next(byte * buf); + virtual int index_prev(byte * buf); + virtual int index_first(byte * buf); + virtual int index_last(byte * buf); + virtual int index_next_same(byte * buf, const byte * key, uint keylen); + virtual int index_read_last(byte * buf, const byte * key, uint keylen); + + /* + read_first_row is virtual method but is only implemented by + handler.cc, no storage engine has implemented it so neither + will the partition handler. + + virtual int read_first_row(byte *buf, uint primary_key); + */ + + /* + We don't implement multi read range yet, will do later. 
+ virtual int read_multi_range_first(KEY_MULTI_RANGE **found_range_p, + KEY_MULTI_RANGE *ranges, uint range_count, + bool sorted, HANDLER_BUFFER *buffer); + virtual int read_multi_range_next(KEY_MULTI_RANGE **found_range_p); + */ + + + virtual int read_range_first(const key_range * start_key, + const key_range * end_key, + bool eq_range, bool sorted); + virtual int read_range_next(); + +private: + int common_index_read(byte * buf, const byte * key, + uint key_len, enum ha_rkey_function find_flag); + int common_first_last(byte * buf); + int partition_scan_set_up(byte * buf, bool idx_read_flag); + int handle_unordered_next(byte * buf, bool next_same); + int handle_unordered_scan_next_partition(byte * buf); + byte *queue_buf(uint part_id) + { + return (m_ordered_rec_buffer + + (part_id * (m_rec_length + PARTITION_BYTES_IN_POS))); + } + byte *rec_buf(uint part_id) + { + return (queue_buf(part_id) + + PARTITION_BYTES_IN_POS); + } + int handle_ordered_index_scan(byte * buf); + int handle_ordered_next(byte * buf, bool next_same); + int handle_ordered_prev(byte * buf); + void return_top_record(byte * buf); + void include_partition_fields_in_used_fields(); +public: + /* + ------------------------------------------------------------------------- + MODULE information calls + ------------------------------------------------------------------------- + This calls are used to inform the handler of specifics of the ongoing + scans and other actions. Most of these are used for optimisation + purposes. 
+ ------------------------------------------------------------------------- + */ + virtual void info(uint); + virtual int extra(enum ha_extra_function operation); + virtual int extra_opt(enum ha_extra_function operation, ulong cachesize); + virtual int reset(void); + +private: + static const uint NO_CURRENT_PART_ID= 0xFFFFFFFF; + int loop_extra(enum ha_extra_function operation); + void late_extra_cache(uint partition_id); + void late_extra_no_cache(uint partition_id); + void prepare_extra_cache(uint cachesize); +public: + + /* + ------------------------------------------------------------------------- + MODULE optimiser support + ------------------------------------------------------------------------- + ------------------------------------------------------------------------- + */ + + /* + NOTE !!!!!! + ------------------------------------------------------------------------- + ------------------------------------------------------------------------- + One important part of the public handler interface that is not depicted in + the methods is the attribute records + + which is defined in the base class. This is looked upon directly and is + set by calling info(HA_STATUS_INFO) ? + ------------------------------------------------------------------------- + */ + + /* + keys_to_use_for_scanning can probably be implemented as the + intersection of all underlying handlers if mixed handlers are used. + This method is used to derive whether an index can be used for + index-only scanning when performing an ORDER BY query. + Only called from one place in sql_select.cc + */ + virtual const key_map *keys_to_use_for_scanning(); + + /* + Called in test_quick_select to determine if indexes should be used. + */ + virtual double scan_time(); + + /* + The next method will never be called if you do not implement indexes. + */ + virtual double read_time(uint index, uint ranges, ha_rows rows); + /* + For the given range how many records are estimated to be in this range. 
+ Used by optimiser to calculate cost of using a particular index. + */ + virtual ha_rows records_in_range(uint inx, key_range * min_key, + key_range * max_key); + + /* + Upper bound of number records returned in scan is sum of all + underlying handlers. + */ + virtual ha_rows estimate_rows_upper_bound(); + + /* + table_cache_type is implemented by the underlying handler but all + underlying handlers must have the same implementation for it to work. + */ + virtual uint8 table_cache_type(); + + /* + ------------------------------------------------------------------------- + MODULE print messages + ------------------------------------------------------------------------- + This module contains various methods that returns text messages for + table types, index type and error messages. + ------------------------------------------------------------------------- + */ + /* + The name of the index type that will be used for display + Here we must ensure that all handlers use the same index type + for each index created. + */ + virtual const char *index_type(uint inx); + + /* The name of the table type that will be used for display purposes */ + virtual const char *table_type() const + { return "PARTITION"; } + + /* + Handler specific error messages + */ + virtual void print_error(int error, myf errflag); + virtual bool get_error_message(int error, String * buf); + /* + ------------------------------------------------------------------------- + MODULE handler characteristics + ------------------------------------------------------------------------- + This module contains a number of methods defining limitations and + characteristics of the handler. The partition handler will calculate + this characteristics based on underlying handler characteristics. + ------------------------------------------------------------------------- + + This is a list of flags that says what the storage engine + implements. 
The current table flags are documented in handler.h + The partition handler will support whatever the underlying handlers + support except when specifically mentioned below about exceptions + to this rule. + + HA_READ_RND_SAME: + Not currently used. (Means that the handler supports the rnd_same() call) + (MyISAM, HEAP) + + HA_TABLE_SCAN_ON_INDEX: + Used to avoid scanning full tables on an index. If this flag is set then + the handler always has a primary key (hidden if not defined) and this + index is used for scanning rather than a full table scan in all + situations. + (InnoDB, BDB, Federated) + + HA_REC_NOT_IN_SEQ: + This flag is set for handlers that cannot guarantee that the rows are + returned accroding to incremental positions (0, 1, 2, 3...). + This also means that rnd_next() should return HA_ERR_RECORD_DELETED + if it finds a deleted row. + (MyISAM (not fixed length row), BDB, HEAP, NDB, InooDB) + + HA_CAN_GEOMETRY: + Can the storage engine handle spatial data. + Used to check that no spatial attributes are declared unless + the storage engine is capable of handling it. + (MyISAM) + + HA_FAST_KEY_READ: + Setting this flag indicates that the handler is equally fast in + finding a row by key as by position. + This flag is used in a very special situation in conjunction with + filesort's. For further explanation see intro to init_read_record. + (BDB, HEAP, InnoDB) + + HA_NULL_IN_KEY: + Is NULL values allowed in indexes. + If this is not allowed then it is not possible to use an index on a + NULLable field. + (BDB, HEAP, MyISAM, NDB, InnoDB) + + HA_DUPP_POS: + Tells that we can the position for the conflicting duplicate key + record is stored in table->file->dupp_ref. (insert uses rnd_pos() on + this to find the duplicated row) + (MyISAM) + + HA_CAN_INDEX_BLOBS: + Is the storage engine capable of defining an index of a prefix on + a BLOB attribute. + (BDB, Federated, MyISAM, InnoDB) + + HA_AUTO_PART_KEY: + Auto increment fields can be part of a multi-part key. 
For second part + auto-increment keys, the auto_incrementing is done in handler.cc + (BDB, Federated, MyISAM, NDB) + + HA_REQUIRE_PRIMARY_KEY: + Can't define a table without primary key (and cannot handle a table + with hidden primary key) + (No handler has this limitation currently) + + HA_NOT_EXACT_COUNT: + Does the counter of records after the info call specify an exact + value or not. If it doesn't this flag is set. + Only MyISAM and HEAP uses exact count. + (MyISAM, HEAP, BDB, InnoDB, NDB, Federated) + + HA_CAN_INSERT_DELAYED: + Can the storage engine support delayed inserts. + To start with the partition handler will not support delayed inserts. + Further investigation needed. + (HEAP, MyISAM) + + HA_PRIMARY_KEY_IN_READ_INDEX: + This parameter is set when the handler will also return the primary key + when doing read-only-key on another index. + + HA_NOT_DELETE_WITH_CACHE: + Seems to be an old MyISAM feature that is no longer used. No handler + has it defined but it is checked in init_read_record. + Further investigation needed. + (No handler defines it) + + HA_NO_PREFIX_CHAR_KEYS: + Indexes on prefixes of character fields is not allowed. + (NDB) + + HA_CAN_FULLTEXT: + Does the storage engine support fulltext indexes + The partition handler will start by not supporting fulltext indexes. + (MyISAM) + + HA_CAN_SQL_HANDLER: + Can the HANDLER interface in the MySQL API be used towards this + storage engine. + (MyISAM, InnoDB) + + HA_NO_AUTO_INCREMENT: + Set if the storage engine does not support auto increment fields. + (Currently not set by any handler) + + HA_HAS_CHECKSUM: + Special MyISAM feature. Has special SQL support in CREATE TABLE. + No special handling needed by partition handler. + (MyISAM) + + HA_FILE_BASED: + Should file names always be in lower case (used by engines + that map table names to file names. + Since partition handler has a local file this flag is set. 
+ (BDB, Federated, MyISAM) + + HA_CAN_BIT_FIELD: + Is the storage engine capable of handling bit fields? + (MyISAM, NDB) + + HA_NEED_READ_RANGE_BUFFER: + Is Read Multi-Range supported => need multi read range buffer + This parameter specifies whether a buffer for read multi range + is needed by the handler. Whether the handler supports this + feature or not is dependent of whether the handler implements + read_multi_range* calls or not. The only handler currently + supporting this feature is NDB so the partition handler need + not handle this call. There are methods in handler.cc that will + transfer those calls into index_read and other calls in the + index scan module. + (NDB) + */ + virtual ulong table_flags() const + { return m_table_flags; } + /* + HA_CAN_PARTITION: + Used by storage engines that can handle partitioning without this + partition handler + (Partition, NDB) + + HA_CAN_UPDATE_PARTITION_KEY: + Set if the handler can update fields that are part of the partition + function. + + HA_CAN_PARTITION_UNIQUE: + Set if the handler can handle unique indexes where the fields of the + unique key are not part of the fields of the partition function. Thus + a unique key can be set on all fields. + */ + virtual ulong partition_flags() const + { return HA_CAN_PARTITION; } + + /* + This is a bitmap of flags that says how the storage engine + implements indexes. The current index flags are documented in + handler.h. If you do not implement indexes, just return zero + here. + + part is the key part to check. First key part is 0 + If all_parts it's set, MySQL want to know the flags for the combined + index up to and including 'part'. + + HA_READ_NEXT: + Does the index support read next, this is assumed in the server + code and never checked so all indexes must support this. + Note that the handler can be used even if it doesn't have any index. + (BDB, HEAP, MyISAM, Federated, NDB, InnoDB) + + HA_READ_PREV: + Can the index be used to scan backwards. 
+ (BDB, HEAP, MyISAM, NDB, InnoDB) + + HA_READ_ORDER: + Can the index deliver its record in index order. Typically true for + all ordered indexes and not true for hash indexes. + In first step this is not true for partition handler until a merge + sort has been implemented in partition handler. + Used to set keymap part_of_sortkey + This keymap is only used to find indexes usable for resolving an ORDER BY + in the query. Thus in most cases index_read will work just fine without + order in result production. When this flag is set it is however safe to + order all output started by index_read since most engines do this. With + read_multi_range calls there is a specific flag setting order or not + order so in those cases ordering of index output can be avoided. + (BDB, InnoDB, HEAP, MyISAM, NDB) + + HA_READ_RANGE: + Specify whether index can handle ranges, typically true for all + ordered indexes and not true for hash indexes. + Used by optimiser to check if ranges (as key >= 5) can be optimised + by index. + (BDB, InnoDB, NDB, MyISAM, HEAP) + + HA_ONLY_WHOLE_INDEX: + Can't use part key searches. This is typically true for hash indexes + and typically not true for ordered indexes. + (Federated, NDB, HEAP) + + HA_KEYREAD_ONLY: + Does the storage engine support index-only scans on this index. + Enables use of HA_EXTRA_KEYREAD and HA_EXTRA_NO_KEYREAD + Used to set key_map keys_for_keyread and to check in optimiser for + index-only scans. When doing a read under HA_EXTRA_KEYREAD the handler + only have to fill in the columns the key covers. If + HA_PRIMARY_KEY_IN_READ_INDEX is set then also the PRIMARY KEY columns + must be updated in the row. 
+ (BDB, InnoDB, MyISAM) + */ + virtual ulong index_flags(uint inx, uint part, bool all_parts) const + { + return m_file[0]->index_flags(inx, part, all_parts); + } + + /* + extensions of table handler files + */ + virtual const char **bas_ext() const; + /* + unireg.cc will call the following to make sure that the storage engine + can handle the data it is about to send. + + Each maximum supported value is the minimum over all handlers in the table + */ + uint min_of_the_max_uint(uint (handler::*operator_func)(void) const) const; + virtual uint max_supported_record_length() const; + virtual uint max_supported_keys() const; + virtual uint max_supported_key_parts() const; + virtual uint max_supported_key_length() const; + virtual uint max_supported_key_part_length() const; + + /* + All handlers in a partitioned table must have the same low_byte_first + */ + virtual bool low_byte_first() const + { return m_low_byte_first; } + + /* + The extra record buffer length is the maximum needed by all handlers. + The minimum record length is the maximum of all involved handlers. + */ + virtual uint extra_rec_buf_length() const; + virtual uint min_record_length(uint options) const; + + /* + Transactions on the table are supported if all handlers below support + transactions. + */ + virtual bool has_transactions() + { return m_has_transactions; } + + /* + "Primary key is clustered" can only be true if all underlying handlers have + this feature. + */ + virtual bool primary_key_is_clustered() + { return m_pkey_is_clustered; } + + /* + ------------------------------------------------------------------------- + MODULE compare records + ------------------------------------------------------------------------- + cmp_ref checks if two references are the same. For most handlers this is + a simple memcmp of the reference. However some handlers use primary key + as reference and this can be the same even if memcmp says they are + different. 
This is due to character sets and end spaces and so forth. + For the partition handler the reference is first two bytes providing the + partition identity of the referred record and then the reference of the + underlying handler. + Thus cmp_ref for the partition handler always returns FALSE for records + not in the same partition and uses cmp_ref on the underlying handler + to check whether the rest of the reference part is also the same. + ------------------------------------------------------------------------- + */ + virtual int cmp_ref(const byte * ref1, const byte * ref2); + /* + ------------------------------------------------------------------------- + MODULE auto increment + ------------------------------------------------------------------------- + This module is used to handle the support of auto increments. + + This variable in the handler is used as part of the handler interface + It is maintained by the parent handler object and should not be + touched by child handler objects (see handler.cc for its use). + + auto_increment_column_changed + ------------------------------------------------------------------------- + */ + virtual void restore_auto_increment(); + virtual ulonglong get_auto_increment(); + + /* + ------------------------------------------------------------------------- + MODULE initialise handler for HANDLER call + ------------------------------------------------------------------------- + This method is a special InnoDB method called before a HANDLER query. 
+ ------------------------------------------------------------------------- + */ + virtual void init_table_handle_for_HANDLER(); + + /* + The remainder of this file defines the handler methods not implemented + by the partition handler + */ + + /* + ------------------------------------------------------------------------- + MODULE foreign key support + ------------------------------------------------------------------------- + The following methods are used to implement foreign keys as supported by + InnoDB. Implement this ?? + get_foreign_key_create_info is used by SHOW CREATE TABLE to get a textual + description of how the CREATE TABLE part to define FOREIGN KEY's is done. + free_foreign_key_create_info is used to free the memory area that provided + this description. + ------------------------------------------------------------------------- + + virtual char* get_foreign_key_create_info() + virtual void free_foreign_key_create_info(char* str) + + virtual int get_foreign_key_list(THD *thd, + List<FOREIGN_KEY_INFO> *f_key_list) + virtual uint referenced_by_foreign_key() + */ + + /* + ------------------------------------------------------------------------- + MODULE fulltext index + ------------------------------------------------------------------------- + Fulltext stuff not yet. + ------------------------------------------------------------------------- + virtual int ft_init() { return HA_ERR_WRONG_COMMAND; } + virtual FT_INFO *ft_init_ext(uint flags,uint inx,const byte *key, + uint keylen) + { return NULL; } + virtual int ft_read(byte *buf) { return HA_ERR_WRONG_COMMAND; } + */ + + /* + ------------------------------------------------------------------------- + MODULE restart full table scan at position (MyISAM) + ------------------------------------------------------------------------- + The following method is only used by MyISAM when used as + temporary tables in a join. 
+ virtual int restart_rnd_next(byte *buf, byte *pos); + */ + + /* + ------------------------------------------------------------------------- + MODULE on-line ALTER TABLE + ------------------------------------------------------------------------- + These methods are in the handler interface but never used (yet) + They are to be used by on-line alter table add/drop index: + ------------------------------------------------------------------------- + virtual ulong index_ddl_flags(KEY *wanted_index) const + virtual int add_index(TABLE *table_arg,KEY *key_info,uint num_of_keys); + virtual int drop_index(TABLE *table_arg,uint *key_num,uint num_of_keys); + */ + + /* + ------------------------------------------------------------------------- + MODULE tablespace support + ------------------------------------------------------------------------- + Admin of table spaces is not applicable to the partition handler (InnoDB) + This means that the following method is not implemented: + ------------------------------------------------------------------------- + virtual int discard_or_import_tablespace(my_bool discard) + */ + + /* + ------------------------------------------------------------------------- + MODULE admin MyISAM + ------------------------------------------------------------------------- + Admin commands not supported currently (almost purely MyISAM routines) + This means that the following methods are not implemented: + ------------------------------------------------------------------------- + + virtual int check(THD* thd, HA_CHECK_OPT *check_opt); + virtual int backup(THD* thd, HA_CHECK_OPT *check_opt); + virtual int restore(THD* thd, HA_CHECK_OPT *check_opt); + virtual int repair(THD* thd, HA_CHECK_OPT *check_opt); + virtual int optimize(THD* thd, HA_CHECK_OPT *check_opt); + virtual int analyze(THD* thd, HA_CHECK_OPT *check_opt); + virtual int assign_to_keycache(THD* thd, HA_CHECK_OPT *check_opt); + virtual int preload_keys(THD *thd, HA_CHECK_OPT *check_opt); + 
virtual bool check_and_repair(THD *thd); + virtual int dump(THD* thd, int fd = -1); + virtual int net_read_dump(NET* net); + virtual uint checksum() const; + virtual bool is_crashed() const; + virtual bool auto_repair() const; + + ------------------------------------------------------------------------- + MODULE enable/disable indexes + ------------------------------------------------------------------------- + Enable/Disable Indexes are not supported currently (Heap, MyISAM) + This means that the following methods are not implemented: + ------------------------------------------------------------------------- + virtual int disable_indexes(uint mode); + virtual int enable_indexes(uint mode); + virtual int indexes_are_disabled(void); + */ + + /* + ------------------------------------------------------------------------- + MODULE append_create_info + ------------------------------------------------------------------------- + append_create_info is only used by MyISAM MERGE tables and the partition + handler will not support this handler as underlying handler. + Implement this?? 
+ ------------------------------------------------------------------------- + virtual void append_create_info(String *packet) + */ +}; diff --git a/sql/handler.cc b/sql/handler.cc index bf2ce0dad4e..512363f71d7 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -34,6 +34,9 @@ #ifdef HAVE_EXAMPLE_DB #include "examples/ha_example.h" #endif +#ifdef HAVE_PARTITION_DB +#include "ha_partition.h" +#endif #ifdef HAVE_ARCHIVE_DB #include "examples/ha_archive.h" #endif @@ -170,7 +173,13 @@ enum db_type ha_checktype(THD *thd, enum db_type database_type, { if (ha_storage_engine_is_enabled(database_type)) return database_type; - +#ifdef HAVE_PARTITION_DB + /* + Partition handler is not in the list of handlers shown since it is an internal handler + */ + if (database_type == DB_TYPE_PARTITION_DB) + return database_type; +#endif if (no_substitute) { if (report_error) @@ -236,6 +245,13 @@ handler *get_new_handler(TABLE *table, enum db_type db_type) file= new ha_example(table); break; #endif +#ifdef HAVE_PARTITION_DB + case DB_TYPE_PARTITION_DB: + { + file= new ha_partition(table); + break; + } +#endif #ifdef HAVE_ARCHIVE_DB case DB_TYPE_ARCHIVE_DB: file= new ha_archive(table); @@ -290,6 +306,29 @@ handler *get_new_handler(TABLE *table, enum db_type db_type) return file; } + +#ifdef HAVE_PARTITION_DB +handler *get_ha_partition(partition_info *part_info) +{ + ha_partition *partition; + DBUG_ENTER("get_ha_partition"); + if ((partition= new ha_partition(part_info))) + { + if (partition->ha_initialise()) + { + delete partition; + partition= 0; + } + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(ha_partition)); + } + DBUG_RETURN(((handler*) partition)); +} +#endif + + /* Register handler error messages for use with my_error(). 
@@ -1406,27 +1445,41 @@ int handler::ha_allocate_read_write_set(ulong no_fields) my_bool r; #endif DBUG_ENTER("ha_allocate_read_write_set"); - DBUG_PRINT("info", ("no_fields = %d", no_fields)); - read_set= (MY_BITMAP*)sql_alloc(sizeof(MY_BITMAP)); - write_set= (MY_BITMAP*)sql_alloc(sizeof(MY_BITMAP)); - read_buf= (uint32*)sql_alloc(bitmap_size); - write_buf= (uint32*)sql_alloc(bitmap_size); - if (!read_set || !write_set || !read_buf || !write_buf) - { - ha_deallocate_read_write_set(); - DBUG_RETURN(TRUE); - } + DBUG_PRINT("enter", ("no_fields = %d", no_fields)); + + if (table) + { + if (table->read_set == NULL) + { + read_set= (MY_BITMAP*)sql_alloc(sizeof(MY_BITMAP)); + write_set= (MY_BITMAP*)sql_alloc(sizeof(MY_BITMAP)); + read_buf= (uint32*)sql_alloc(bitmap_size); + write_buf= (uint32*)sql_alloc(bitmap_size); + if (!read_set || !write_set || !read_buf || !write_buf) + { + ha_deallocate_read_write_set(); + DBUG_RETURN(TRUE); + } #ifndef DEBUG_OFF - r = + r = #endif - bitmap_init(read_set, read_buf, no_fields+1, FALSE); - DBUG_ASSERT(!r /*bitmap_init(read_set...)*/); + bitmap_init(read_set, read_buf, no_fields+1, FALSE); + DBUG_ASSERT(!r /*bitmap_init(read_set...)*/); #ifndef DEBUG_OFF - r = + r = #endif - bitmap_init(write_set, write_buf, no_fields+1, FALSE); - DBUG_ASSERT(!r /*bitmap_init(write_set...)*/); - ha_clear_all_set(); + bitmap_init(write_set, write_buf, no_fields+1, FALSE); + DBUG_ASSERT(!r /*bitmap_init(write_set...)*/); + table->read_set= read_set; + table->write_set= write_set; + ha_clear_all_set(); + } + else + { + read_set= table->read_set; + write_set= table->write_set; + } + } DBUG_RETURN(FALSE); } @@ -1476,6 +1529,8 @@ void handler::ha_set_primary_key_in_read_set() } DBUG_VOID_RETURN; } + + /* Read first row (only) from a table This is never called for InnoDB or BDB tables, as these table types @@ -1504,7 +1559,7 @@ int handler::read_first_row(byte * buf, uint primary_key) else { /* Find the first row through the primary key */ - (void) 
ha_index_init(primary_key); + (void) ha_index_init(primary_key, 0); error=index_first(buf); (void) ha_index_end(); } @@ -1688,7 +1743,7 @@ ulonglong handler::get_auto_increment() int error; (void) extra(HA_EXTRA_KEYREAD); - index_init(table->s->next_number_index); + index_init(table->s->next_number_index, 1); if (!table->s->next_number_key_offset) { // Autoincrement at key-start error=index_last(table->record[1]); @@ -2512,7 +2567,7 @@ int handler::compare_key(key_range *range) int handler::index_read_idx(byte * buf, uint index, const byte * key, uint key_len, enum ha_rkey_function find_flag) { - int error= ha_index_init(index); + int error= ha_index_init(index, 0); if (!error) error= index_read(buf, key, key_len, find_flag); if (!error) diff --git a/sql/handler.h b/sql/handler.h index d7fe19ad884..d06ae062fb3 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -89,6 +89,11 @@ #define HA_NEED_READ_RANGE_BUFFER (1 << 29) /* for read_multi_range */ #define HA_ANY_INDEX_MAY_BE_UNIQUE (1 << 30) +/* Flags for partition handlers */ +#define HA_CAN_PARTITION (1 << 0) /* Partition support */ +#define HA_CAN_UPDATE_PARTITION_KEY (1 << 1) +#define HA_CAN_PARTITION_UNIQUE (1 << 2) + /* bits in index_flags(index_number) for what you can do with index */ #define HA_READ_NEXT 1 /* TODO really use this flag */ @@ -172,6 +177,7 @@ enum db_type DB_TYPE_EXAMPLE_DB, DB_TYPE_ARCHIVE_DB, DB_TYPE_CSV_DB, DB_TYPE_FEDERATED_DB, DB_TYPE_BLACKHOLE_DB, + DB_TYPE_PARTITION_DB, DB_TYPE_DEFAULT // Must be last }; @@ -364,6 +370,208 @@ typedef struct st_thd_trans enum enum_tx_isolation { ISO_READ_UNCOMMITTED, ISO_READ_COMMITTED, ISO_REPEATABLE_READ, ISO_SERIALIZABLE}; + +typedef struct { + uint32 start_part; + uint32 end_part; + bool use_bit_array; +} part_id_range; +/** + * An enum and a struct to handle partitioning and subpartitioning. 
+ */ +enum partition_type { + NOT_A_PARTITION= 0, + RANGE_PARTITION, + HASH_PARTITION, + LIST_PARTITION +}; + +#define UNDEF_NODEGROUP 65535 +class Item; + +class partition_element :public Sql_alloc { +public: + List<partition_element> subpartitions; + List<Item> list_expr_list; + ulonglong part_max_rows; + ulonglong part_min_rows; + char *partition_name; + char *tablespace_name; + Item* range_expr; + char* part_comment; + char* data_file_name; + char* index_file_name; + enum db_type engine_type; + uint16 nodegroup_id; + + partition_element() + : part_max_rows(0), part_min_rows(0), partition_name(NULL), + tablespace_name(NULL), range_expr(NULL), part_comment(NULL), + data_file_name(NULL), index_file_name(NULL), + engine_type(DB_TYPE_UNKNOWN), nodegroup_id(UNDEF_NODEGROUP) + { + subpartitions.empty(); + list_expr_list.empty(); + } + ~partition_element() {} +}; + +typedef struct { + longlong list_value; + uint partition_id; +} LIST_PART_ENTRY; +enum Item_result; + +class partition_info; + +typedef bool (*get_part_id_func)(partition_info *part_info, + uint32 *part_id); +typedef uint32 (*get_subpart_id_func)(partition_info *part_info); + +class partition_info :public Sql_alloc { +public: + /* + * Here comes a set of definitions needed for partitioned table handlers. 
+ */ + List<partition_element> partitions; + + List<char> part_field_list; + List<char> subpart_field_list; + + get_part_id_func get_partition_id; + get_part_id_func get_part_partition_id; + get_subpart_id_func get_subpartition_id; + + Field **part_field_array; + Field **subpart_field_array; + Field **full_part_field_array; + + Item *part_expr; + Item *subpart_expr; + + Item *item_free_list; + + union { + longlong *range_int_array; + LIST_PART_ENTRY *list_array; + }; + char* part_info_string; + + char *part_func_string; + char *subpart_func_string; + + partition_element *curr_part_elem; + partition_element *current_partition; + /* + These key_map's are used for Partitioning to enable quick decisions + on whether we can derive more information about which partition to + scan just by looking at what index is used. + */ + key_map all_fields_in_PF, all_fields_in_PPF, all_fields_in_SPF; + key_map some_fields_in_PF; + + enum db_type default_engine_type; + Item_result part_result_type; + partition_type part_type; + partition_type subpart_type; + + uint part_info_len; + uint part_func_len; + uint subpart_func_len; + + uint no_full_parts; + uint no_parts; + uint no_subparts; + uint count_curr_parts; + uint count_curr_subparts; + + uint part_error_code; + + uint no_list_values; + + uint no_part_fields; + uint no_subpart_fields; + uint no_full_part_fields; + + uint16 linear_hash_mask; + + bool use_default_partitions; + bool use_default_subpartitions; + bool defined_max_value; + bool list_of_part_fields; + bool list_of_subpart_fields; + bool linear_hash_ind; + + partition_info() + : get_partition_id(NULL), get_part_partition_id(NULL), + get_subpartition_id(NULL), + part_field_array(NULL), subpart_field_array(NULL), + full_part_field_array(NULL), + part_expr(NULL), subpart_expr(NULL), item_free_list(NULL), + list_array(NULL), + part_info_string(NULL), + part_func_string(NULL), subpart_func_string(NULL), + curr_part_elem(NULL), current_partition(NULL), + 
default_engine_type(DB_TYPE_UNKNOWN), + part_result_type(INT_RESULT), + part_type(NOT_A_PARTITION), subpart_type(NOT_A_PARTITION), + part_info_len(0), part_func_len(0), subpart_func_len(0), + no_full_parts(0), no_parts(0), no_subparts(0), + count_curr_parts(0), count_curr_subparts(0), part_error_code(0), + no_list_values(0), no_part_fields(0), no_subpart_fields(0), + no_full_part_fields(0), linear_hash_mask(0), + use_default_partitions(TRUE), + use_default_subpartitions(TRUE), defined_max_value(FALSE), + list_of_part_fields(FALSE), list_of_subpart_fields(FALSE), + linear_hash_ind(FALSE) + { + all_fields_in_PF.clear_all(); + all_fields_in_PPF.clear_all(); + all_fields_in_SPF.clear_all(); + some_fields_in_PF.clear_all(); + partitions.empty(); + part_field_list.empty(); + subpart_field_list.empty(); + } + ~partition_info() {} +}; + + +#ifdef HAVE_PARTITION_DB +/* + Answers the question if subpartitioning is used for a certain table + SYNOPSIS + is_sub_partitioned() + part_info A reference to the partition_info struct + RETURN VALUE + Returns true if subpartitioning used and false otherwise + DESCRIPTION + A routine to check for subpartitioning for improved readability of code +*/ +inline +bool is_sub_partitioned(partition_info *part_info) +{ return (part_info->subpart_type == NOT_A_PARTITION ? FALSE : TRUE); } + + +/* + Returns the total number of partitions on the leaf level. + SYNOPSIS + get_tot_partitions() + part_info A reference to the partition_info struct + RETURN VALUE + Returns the number of partitions + DESCRIPTION + A routine to check for number of partitions for improved readability + of code +*/ +inline +uint get_tot_partitions(partition_info *part_info) +{ + return part_info->no_parts * + (is_sub_partitioned(part_info) ? 
part_info->no_subparts : 1); +} +#endif + typedef struct st_ha_create_information { CHARSET_INFO *table_charset, *default_table_charset; @@ -412,6 +620,31 @@ typedef struct st_ha_check_opt } HA_CHECK_OPT; +#ifdef HAVE_PARTITION_DB +handler *get_ha_partition(partition_info *part_info); +int get_parts_for_update(const byte *old_data, byte *new_data, + const byte *rec0, partition_info *part_info, + uint32 *old_part_id, uint32 *new_part_id); +int get_part_for_delete(const byte *buf, const byte *rec0, + partition_info *part_info, uint32 *part_id); +bool check_partition_info(partition_info *part_info,enum db_type eng_type, + handler *file, ulonglong max_rows); +bool fix_partition_func(THD *thd, const char *name, TABLE *table); +char *generate_partition_syntax(partition_info *part_info, + uint *buf_length, bool use_sql_alloc); +bool partition_key_modified(TABLE *table, List<Item> &fields); +void get_partition_set(const TABLE *table, byte *buf, const uint index, + const key_range *key_spec, + part_id_range *part_spec); +void get_full_part_id_from_key(const TABLE *table, byte *buf, + KEY *key_info, + const key_range *key_spec, + part_id_range *part_spec); +bool mysql_unpack_partition(File file, THD *thd, uint part_info_len, + TABLE *table); +#endif + + /* This is a buffer area that the handler can use to store rows. 
'end_of_used_area' should be kept updated after calls to @@ -429,10 +662,13 @@ typedef struct st_handler_buffer class handler :public Sql_alloc { +#ifdef HAVE_PARTITION_DB + friend class ha_partition; +#endif protected: struct st_table *table; /* The table definition */ - virtual int index_init(uint idx) { active_index=idx; return 0; } + virtual int index_init(uint idx, bool sorted) { active_index=idx; return 0; } virtual int index_end() { active_index=MAX_KEY; return 0; } /* rnd_init() can be called two times without rnd_end() in between @@ -518,7 +754,7 @@ public: { return rows2double(ranges+rows); } virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; } virtual bool has_transactions(){ return 0;} - virtual uint extra_rec_buf_length() { return 0; } + virtual uint extra_rec_buf_length() const { return 0; } /* Return upper bound of current number of records in the table @@ -537,12 +773,12 @@ public: virtual const char *index_type(uint key_number) { DBUG_ASSERT(0); return "";} - int ha_index_init(uint idx) + int ha_index_init(uint idx, bool sorted) { DBUG_ENTER("ha_index_init"); DBUG_ASSERT(inited==NONE); inited=INDEX; - DBUG_RETURN(index_init(idx)); + DBUG_RETURN(index_init(idx, sorted)); } int ha_index_end() { @@ -902,6 +1138,10 @@ public: virtual const char *table_type() const =0; virtual const char **bas_ext() const =0; virtual ulong table_flags(void) const =0; +#ifdef HAVE_PARTITION_DB + virtual ulong partition_flags(void) const { return 0;} + virtual int get_default_no_partitions(ulonglong max_rows) { return 1;} +#endif virtual ulong index_flags(uint idx, uint part, bool all_parts) const =0; virtual ulong index_ddl_flags(KEY *wanted_index) const { return (HA_DDL_SUPPORT); } @@ -941,6 +1181,7 @@ public: virtual int delete_table(const char *name); virtual int create(const char *name, TABLE *form, HA_CREATE_INFO *info)=0; + virtual int create_handler_files(const char *name) { return FALSE;} /* lock_count() can be more than one if the table 
is a MERGE */ virtual uint lock_count(void) const { return 1; } diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc index ad1c9977e5b..903f4c953a2 100644 --- a/sql/item_subselect.cc +++ b/sql/item_subselect.cc @@ -1492,7 +1492,7 @@ int subselect_uniquesubquery_engine::exec() } if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, 0); error= table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT); @@ -1545,7 +1545,7 @@ int subselect_indexsubquery_engine::exec() } if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, 1); error= table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT); diff --git a/sql/key.cc b/sql/key.cc index 4bd71d2fa47..f1e073a4775 100644 --- a/sql/key.cc +++ b/sql/key.cc @@ -429,3 +429,86 @@ int key_cmp(KEY_PART_INFO *key_part, const byte *key, uint key_length) } return 0; // Keys are equal } + + +/* + Compare two records in index order + SYNOPSIS + key_rec_cmp() + key Index information + rec0 Pointer to table->record[0] + first_rec Pointer to record compare with + second_rec Pointer to record compare against first_rec + DESCRIPTION + This method is set-up such that it can be called directly from the + priority queue and it is attempted to be optimised as much as possible + since this will be called O(N * log N) times while performing a merge + sort in various places in the code. + + We retrieve the pointer to table->record[0] using the fact that key_parts + have an offset making it possible to calculate the start of the record. + We need to get the diff to the compared record since none of the records + being compared are stored in table->record[0]. + + We first check for NULL values, if there are no NULL values we use + a compare method that gets two field pointers and a max length + and return the result of the comparison. 
+*/ + +int key_rec_cmp(void *key, byte *first_rec, byte *second_rec) +{ + KEY *key_info= (KEY*)key; + uint key_parts= key_info->key_parts, i= 0; + KEY_PART_INFO *key_part= key_info->key_part; + char *rec0= key_part->field->ptr - key_part->offset; + my_ptrdiff_t first_diff= first_rec - rec0, sec_diff= second_rec - rec0; + int result= 0; + DBUG_ENTER("key_rec_cmp"); + + do + { + Field *field= key_part->field; + uint length; + + if (key_part->null_bit) + { + /* The key_part can contain NULL values */ + bool first_is_null= field->is_null(first_diff); + bool sec_is_null= field->is_null(sec_diff); + /* + NULL is smaller than everything so if first is NULL and the other + not then we know that we should return -1 and for the opposite + we should return +1. If both are NULL then we call it equality + although it is a strange form of equality, we have equally little + information about the real value. + */ + if (!first_is_null) + { + if (!sec_is_null) + ; /* Fall through, no NULL fields */ + else + { + DBUG_RETURN(+1); + } + } + else if (!sec_is_null) + { + DBUG_RETURN(-1); + } + else + goto next_loop; /* Both were NULL */ + } + /* + No null values in the fields + We use the virtual method cmp_max with a max length parameter. + For most field types this translates into a cmp without + max length. The exceptions are the BLOB and VARCHAR field types + that take the max length into account. 
+ */ + result= field->cmp_max(field->ptr+first_diff, field->ptr+sec_diff, + key_part->length); +next_loop: + key_part++; + } while (!result && ++i < key_parts); + DBUG_RETURN(result); +} diff --git a/sql/lex.h b/sql/lex.h index aa10328ced0..59ba6a8e15b 100644 --- a/sql/lex.h +++ b/sql/lex.h @@ -274,11 +274,14 @@ static SYMBOL symbols[] = { { "LEAVE", SYM(LEAVE_SYM)}, { "LEAVES", SYM(LEAVES)}, { "LEFT", SYM(LEFT)}, + { "LESS", SYM(LESS_SYM)}, { "LEVEL", SYM(LEVEL_SYM)}, { "LIKE", SYM(LIKE)}, { "LIMIT", SYM(LIMIT)}, + { "LINEAR", SYM(LINEAR_SYM)}, { "LINES", SYM(LINES)}, { "LINESTRING", SYM(LINESTRING)}, + { "LIST", SYM(LIST_SYM)}, { "LOAD", SYM(LOAD)}, { "LOCAL", SYM(LOCAL_SYM)}, { "LOCALTIME", SYM(NOW_SYM)}, @@ -312,6 +315,7 @@ static SYMBOL symbols[] = { { "MAX_ROWS", SYM(MAX_ROWS)}, { "MAX_UPDATES_PER_HOUR", SYM(MAX_UPDATES_PER_HOUR)}, { "MAX_USER_CONNECTIONS", SYM(MAX_USER_CONNECTIONS_SYM)}, + { "MAXVALUE", SYM(MAX_VALUE_SYM)}, { "MEDIUM", SYM(MEDIUM_SYM)}, { "MEDIUMBLOB", SYM(MEDIUMBLOB)}, { "MEDIUMINT", SYM(MEDIUMINT)}, @@ -343,6 +347,7 @@ static SYMBOL symbols[] = { { "NEW", SYM(NEW_SYM)}, { "NEXT", SYM(NEXT_SYM)}, { "NO", SYM(NO_SYM)}, + { "NODEGROUP", SYM(NODEGROUP_SYM)}, { "NONE", SYM(NONE_SYM)}, { "NOT", SYM(NOT_SYM)}, { "NO_WRITE_TO_BINLOG", SYM(NO_WRITE_TO_BINLOG)}, @@ -365,6 +370,10 @@ static SYMBOL symbols[] = { { "OUTFILE", SYM(OUTFILE)}, { "PACK_KEYS", SYM(PACK_KEYS_SYM)}, { "PARTIAL", SYM(PARTIAL)}, +#ifdef HAVE_PARTITION_DB + { "PARTITION", SYM(PARTITION_SYM)}, +#endif + { "PARTITIONS", SYM(PARTITIONS_SYM)}, { "PASSWORD", SYM(PASSWORD)}, { "PHASE", SYM(PHASE_SYM)}, { "POINT", SYM(POINT_SYM)}, @@ -385,6 +394,7 @@ static SYMBOL symbols[] = { { "RAID_CHUNKS", SYM(RAID_CHUNKS)}, { "RAID_CHUNKSIZE", SYM(RAID_CHUNKSIZE)}, { "RAID_TYPE", SYM(RAID_TYPE)}, + { "RANGE", SYM(RANGE_SYM)}, { "READ", SYM(READ_SYM)}, { "READS", SYM(READS_SYM)}, { "REAL", SYM(REAL)}, @@ -476,6 +486,8 @@ static SYMBOL symbols[] = { { "STRING", SYM(STRING_SYM)}, { "STRIPED", 
SYM(RAID_STRIPED_SYM)}, { "SUBJECT", SYM(SUBJECT_SYM)}, + { "SUBPARTITION", SYM(SUBPARTITION_SYM)}, + { "SUBPARTITIONS", SYM(SUBPARTITIONS_SYM)}, { "SUPER", SYM(SUPER_SYM)}, { "SUSPEND", SYM(SUSPEND_SYM)}, { "TABLE", SYM(TABLE_SYM)}, @@ -485,6 +497,7 @@ static SYMBOL symbols[] = { { "TEMPTABLE", SYM(TEMPTABLE_SYM)}, { "TERMINATED", SYM(TERMINATED)}, { "TEXT", SYM(TEXT_SYM)}, + { "THAN", SYM(THAN_SYM)}, { "THEN", SYM(THEN_SYM)}, { "TIME", SYM(TIME_SYM)}, { "TIMESTAMP", SYM(TIMESTAMP)}, diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 83f2903c8fc..91287a8bd40 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -614,6 +614,18 @@ bool check_table_access(THD *thd, ulong want_access, TABLE_LIST *tables, bool no_errors); bool check_global_access(THD *thd, ulong want_access); +/* + General routine to change field->ptr of a NULL-terminated array of Field + objects. Useful when needed to call val_int, val_str or similar and the + field data is not in table->record[0] but in some other structure. + set_key_field_ptr changes all fields of an index using a key_info object. + All methods presume that there is at least one field to change. 
+*/ + +void set_field_ptr(Field **ptr, const byte *new_buf, const byte *old_buf); +void set_key_field_ptr(KEY *key_info, const byte *new_buf, + const byte *old_buf); + bool mysql_backup_table(THD* thd, TABLE_LIST* table_list); bool mysql_restore_table(THD* thd, TABLE_LIST* table_list); @@ -772,6 +784,9 @@ Field * find_field_in_real_table(THD *thd, TABLE *table, const char *name, uint length, bool check_grants, bool allow_rowid, uint *cached_field_index_ptr); +Field * +find_field_in_table_sef(TABLE *table, const char *name); + #ifdef HAVE_OPENSSL #include <openssl/des.h> struct st_des_keyblock @@ -1020,6 +1035,7 @@ bool key_cmp_if_same(TABLE *form,const byte *key,uint index,uint key_length); void key_unpack(String *to,TABLE *form,uint index); bool check_if_key_used(TABLE *table, uint idx, List<Item> &fields); int key_cmp(KEY_PART_INFO *key_part, const byte *key, uint key_length); +int key_rec_cmp(void *key_info, byte *a, byte *b); bool init_errmessage(void); void sql_perror(const char *message); @@ -1188,6 +1204,7 @@ extern SHOW_COMP_OPTION have_query_cache; extern SHOW_COMP_OPTION have_geometry, have_rtree_keys; extern SHOW_COMP_OPTION have_crypt; extern SHOW_COMP_OPTION have_compress; +extern SHOW_COMP_OPTION have_partition_db; #ifndef __WIN__ extern pthread_t signal_thread; @@ -1238,7 +1255,7 @@ bool mysql_create_frm(THD *thd, my_string file_name, uint key_count,KEY *key_info,handler *db_type); int rea_create_table(THD *thd, my_string file_name,HA_CREATE_INFO *create_info, List<create_field> &create_field, - uint key_count,KEY *key_info); + uint key_count,KEY *key_info, handler *file); int format_number(uint inputflag,uint max_length,my_string pos,uint length, my_string *errpos); int openfrm(THD *thd, const char *name,const char *alias,uint filestat, diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 30bb84a3cb4..80670f2a445 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -325,6 +325,7 @@ my_bool opt_ndb_shm, opt_ndb_optimized_node_selection; ulong 
opt_ndb_cache_check_time; const char *opt_ndb_mgmd; ulong opt_ndb_nodeid; +bool opt_ndb_linear_hash; #endif my_bool opt_readonly, use_temp_pool, relay_log_purge; my_bool opt_sync_frm, opt_allow_suspicious_udfs; @@ -430,6 +431,7 @@ CHARSET_INFO *national_charset_info, *table_alias_charset; SHOW_COMP_OPTION have_berkeley_db, have_innodb, have_isam, have_ndbcluster, have_example_db, have_archive_db, have_csv_db; SHOW_COMP_OPTION have_federated_db; +SHOW_COMP_OPTION have_partition_db; SHOW_COMP_OPTION have_raid, have_openssl, have_symlink, have_query_cache; SHOW_COMP_OPTION have_geometry, have_rtree_keys; SHOW_COMP_OPTION have_crypt, have_compress; @@ -4235,6 +4237,7 @@ enum options_mysqld OPT_NDB_FORCE_SEND, OPT_NDB_AUTOINCREMENT_PREFETCH_SZ, OPT_NDB_SHM, OPT_NDB_OPTIMIZED_NODE_SELECTION, OPT_NDB_CACHE_CHECK_TIME, OPT_NDB_MGMD, OPT_NDB_NODEID, + OPT_NDB_LINEAR_HASH, OPT_SKIP_SAFEMALLOC, OPT_TEMP_POOL, OPT_TX_ISOLATION, OPT_COMPLETION_TYPE, OPT_SKIP_STACK_TRACE, OPT_SKIP_SYMLINKS, @@ -4800,6 +4803,16 @@ Disable with --skip-ndbcluster (will save memory).", (gptr*) &global_system_variables.ndb_autoincrement_prefetch_sz, (gptr*) &global_system_variables.ndb_autoincrement_prefetch_sz, 0, GET_ULONG, REQUIRED_ARG, 32, 1, 256, 0, 0, 0}, + {"ndb-use-linear-hash", OPT_NDB_LINEAR_HASH, + "Flag to indicate whether to use linear hash for default in new tables", + (gptr*) &opt_ndb_linear_hash, + (gptr*) &opt_ndb_linear_hash, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, + {"ndb_use_linear_hash", OPT_NDB_LINEAR_HASH, + "Flag to indicate whether to use linear hash for default in new tables", + (gptr*) &opt_ndb_linear_hash, + (gptr*) &opt_ndb_linear_hash, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, {"ndb-force-send", OPT_NDB_FORCE_SEND, "Force send of buffers to ndb immediately without waiting for " "other threads.", @@ -6055,6 +6068,11 @@ static void mysql_init_variables(void) #else have_example_db= SHOW_OPTION_NO; #endif +#ifdef HAVE_PARTITION_DB + have_partition_db= SHOW_OPTION_YES; 
+#else + have_partition_db= SHOW_OPTION_NO; +#endif #ifdef HAVE_ARCHIVE_DB have_archive_db= SHOW_OPTION_YES; #else diff --git a/sql/opt_range.cc b/sql/opt_range.cc index e04e15788fa..c3e73632c6f 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -751,7 +751,7 @@ int QUICK_RANGE_SELECT::init() DBUG_ENTER("QUICK_RANGE_SELECT::init"); if (file->inited == handler::NONE) - DBUG_RETURN(error= file->ha_index_init(index)); + DBUG_RETURN(error= file->ha_index_init(index, 1)); error= 0; DBUG_RETURN(0); } @@ -6049,7 +6049,7 @@ int QUICK_RANGE_SELECT::reset() range= NULL; cur_range= (QUICK_RANGE**) ranges.buffer; - if (file->inited == handler::NONE && (error= file->ha_index_init(index))) + if (file->inited == handler::NONE && (error= file->ha_index_init(index,1))) DBUG_RETURN(error); /* Do not allocate the buffers twice. */ @@ -6308,7 +6308,7 @@ int QUICK_RANGE_SELECT_GEOM::get_next() (byte*) range->min_key, range->min_length, (ha_rkey_function)(range->flag ^ GEOM_FLAG)); - if (result != HA_ERR_KEY_NOT_FOUND) + if (result != HA_ERR_KEY_NOT_FOUND && result != HA_ERR_END_OF_FILE) DBUG_RETURN(result); range=0; // Not found, to next range } @@ -6451,7 +6451,7 @@ int QUICK_SELECT_DESC::get_next() } if (result) { - if (result != HA_ERR_KEY_NOT_FOUND) + if (result != HA_ERR_KEY_NOT_FOUND && result != HA_ERR_END_OF_FILE) DBUG_RETURN(result); range=0; // Not found, to next range continue; @@ -8083,7 +8083,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::reset(void) DBUG_ENTER("QUICK_GROUP_MIN_MAX_SELECT::reset"); file->extra(HA_EXTRA_KEYREAD); /* We need only the key attributes */ - result= file->ha_index_init(index); + result= file->ha_index_init(index, 1); result= file->index_last(record); if (result == HA_ERR_END_OF_FILE) DBUG_RETURN(0); @@ -8159,7 +8159,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() DBUG_ASSERT(is_last_prefix <= 0); if (result == HA_ERR_KEY_NOT_FOUND) continue; - else if (result) + if (result) break; if (have_min) @@ -8189,10 +8189,11 @@ int 
QUICK_GROUP_MIN_MAX_SELECT::get_next() HA_READ_KEY_EXACT); result= have_min ? min_res : have_max ? max_res : result; - } - while (result == HA_ERR_KEY_NOT_FOUND && is_last_prefix != 0); + } while ((result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) && + is_last_prefix != 0); if (result == 0) + { /* Partially mimic the behavior of end_select_send. Copy the field data from Item_field::field into Item_field::result_field @@ -8200,6 +8201,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() other fields in non-ANSI SQL mode). */ copy_fields(&join->tmp_table_param); + } else if (result == HA_ERR_KEY_NOT_FOUND) result= HA_ERR_END_OF_FILE; @@ -8226,6 +8228,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() RETURN 0 on success HA_ERR_KEY_NOT_FOUND if no MIN key was found that fulfills all conditions. + HA_ERR_END_OF_FILE - "" - other if some error occurred */ @@ -8279,7 +8282,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min() if (key_cmp(index_info->key_part, group_prefix, real_prefix_len)) key_restore(record, tmp_record, index_info, 0); } - else if (result == HA_ERR_KEY_NOT_FOUND) + else if (result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) result= 0; /* There is a result in any case. */ } } @@ -8304,6 +8307,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min() RETURN 0 on success HA_ERR_KEY_NOT_FOUND if no MAX key was found that fulfills all conditions. + HA_ERR_END_OF_FILE - "" - other if some error occurred */ @@ -8404,6 +8408,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_prefix() 0 on success HA_ERR_KEY_NOT_FOUND if there is no key with the given prefix in any of the ranges + HA_ERR_END_OF_FILE - "" - other if some error */ @@ -8448,11 +8453,12 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() result= file->index_read(record, group_prefix, search_prefix_len, find_flag); - if ((result == HA_ERR_KEY_NOT_FOUND) && - (cur_range->flag & (EQ_RANGE | NULL_RANGE))) - continue; /* Check the next range. 
*/ - else if (result) + if (result) { + if ((result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) && + (cur_range->flag & (EQ_RANGE | NULL_RANGE))) + continue; /* Check the next range. */ + /* In all other cases (HA_ERR_*, HA_READ_KEY_EXACT with NO_MIN_RANGE, HA_READ_AFTER_KEY, HA_READ_KEY_OR_NEXT) if the lookup failed for this @@ -8479,7 +8485,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() /* Check if record belongs to the current group. */ if (key_cmp(index_info->key_part, group_prefix, real_prefix_len)) { - result = HA_ERR_KEY_NOT_FOUND; + result= HA_ERR_KEY_NOT_FOUND; continue; } @@ -8497,7 +8503,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() if (!((cur_range->flag & NEAR_MAX) && (cmp_res == -1) || (cmp_res <= 0))) { - result = HA_ERR_KEY_NOT_FOUND; + result= HA_ERR_KEY_NOT_FOUND; continue; } } @@ -8536,6 +8542,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() 0 on success HA_ERR_KEY_NOT_FOUND if there is no key with the given prefix in any of the ranges + HA_ERR_END_OF_FILE - "" - other if some error */ @@ -8581,10 +8588,12 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_max_in_range() result= file->index_read(record, group_prefix, search_prefix_len, find_flag); - if ((result == HA_ERR_KEY_NOT_FOUND) && (cur_range->flag & EQ_RANGE)) - continue; /* Check the next range. */ if (result) { + if ((result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) && + (cur_range->flag & EQ_RANGE)) + continue; /* Check the next range. */ + /* In no key was found with this upper bound, there certainly are no keys in the ranges to the left. 
diff --git a/sql/opt_sum.cc b/sql/opt_sum.cc index 33c8eadc065..9802bbddde6 100644 --- a/sql/opt_sum.cc +++ b/sql/opt_sum.cc @@ -181,7 +181,7 @@ int opt_sum_query(TABLE_LIST *tables, List<Item> &all_fields,COND *conds) const_result= 0; break; } - error= table->file->ha_index_init((uint) ref.key); + error= table->file->ha_index_init((uint) ref.key, 1); if (!ref.key_length) error= table->file->index_first(table->record[0]); @@ -253,7 +253,7 @@ int opt_sum_query(TABLE_LIST *tables, List<Item> &all_fields,COND *conds) const_result= 0; break; } - error= table->file->ha_index_init((uint) ref.key); + error= table->file->ha_index_init((uint) ref.key, 1); if (!ref.key_length) error= table->file->index_last(table->record[0]); diff --git a/sql/records.cc b/sql/records.cc index 9b05dc3e291..b3610cf1bbf 100644 --- a/sql/records.cc +++ b/sql/records.cc @@ -31,6 +31,74 @@ static int rr_cmp(uchar *a,uchar *b); /* init struct for read with info->read_record */ +/* + init_read_record is used to scan by using a number of different methods. + Which method to use is set-up in this call so that later calls to + the info->read_record will call the appropriate method using a function + pointer. + + There are five methods that relate completely to the sort function + filesort. The result of a filesort is retrieved using read_record + calls. The other two methods are used for normal table access. + + The filesort will produce references to the records sorted, these + references can be stored in memory or in a temporary file. + + The temporary file is normally used when the references doesn't fit into + a properly sized memory buffer. For most small queries the references + are stored in the memory buffer. + + The temporary file is also used when performing an update where a key is + modified. + + Methods used when ref's are in memory (using rr_from_pointers): + rr_unpack_from_buffer: + ---------------------- + This method is used when table->sort.addon_field is allocated. 
+ This is allocated for most SELECT queries not involving any BLOB's. + In this case the records are fetched from a memory buffer. + rr_from_pointers: + ----------------- + Used when the above is not true, UPDATE, DELETE and so forth and + SELECT's involving BLOB's. It is also used when the addon_field + buffer is not allocated because its size was bigger than the + session variable max_length_for_sort_data. + In this case the record data is fetched from the handler using the + saved reference using the rnd_pos handler call. + + Methods used when ref's are in a temporary file (using rr_from_tempfile) + rr_unpack_from_tempfile: + ------------------------ + Same as rr_unpack_from_buffer except that references are fetched from + temporary file. Should obviously not really happen other than in + strange configurations. + + rr_from_tempfile: + ----------------- + Same as rr_from_pointers except that references are fetched from + temporary file instead of from memory. + rr_from_cache: + -------------- + This is a special variant of rr_from_tempfile that can be used for + handlers that are not using the HA_FAST_KEY_READ table flag. Instead + of reading the references one by one from the temporary file it reads + a set of them, sorts them and reads all of them into a buffer which + is then used for a number of subsequent calls to rr_from_cache. + It is only used for SELECT queries and a number of other conditions + on table size. + + All other accesses use either index access methods (rr_quick) or a full + table scan (rr_sequential). + rr_quick: + --------- + rr_quick uses one of the QUICK_SELECT classes in opt_range.cc to + perform an index scan. There is a lot of functionality hidden + in these quick classes. It handles all index scans of various kinds. + rr_sequential: + -------------- + This is the most basic access method of a table using rnd_init, + rnd_next and rnd_end. No indexes are used.
+*/ void init_read_record(READ_RECORD *info,THD *thd, TABLE *table, SQL_SELECT *select, int use_record_cache, bool print_error) diff --git a/sql/set_var.cc b/sql/set_var.cc index ae7e4bd844b..eb6cdfa37d0 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -786,6 +786,7 @@ struct show_var_st init_vars[]= { {"have_isam", (char*) &have_isam, SHOW_HAVE}, {"have_ndbcluster", (char*) &have_ndbcluster, SHOW_HAVE}, {"have_openssl", (char*) &have_openssl, SHOW_HAVE}, + {"have_partition_engine", (char*) &have_partition_db, SHOW_HAVE}, {"have_query_cache", (char*) &have_query_cache, SHOW_HAVE}, {"have_raid", (char*) &have_raid, SHOW_HAVE}, {"have_rtree_keys", (char*) &have_rtree_keys, SHOW_HAVE}, diff --git a/sql/share/errmsg.txt b/sql/share/errmsg.txt index f999f17aedf..466f1049dc5 100644 --- a/sql/share/errmsg.txt +++ b/sql/share/errmsg.txt @@ -5370,3 +5370,81 @@ ER_SCALE_BIGGER_THAN_PRECISION 42000 S1009 eng "Scale may not be larger than the precision (column '%-.64s')." ER_WRONG_LOCK_OF_SYSTEM_TABLE eng "You can't combine write-locking of system '%-.64s.%-.64s' table with other tables" +ER_PARTITION_REQUIRES_VALUES_ERROR + eng "%s PARTITIONING requires definition of VALUES %s for each partition" + swe "%s PARTITIONering kräver definition av VALUES %s för varje partition" +ER_PARTITION_WRONG_VALUES_ERROR + eng "Only %s PARTITIONING can use VALUES %s in partition definition" + swe "Endast %s partitionering kan använda VALUES %s i definition av partitionen" +ER_PARTITION_MAXVALUE_ERROR + eng "MAXVALUE can only be used in last partition definition" + swe "MAXVALUE kan bara användas i definitionen av den sista partitionen" +ER_PARTITION_SUBPARTITION_ERROR + eng "Subpartitions can only be hash partitions and by key" + swe "Subpartitioner kan bara vara hash och key partitioner" +ER_PARTITION_WRONG_NO_PART_ERROR + eng "Wrong number of partitions defined, mismatch with previous setting" + swe "Antal partitioner definierade och antal partitioner är inte lika" 
+ER_PARTITION_WRONG_NO_SUBPART_ERROR + eng "Wrong number of subpartitions defined, mismatch with previous setting" + swe "Antal subpartitioner definierade och antal subpartitioner är inte lika" +ER_CONST_EXPR_IN_PARTITION_FUNC_ERROR + eng "Constant/Random expression in (sub)partitioning function is not allowed" + swe "Konstanta uttryck eller slumpmässiga uttryck är inte tillåtna (sub)partitioneringsfunktioner" +ER_NO_CONST_EXPR_IN_RANGE_OR_LIST_ERROR + eng "Expression in RANGE/LIST VALUES must be constant" + swe "Uttryck i RANGE/LIST VALUES måste vara ett konstant uttryck" +ER_FIELD_NOT_FOUND_PART_ERROR + eng "Field in list of fields for partition function not found in table" + swe "Fält i listan av fält för partitionering med key inte funnen i tabellen" +ER_LIST_OF_FIELDS_ONLY_IN_HASH_ERROR + eng "List of fields is only allowed in KEY partitions" + swe "En lista av fält är endast tillåtet för KEY partitioner" +ER_INCONSISTENT_PARTITION_INFO_ERROR + eng "The partition info in the frm file is not consistent with what can be written into the frm file" + swe "Partitioneringsinformationen i frm-filen är inte konsistent med vad som kan skrivas i frm-filen" +ER_PARTITION_FUNC_NOT_ALLOWED_ERROR + eng "The %s function returns the wrong type" + swe "%s-funktionen returnerar felaktig typ" +ER_PARTITIONS_MUST_BE_DEFINED_ERROR + eng "For %s partitions each partition must be defined" + swe "För %s partitionering så måste varje partition definieras" +ER_RANGE_NOT_INCREASING_ERROR + eng "VALUES LESS THAN value must be strictly increasing for each partition" + swe "Värden i VALUES LESS THAN måste vara strikt växande för varje partition" +ER_INCONSISTENT_TYPE_OF_FUNCTIONS_ERROR + eng "VALUES %s value must be of same type as partition function" + swe "Värden i VALUES %s måste vara av samma typ som partitioneringsfunktionen" +ER_MULTIPLE_DEF_CONST_IN_LIST_PART_ERROR + eng "Multiple definition of same constant in list partitioning" + swe "Multipel definition av samma konstant i list 
partitionering" +ER_PARTITION_ENTRY_ERROR + eng "Partitioning can not be used stand-alone in query" + swe "Partitioneringssyntax kan inte användas på egen hand i en SQL-fråga" +ER_MIX_HANDLER_ERROR + eng "The mix of handlers in the partitions is not allowed in this version in MySQL" + swe "Denna mix av lagringsmotorer är inte tillåten i denna version av MySQL" +ER_PARTITION_NOT_DEFINED_ERROR + eng "For the partitioned engine it is necessary to define all %s" + swe "För partitioneringsmotorn så är det nödvändigt att definiera alla %s" +ER_TOO_MANY_PARTITIONS_ERROR + eng "Too many partitions were defined" + swe "För många partitioner definierades" +ER_SUBPARTITION_ERROR + eng "It is only possible to mix RANGE/LIST partitioning with HASH/KEY partitioning for subpartitioning" + swe "Det är endast möjligt att blanda RANGE/LIST partitionering med HASH/KEY partitionering för subpartitionering" +ER_CANT_CREATE_HANDLER_FILE + eng "Failed to create specific handler file" + swe "Misslyckades med att skapa specifik fil i lagringsmotor" +ER_BLOB_FIELD_IN_PART_FUNC_ERROR + eng "A BLOB field is not allowed in partition function" + swe "Ett BLOB-fält är inte tillåtet i partitioneringsfunktioner" +ER_CHAR_SET_IN_PART_FIELD_ERROR + eng "VARCHAR only allowed if binary collation for partition functions" + swe "VARCHAR endast tillåten med binär collation för partitioneringsfunktion" +ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF + eng "A %s need to include all fields in the partition function" + swe "En %s behöver inkludera alla fält i partitioneringsfunktionen för denna lagringsmotor" +ER_NO_PARTS_ERROR + eng "Number of %s = 0 is not an allowed value" + swe "Antal %s = 0 är inte ett tillåten värde" diff --git a/sql/sp.cc b/sql/sp.cc index 55087f47f5e..cf381762bac 100644 --- a/sql/sp.cc +++ b/sql/sp.cc @@ -799,7 +799,7 @@ db_show_routine_status(THD *thd, int type, const char *wild) } } - table->file->ha_index_init(0); + table->file->ha_index_init(0, 1); if ((res= 
table->file->index_first(table->record[0]))) { res= (res == HA_ERR_END_OF_FILE) ? 0 : SP_INTERNAL_ERROR; @@ -849,7 +849,7 @@ sp_drop_db_routines(THD *thd, char *db) goto err; ret= SP_OK; - table->file->ha_index_init(0); + table->file->ha_index_init(0, 1); if (! table->file->index_read(table->record[0], key, keylen, HA_READ_KEY_EXACT)) { diff --git a/sql/sql_acl.cc b/sql/sql_acl.cc index 315103aa8b1..fdb7f7f069c 100644 --- a/sql/sql_acl.cc +++ b/sql/sql_acl.cc @@ -2048,7 +2048,7 @@ GRANT_TABLE::GRANT_TABLE(TABLE *form, TABLE *col_privs) key_copy(key, col_privs->record[0], col_privs->key_info, key_prefix_len); col_privs->field[4]->store("",0, &my_charset_latin1); - col_privs->file->ha_index_init(0); + col_privs->file->ha_index_init(0, 1); if (col_privs->file->index_read(col_privs->record[0], (byte*) key, key_prefix_len, HA_READ_KEY_EXACT)) @@ -2193,7 +2193,7 @@ static int replace_column_table(GRANT_TABLE *g_t, List_iterator <LEX_COLUMN> iter(columns); class LEX_COLUMN *column; - table->file->ha_index_init(0); + table->file->ha_index_init(0, 1); while ((column= iter++)) { ulong privileges= column->rights; @@ -3168,8 +3168,8 @@ my_bool grant_init(THD *org_thd) t_table = tables[0].table; c_table = tables[1].table; p_table= tables[2].table; - t_table->file->ha_index_init(0); - p_table->file->ha_index_init(0); + t_table->file->ha_index_init(0, 1); + p_table->file->ha_index_init(0, 1); if (!t_table->file->index_first(t_table->record[0])) { /* Will be restored by org_thd->store_globals() */ @@ -4473,7 +4473,7 @@ static int handle_grant_table(TABLE_LIST *tables, uint table_no, bool drop, user_key, key_prefix_length, HA_READ_KEY_EXACT))) { - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) { table->file->print_error(error, MYF(0)); result= -1; diff --git a/sql/sql_base.cc b/sql/sql_base.cc index 0013d478600..5d6dd29b998 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -2561,6 +2561,42 @@ find_field_in_table(THD *thd, 
TABLE_LIST *table_list, /* + Find field in table, no side effects, only purpose is to check for field + in table object and get reference to the field if found. + + SYNOPSIS + find_field_in_table_sef() + + table table where to find + name Name of field searched for + + RETURN + 0 field is not found + # pointer to field +*/ + +Field *find_field_in_table_sef(TABLE *table, const char *name) +{ + Field **field_ptr; + if (table->s->name_hash.records) + field_ptr= (Field**)hash_search(&table->s->name_hash,(byte*) name, + strlen(name)); + else + { + if (!(field_ptr= table->field)) + return (Field *)0; + for (; *field_ptr; ++field_ptr) + if (!my_strcasecmp(system_charset_info, (*field_ptr)->field_name, name)) + break; + } + if (field_ptr) + return *field_ptr; + else + return (Field *)0; +} + + +/* Find field in table SYNOPSIS @@ -2623,13 +2659,16 @@ Field *find_field_in_real_table(THD *thd, TABLE *table, (bool)(thd->set_query_id-1)); if (field->query_id != thd->query_id) { + if (table->get_fields_in_item_tree) + field->flags|= GET_FIXED_FIELDS_FLAG; field->query_id=thd->query_id; table->used_fields++; table->used_keys.intersect(field->part_of_key); } else thd->dupp_field=field; - } + } else if (table->get_fields_in_item_tree) + field->flags|= GET_FIXED_FIELDS_FLAG; #ifndef NO_EMBEDDED_ACCESS_CHECKS if (check_grants && check_grant_column(thd, &table->grant, table->s->db, diff --git a/sql/sql_handler.cc b/sql/sql_handler.cc index e109600bcd0..84087db9719 100644 --- a/sql/sql_handler.cc +++ b/sql/sql_handler.cc @@ -461,7 +461,7 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables, if (keyname) { table->file->ha_index_or_rnd_end(); - table->file->ha_index_init(keyno); + table->file->ha_index_init(keyno, 1); error= table->file->index_first(table->record[0]); } else @@ -483,7 +483,7 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables, case RLAST: DBUG_ASSERT(keyname != 0); table->file->ha_index_or_rnd_end(); - table->file->ha_index_init(keyno); + table->file->ha_index_init(keyno, 
1); error= table->file->index_last(table->record[0]); mode=RPREV; break; @@ -522,7 +522,7 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables, if (!(key= (byte*) thd->calloc(ALIGN_SIZE(key_len)))) goto err; table->file->ha_index_or_rnd_end(); - table->file->ha_index_init(keyno); + table->file->ha_index_init(keyno, 1); key_copy(key, table->record[0], table->key_info + keyno, key_len); error= table->file->index_read(table->record[0], key,key_len,ha_rkey_mode); diff --git a/sql/sql_help.cc b/sql/sql_help.cc index 6780beec258..11045529a51 100644 --- a/sql/sql_help.cc +++ b/sql/sql_help.cc @@ -286,8 +286,8 @@ int get_topics_for_keyword(THD *thd, TABLE *topics, TABLE *relations, rtopic_id= find_fields[help_relation_help_topic_id].field; rkey_id= find_fields[help_relation_help_keyword_id].field; - topics->file->ha_index_init(iindex_topic); - relations->file->ha_index_init(iindex_relations); + topics->file->ha_index_init(iindex_topic,1); + relations->file->ha_index_init(iindex_relations,1); rkey_id->store((longlong) key_id); rkey_id->get_key_image(buff, rkey_id->pack_length(), Field::itRAW); diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index 630a7e950f7..73aaecd39aa 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -155,6 +155,7 @@ void lex_start(THD *thd, uchar *buf,uint length) lex->yylineno = 1; lex->in_comment=0; lex->length=0; + lex->part_info= 0; lex->select_lex.in_sum_expr=0; lex->select_lex.expr_list.empty(); lex->select_lex.ftfunc_list_alloc.empty(); diff --git a/sql/sql_lex.h b/sql/sql_lex.h index 45c8182a29c..edcf4db09a4 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -25,6 +25,7 @@ class sp_head; class sp_name; class sp_instr; class sp_pcontext; +class partition_info; /* The following hack is needed because mysql_yacc.cc does not define @@ -721,6 +722,8 @@ typedef struct st_lex TABLE_LIST **query_tables_last; /* store original leaf_tables for INSERT SELECT and PS/SP */ TABLE_LIST *leaf_tables_insert; + /* Partition info structure filled in by PARTITION BY 
parse part */ + partition_info *part_info; List<key_part_spec> col_list; List<key_part_spec> ref_list; diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc new file mode 100644 index 00000000000..ffdf53ed287 --- /dev/null +++ b/sql/sql_partition.cc @@ -0,0 +1,3117 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + This file was introduced as a container for general functionality related + to partitioning introduced in MySQL version 5.1. It contains functionality + used by all handlers that support partitioning, which in the first version + is the partitioning handler itself and the NDB handler. + + The first version was written by Mikael Ronström. + + This version supports RANGE partitioning, LIST partitioning, HASH + partitioning and composite partitioning (hereafter called subpartitioning) + where each RANGE/LIST partitioning is HASH partitioned. The hash function + can either be supplied by the user or by only a list of fields (also + called KEY partitioning), where the MySQL server will use an internal + hash function. + There are quite a few defaults that can be used as well.
+*/ + +/* Some general useful functions */ + +#include "mysql_priv.h" +#include <errno.h> +#include <m_ctype.h> +#include "md5.h" + + +#ifdef HAVE_PARTITION_DB +/* + Partition related functions declarations and some static constants; +*/ +static char *hash_str= "HASH"; +static char *range_str= "RANGE"; +static char *list_str= "LIST"; +static char *part_str= "PARTITION"; +static char *sub_str= "SUB"; +static char *by_str= "BY"; +static char *key_str= "KEY"; +static char *space_str= " "; +static char *equal_str= "="; +static char *end_paren_str= ")"; +static char *begin_paren_str= "("; +static char *comma_str= ","; +static char buff[22]; + +bool get_partition_id_list(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_hash_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_key_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_linear_hash_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_linear_key_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_key(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_linear_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_linear_key(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_key(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_linear_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_linear_key(partition_info *part_info, + uint32 *part_id); +uint32 get_partition_id_hash_sub(partition_info *part_info); +uint32 get_partition_id_key_sub(partition_info *part_info); +uint32 
get_partition_id_linear_hash_sub(partition_info *part_info); +uint32 get_partition_id_linear_key_sub(partition_info *part_info); + +/* + A useful routine used by update_row for partition handlers to calculate + the partition ids of the old and the new record. + SYNOPSIS + get_parts_for_update() + old_data Buffer of old record + new_data Buffer of new record + rec0 Reference to table->record[0] + part_info Reference to partition information + part_field_array A NULL-terminated array of fields for partition + function + old_part_id The returned partition id of old record + new_part_id The returned partition id of new record + RETURN VALUE + 0 Success + > 0 Error code + DESCRIPTION + Dependent on whether buf is not record[0] we need to prepare the + fields. Then we call the function pointer get_partition_id to + calculate the partition ids. +*/ + +int get_parts_for_update(const byte *old_data, byte *new_data, + const byte *rec0, partition_info *part_info, + uint32 *old_part_id, uint32 *new_part_id) +{ + Field **part_field_array= part_info->full_part_field_array; + int error; + DBUG_ENTER("get_parts_for_update"); + DBUG_ASSERT(new_data == rec0); + + set_field_ptr(part_field_array, old_data, rec0); + error= part_info->get_partition_id(part_info, old_part_id); + set_field_ptr(part_field_array, rec0, old_data); + if (unlikely(error)) // Should never happen + { + DBUG_ASSERT(0); + DBUG_RETURN(error); + } +#ifdef NOT_NEEDED + if (new_data == rec0) +#endif + { + if (unlikely(error= part_info->get_partition_id(part_info,new_part_id))) + { + DBUG_RETURN(error); + } + } +#ifdef NOT_NEEDED + else + { + /* + This branch should never execute but it is written anyway for + future use. It will be tested by ensuring that the above + condition is false in one test situation before pushing the code.
+ */ + set_field_ptr(part_field_array, new_data, rec0); + error= part_info->get_partition_id(part_info, new_part_id); + set_field_ptr(part_field_array, rec0, new_data); + if (unlikely(error)) + { + DBUG_RETURN(error); + } + } +#endif + DBUG_RETURN(0); +} + + +/* + A useful routine used by delete_row for partition handlers to calculate + the partition id. + SYNOPSIS + get_part_for_delete() + buf Buffer of old record + rec0 Reference to table->record[0] + part_info Reference to partition information; its + full_part_field_array supplies the fields of the + partition function + part_id The returned partition id to delete from + RETURN VALUE + 0 Success + > 0 Error code + DESCRIPTION + When buf is record[0] the function pointer get_partition_id is called + directly. Otherwise set_field_ptr is first called on the partition + fields with buf, get_partition_id is called, and set_field_ptr is + called again with rec0 before any error is returned. +*/ + +int get_part_for_delete(const byte *buf, const byte *rec0, + partition_info *part_info, uint32 *part_id) +{ + int error; + DBUG_ENTER("get_part_for_delete"); + + if (likely(buf == rec0)) + { + if (unlikely((error= part_info->get_partition_id(part_info, part_id)))) + { + DBUG_RETURN(error); + } + DBUG_PRINT("info", ("Delete from partition %d", *part_id)); + } + else + { + Field **part_field_array= part_info->full_part_field_array; + set_field_ptr(part_field_array, buf, rec0); + error= part_info->get_partition_id(part_info, part_id); + set_field_ptr(part_field_array, rec0, buf); + if (unlikely(error)) + { + DBUG_RETURN(error); + } + DBUG_PRINT("info", ("Delete from partition %d (path2)", *part_id)); + } + DBUG_RETURN(0); +} + + +/* + This routine allocates an array for all range constants to achieve a fast + check what partition a certain value belongs to. At the same time it does + also check that the range constants are defined in increasing order and + that the expressions are constant integer expressions.
+ SYNOPSIS + check_range_constants() + part_info + RETURN VALUE + TRUE An error occurred during creation of range constants + FALSE Successful creation of range constant mapping + DESCRIPTION + This routine is called from check_partition_info to get a quick error + before we came too far into the CREATE TABLE process. It is also called + from fix_partition_func every time we open the .frm file. It is only + called for RANGE PARTITIONed tables. +*/ + +static bool check_range_constants(partition_info *part_info) +{ + partition_element* part_def; + longlong current_largest_int= LONGLONG_MIN, part_range_value_int; + uint no_parts= part_info->no_parts, i; + List_iterator<partition_element> it(part_info->partitions); + bool result= TRUE; + DBUG_ENTER("check_range_constants"); + DBUG_PRINT("enter", ("INT_RESULT with %d parts", no_parts)); + + part_info->part_result_type= INT_RESULT; + part_info->range_int_array= + (longlong*)sql_alloc(no_parts * sizeof(longlong)); + if (unlikely(part_info->range_int_array == NULL)) + { + my_error(ER_OUTOFMEMORY, MYF(0), no_parts*sizeof(longlong)); + goto end; + } + i= 0; + do + { + part_def= it++; + if ((i != (no_parts - 1)) || !part_info->defined_max_value) + { + if (likely(part_def->range_expr->result_type() == INT_RESULT)) + part_range_value_int= part_def->range_expr->val_int(); + else + { + my_error(ER_INCONSISTENT_TYPE_OF_FUNCTIONS_ERROR, MYF(0), + "LESS THAN"); + goto end; + } + } + else + part_range_value_int= LONGLONG_MAX; + if (likely(current_largest_int < part_range_value_int)) + { + current_largest_int= part_range_value_int; + part_info->range_int_array[i]= part_range_value_int; + } + else + { + my_error(ER_RANGE_NOT_INCREASING_ERROR, MYF(0)); + goto end; + } + } while (++i < no_parts); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + A support routine for check_list_constants used by qsort to sort the + constant list expressions. 
+ SYNOPSIS + list_part_cmp() + a First list constant to compare with + b Second list constant to compare with + RETURN VALUE + +1 a > b + 0 a == b + -1 a < b +*/ + +static int list_part_cmp(const void* a, const void* b) +{ + longlong a1, b1; + a1= ((LIST_PART_ENTRY*)a)->list_value; + b1= ((LIST_PART_ENTRY*)b)->list_value; + if (a1 < b1) + return -1; + else if (a1 > b1) + return +1; + else + return 0; +} + + +/* + This routine allocates an array for all list constants to achieve a fast + check of which partition a certain value belongs to. At the same time it + also checks that there are no duplicates among the list constants and + that the list expressions are constant integer expressions. + SYNOPSIS + check_list_constants() + part_info Reference to partition information + RETURN VALUE + TRUE An error occurred during creation of list constants + FALSE Successful creation of list constant mapping + DESCRIPTION + This routine is called from check_partition_info to get a quick error + before we get too far into the CREATE TABLE process. It is also called + from fix_partition_func every time we open the .frm file. It is only + called for LIST PARTITIONed tables. +*/ + +static bool check_list_constants(partition_info *part_info) +{ + uint i, no_list_values= 0, no_parts, list_index= 0; + Item *list_expr; + bool not_first, result= TRUE; + longlong curr_value, prev_value; + partition_element* part_def; + List_iterator<partition_element> list_func_it(part_info->partitions); + DBUG_ENTER("check_list_constants"); + + part_info->part_result_type= INT_RESULT; + + /* + We begin by calculating the number of list values that have been + defined in the first step. + + We use this number to allocate a properly sized array of structs + to keep the partition id and the value to use in that partition. + In the second traversal we check that all Item trees are of the + same type (INT_RESULT) and assign them values in the struct array.
+ + Finally we sort the array of structs in order of values to enable + a quick binary search for the proper value to discover the + partition id. + After sorting the array we check that there are no duplicates in the + list. + */ + + no_parts= part_info->no_parts; + i= 0; + do + { + part_def= list_func_it++; + List_iterator<Item> list_val_it1(part_def->list_expr_list); + while (list_val_it1++) + no_list_values++; + } while (++i < no_parts); + list_func_it.rewind(); + part_info->no_list_values= no_list_values; + part_info->list_array= + (LIST_PART_ENTRY*)sql_alloc(no_list_values*sizeof(LIST_PART_ENTRY)); + if (unlikely(part_info->list_array == NULL)) + { + my_error(ER_OUTOFMEMORY, MYF(0), no_list_values*sizeof(LIST_PART_ENTRY)); + goto end; + } + + i= 0; + do + { + part_def= list_func_it++; + List_iterator<Item> list_val_it2(part_def->list_expr_list); + while ((list_expr= list_val_it2++)) + { + if (likely(list_expr->result_type() == INT_RESULT)) + { + part_info->list_array[list_index].list_value= list_expr->val_int(); + part_info->list_array[list_index++].partition_id= i; + } + else + { + my_error(ER_INCONSISTENT_TYPE_OF_FUNCTIONS_ERROR, MYF(0), "IN"); + goto end; + } + } + } while (++i < no_parts); + + qsort((void*)part_info->list_array, no_list_values, + sizeof(LIST_PART_ENTRY), &list_part_cmp); + + not_first= FALSE; + i= prev_value= 0; //prev_value initialised to quiet compiler + do + { + curr_value= part_info->list_array[i].list_value; + if (likely(!not_first || prev_value != curr_value)) + { + prev_value= curr_value; + not_first= TRUE; + } + else + { + my_error(ER_MULTIPLE_DEF_CONST_IN_LIST_PART_ERROR, MYF(0)); + goto end; + } + } while (++i < no_list_values); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + Create a memory area where default partition names are stored and fill it + up with the names. 
+ SYNOPSIS + create_default_partition_names() + no_parts Number of partitions + subpart Is it subpartitions + RETURN VALUE + A pointer to the memory area of the default partition names + DESCRIPTION + A support routine for the partition code where default values are + generated. + The external routine needing this code is check_partition_info +*/ + +#define MAX_PART_NAME_SIZE 8 + +static char *create_default_partition_names(uint no_parts, bool subpart) +{ + char *ptr= sql_calloc(no_parts*MAX_PART_NAME_SIZE); + char *move_ptr= ptr; + uint i= 0; + DBUG_ENTER("create_default_partition_names"); + if (likely(ptr != 0)) + { + do + { + if (subpart) + my_sprintf(move_ptr, (move_ptr,"sp%u", i)); + else + my_sprintf(move_ptr, (move_ptr,"p%u", i)); + move_ptr+=MAX_PART_NAME_SIZE; + } while (++i < no_parts); + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), no_parts*MAX_PART_NAME_SIZE); + } + DBUG_RETURN(ptr); +} + + +/* + Set up all the default partitions not set-up by the user in the SQL + statement. Also perform a number of checks that the user hasn't tried + to use default values where no defaults exists. + SYNOPSIS + set_up_default_partitions() + part_info The reference to all partition information + file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, attempted default values not possible + FALSE Ok, default partitions set-up + DESCRIPTION + The routine uses the underlying handler of the partitioning to define + the default number of partitions. For some handlers this requires + knowledge of the maximum number of rows to be stored in the table. + This routine only accepts HASH and KEY partitioning and thus there is + no subpartitioning if this routine is successful. 
+ The external routine needing this code is check_partition_info +*/ + +static bool set_up_default_partitions(partition_info *part_info, + handler *file, ulonglong max_rows) +{ + uint no_parts, i; + char *default_name; + bool result= TRUE; + DBUG_ENTER("set_up_default_partitions"); + + if (part_info->part_type != HASH_PARTITION) + { + char *error_string; + if (part_info->part_type == RANGE_PARTITION) + error_string= range_str; + else + error_string= list_str; + my_error(ER_PARTITIONS_MUST_BE_DEFINED_ERROR, MYF(0), error_string); + goto end; + } + if (part_info->no_parts == 0) + part_info->no_parts= file->get_default_no_partitions(max_rows); + no_parts= part_info->no_parts; + if (unlikely(no_parts > MAX_PARTITIONS)) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + goto end; + } + if (unlikely((!(default_name= create_default_partition_names(no_parts, + FALSE))))) + goto end; + i= 0; + do + { + partition_element *part_elem= new partition_element(); + if (likely(part_elem != 0)) + { + part_elem->engine_type= DB_TYPE_UNKNOWN; + part_elem->partition_name= default_name; + default_name+=MAX_PART_NAME_SIZE; + part_info->partitions.push_back(part_elem); + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + goto end; + } + } while (++i < no_parts); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + Set up all the default subpartitions not set-up by the user in the SQL + statement. Also perform a number of checks that the default partitioning + becomes an allowed partitioning scheme. + SYNOPSIS + set_up_default_subpartitions() + part_info The reference to all partition information + file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, attempted default values not possible + FALSE Ok, default partitions set-up + DESCRIPTION + The routine uses the underlying handler of the partitioning to define + the default number of partitions. 
For some handlers this requires
    knowledge of the maximum number of rows to be stored in the table.
    This routine is only called for RANGE or LIST partitioning and those
    need to be specified so only subpartitions are specified.
    Every partition gets the same set of default subpartition names, the
    names are generated once for no_subparts subpartitions and reused for
    each partition.
    The external routine needing this code is check_partition_info
*/

static bool set_up_default_subpartitions(partition_info *part_info,
                                         handler *file, ulonglong max_rows)
{
  uint i, j, no_parts, no_subparts;
  char *default_name, *name_ptr;
  partition_element *part_elem;
  List_iterator<partition_element> part_it(part_info->partitions);
  DBUG_ENTER("set_up_default_subpartitions");

  if (part_info->no_subparts == 0)
    part_info->no_subparts= file->get_default_no_partitions(max_rows);
  no_parts= part_info->no_parts;
  no_subparts= part_info->no_subparts;
  if (unlikely((no_parts * no_subparts) > MAX_PARTITIONS))
  {
    my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0));
    DBUG_RETURN(TRUE);
  }
  if (unlikely((!(default_name=
                  create_default_partition_names(no_subparts, TRUE)))))
    DBUG_RETURN(TRUE);

  i= 0;
  do
  {
    part_elem= part_it++;
    /*
      Restart both the subpartition counter and the name pointer for each
      partition. The name area only holds no_subparts names, so without the
      restart only the first partition would get all its subpartitions and
      the name pointer would move outside of the allocated area.
    */
    j= 0;
    name_ptr= default_name;
    do
    {
      partition_element *subpart_elem= new partition_element();
      if (unlikely(subpart_elem == 0))
      {
        my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element));
        DBUG_RETURN(TRUE);
      }
      subpart_elem->engine_type= DB_TYPE_UNKNOWN;
      subpart_elem->partition_name= name_ptr;
      name_ptr+= MAX_PART_NAME_SIZE;
      part_elem->subpartitions.push_back(subpart_elem);
    } while (++j < no_subparts);
  } while (++i < no_parts);
  DBUG_RETURN(FALSE);
}


/*
  Set up defaults for partition or subpartition (cannot set-up for both,
  this will return an error).
+ SYNOPSIS + set_up_defaults_for_partitioning() + part_info The reference to all partition information + file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, attempted default values not possible + FALSE Ok, default partitions set-up + DESCRIPTION + Support routine for check_partition_info +*/ + +static bool set_up_defaults_for_partitioning(partition_info *part_info, + handler *file, + ulonglong max_rows) +{ + DBUG_ENTER("set_up_defaults_for_partitioning"); + + if (part_info->use_default_partitions) + DBUG_RETURN(set_up_default_partitions(part_info, file, max_rows)); + if (is_sub_partitioned(part_info) && part_info->use_default_subpartitions) + DBUG_RETURN(set_up_default_subpartitions(part_info, file, max_rows)); + DBUG_RETURN(FALSE); +} + + +/* + Check that all partitions use the same storage engine. + This is currently a limitation in this version. + SYNOPSIS + check_engine_mix() + engine_array An array of engine identifiers + no_parts Total number of partitions + RETURN VALUE + TRUE Error, mixed engines + FALSE Ok, no mixed engines +*/ + +static bool check_engine_mix(u_char *engine_array, uint no_parts) +{ + /* + Current check verifies only that all handlers are the same. + Later this check will be more sophisticated. + */ + uint i= 0; + bool result= FALSE; + DBUG_ENTER("check_engine_mix"); + + do + { + if (engine_array[i] != engine_array[0]) + { + result= TRUE; + break; + } + } while (++i < no_parts); + DBUG_RETURN(result); +} + + +/* + We will check that the partition info requested is possible to set-up in + this version. This routine is an extension of the parser one could say. + If defaults were used we will generate default data structures for all + partitions. + SYNOPSIS + check_partition_info() + part_info The reference to all partition information + db_type Default storage engine if no engine specified per + partition. 
+ file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, something went wrong + FALSE Ok, full partition data structures are now generated + DESCRIPTION + This code is used early in the CREATE TABLE and ALTER TABLE process. +*/ + +bool check_partition_info(partition_info *part_info,enum db_type eng_type, + handler *file, ulonglong max_rows) +{ + u_char *engine_array= NULL; + uint part_count= 0, i, no_parts, tot_partitions; + bool result= TRUE; + List_iterator<partition_element> part_it(part_info->partitions); + DBUG_ENTER("check_partition_info"); + + if (unlikely(is_sub_partitioned(part_info) && + (!(part_info->part_type == RANGE_PARTITION || + part_info->part_type == LIST_PARTITION)))) + { + /* Only RANGE and LIST partitioning can be subpartitioned */ + my_error(ER_SUBPARTITION_ERROR, MYF(0)); + goto end; + } + if (unlikely(set_up_defaults_for_partitioning(part_info, file, max_rows))) + goto end; + tot_partitions= get_tot_partitions(part_info); + if (unlikely(tot_partitions > MAX_PARTITIONS)) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + goto end; + } + engine_array= (u_char*)my_malloc(tot_partitions, MYF(MY_WME)); + if (unlikely(!engine_array)) + goto end; + i= 0; + no_parts= part_info->no_parts; + do + { + partition_element *part_elem= part_it++; + if (!is_sub_partitioned(part_info)) + { + if (part_elem->engine_type == DB_TYPE_UNKNOWN) + part_elem->engine_type= eng_type; + DBUG_PRINT("info", ("engine = %u",(uint)part_elem->engine_type)); + engine_array[part_count++]= (u_char)part_elem->engine_type; + } + else + { + uint j= 0, no_subparts= part_info->no_subparts;; + List_iterator<partition_element> sub_it(part_elem->subpartitions); + do + { + part_elem= sub_it++; + if (part_elem->engine_type == DB_TYPE_UNKNOWN) + part_elem->engine_type= eng_type; + DBUG_PRINT("info", ("engine = %u",(uint)part_elem->engine_type)); + engine_array[part_count++]= (u_char)part_elem->engine_type; + } while 
(++j < no_subparts); + } + } while (++i < part_info->no_parts); + if (unlikely(check_engine_mix(engine_array, part_count))) + { + my_error(ER_MIX_HANDLER_ERROR, MYF(0)); + goto end; + } + + /* + We need to check all constant expressions that they are of the correct + type and that they are increasing for ranges and not overlapping for + list constants. + */ + + if (unlikely((part_info->part_type == RANGE_PARTITION && + check_range_constants(part_info)) || + (part_info->part_type == LIST_PARTITION && + check_list_constants(part_info)))) + goto end; + result= FALSE; +end: + my_free((char*)engine_array,MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(result); +} + + +/* + A great number of functions below here is part of the fix_partition_func + method. It is used to set up the partition structures for execution from + openfrm. It is called at the end of the openfrm when the table struct has + been set-up apart from the partition information. + It involves: + 1) Setting arrays of fields for the partition functions. + 2) Setting up binary search array for LIST partitioning + 3) Setting up array for binary search for RANGE partitioning + 4) Setting up key_map's to assist in quick evaluation whether one + can deduce anything from a given index of what partition to use + 5) Checking whether a set of partitions can be derived from a range on + a field in the partition function. + As part of doing this there is also a great number of error controls. + This is actually the place where most of the things are checked for + partition information when creating a table. + Things that are checked includes + 1) No NULLable fields in partition function + 2) All fields of partition function in Primary keys and unique indexes + (if not supported) + 3) No fields in partition function that are BLOB's or VARCHAR with a + collation other than the binary collation. + + + + Create an array of partition fields (NULL terminated). 
Before this method + is called fix_fields or find_table_in_sef has been called to set + GET_FIXED_FIELDS_FLAG on all fields that are part of the partition + function. + SYNOPSIS + set_up_field_array() + table TABLE object for which partition fields are set-up + sub_part Is the table subpartitioned as well + RETURN VALUE + TRUE Error, some field didn't meet requirements + FALSE Ok, partition field array set-up + DESCRIPTION + This method is used to set-up both partition and subpartitioning + field array and used for all types of partitioning. + It is part of the logic around fix_partition_func. +*/ +static bool set_up_field_array(TABLE *table, + bool sub_part) +{ + Field **ptr, *field, **field_array; + uint no_fields= 0, size_field_array, i= 0; + partition_info *part_info= table->s->part_info; + int result= FALSE; + DBUG_ENTER("set_up_field_array"); + + ptr= table->field; + while ((field= *(ptr++))) + { + if (field->flags & GET_FIXED_FIELDS_FLAG) + no_fields++; + } + size_field_array= (no_fields+1)*sizeof(Field*); + field_array= (Field**)sql_alloc(size_field_array); + if (unlikely(!field_array)) + { + my_error(ER_OUTOFMEMORY, MYF(0), size_field_array); + result= TRUE; + } + ptr= table->field; + while ((field= *(ptr++))) + { + if (field->flags & GET_FIXED_FIELDS_FLAG) + { + field->flags&= ~GET_FIXED_FIELDS_FLAG; + field->flags|= FIELD_IN_PART_FUNC_FLAG; + if (likely(!result)) + { + field_array[i++]= field; + + /* + We check that the fields are proper. It is required for each + field in a partition function to: + 1) Not be a BLOB of any type + A BLOB takes too long time to evaluate so we don't want it for + performance reasons. + 2) Not be a VARCHAR other than VARCHAR with a binary collation + A VARCHAR with character sets can have several values being + equal with different number of spaces or NULL's. This is not a + good ground for a safe and exact partition function. Thus it is + not allowed in partition functions. 
+ */ + + if (unlikely(field->flags & BLOB_FLAG)) + { + my_error(ER_BLOB_FIELD_IN_PART_FUNC_ERROR, MYF(0)); + result= TRUE; + } + else if (unlikely((!field->flags & BINARY_FLAG) && + field->real_type() == MYSQL_TYPE_VARCHAR)) + { + my_error(ER_CHAR_SET_IN_PART_FIELD_ERROR, MYF(0)); + result= TRUE; + } + } + } + } + field_array[no_fields]= 0; + if (!sub_part) + { + part_info->part_field_array= field_array; + part_info->no_part_fields= no_fields; + } + else + { + part_info->subpart_field_array= field_array; + part_info->no_subpart_fields= no_fields; + } + DBUG_RETURN(result); +} + + +/* + Create a field array including all fields of both the partitioning and the + subpartitioning functions. + SYNOPSIS + create_full_part_field_array() + table TABLE object for which partition fields are set-up + part_info Reference to partitioning data structure + RETURN VALUE + TRUE Memory allocation of field array failed + FALSE Ok + DESCRIPTION + If there is no subpartitioning then the same array is used as for the + partitioning. Otherwise a new array is built up using the flag + FIELD_IN_PART_FUNC in the field object. 
+ This function is called from fix_partition_func +*/ + +static bool create_full_part_field_array(TABLE *table, + partition_info *part_info) +{ + bool result= FALSE; + DBUG_ENTER("create_full_part_field_array"); + + if (!is_sub_partitioned(part_info)) + { + part_info->full_part_field_array= part_info->part_field_array; + part_info->no_full_part_fields= part_info->no_part_fields; + } + else + { + Field **ptr, *field, **field_array; + uint no_part_fields=0, size_field_array; + ptr= table->field; + while ((field= *(ptr++))) + { + if (field->flags & FIELD_IN_PART_FUNC_FLAG) + no_part_fields++; + } + size_field_array= (no_part_fields+1)*sizeof(Field*); + field_array= (Field**)sql_alloc(size_field_array); + if (unlikely(!field_array)) + { + my_error(ER_OUTOFMEMORY, MYF(0), size_field_array); + result= TRUE; + goto end; + } + no_part_fields= 0; + ptr= table->field; + while ((field= *(ptr++))) + { + if (field->flags & FIELD_IN_PART_FUNC_FLAG) + field_array[no_part_fields++]= field; + } + field_array[no_part_fields]=0; + part_info->full_part_field_array= field_array; + part_info->no_full_part_fields= no_part_fields; + } +end: + DBUG_RETURN(result); +} + + +/* + These support routines is used to set/reset an indicator of all fields + in a certain key. It is used in conjunction with another support routine + that traverse all fields in the PF to find if all or some fields in the + PF is part of the key. This is used to check primary keys and unique + keys involve all fields in PF (unless supported) and to derive the + key_map's used to quickly decide whether the index can be used to + derive which partitions are needed to scan. + + + + Clear flag GET_FIXED_FIELDS_FLAG in all fields of a key previously set by + set_indicator_in_key_fields (always used in pairs). 
+ SYNOPSIS + clear_indicator_in_key_fields() + key_info Reference to find the key fields +*/ + +static void clear_indicator_in_key_fields(KEY *key_info) +{ + KEY_PART_INFO *key_part; + uint key_parts= key_info->key_parts, i; + for (i= 0, key_part=key_info->key_part; i < key_parts; i++, key_part++) + key_part->field->flags&= (~GET_FIXED_FIELDS_FLAG); +} + + +/* + Set flag GET_FIXED_FIELDS_FLAG in all fields of a key. + SYNOPSIS + set_indicator_in_key_fields + key_info Reference to find the key fields +*/ + +static void set_indicator_in_key_fields(KEY *key_info) +{ + KEY_PART_INFO *key_part; + uint key_parts= key_info->key_parts, i; + for (i= 0, key_part=key_info->key_part; i < key_parts; i++, key_part++) + key_part->field->flags|= GET_FIXED_FIELDS_FLAG; +} + + +/* + Check if all or some fields in partition field array is part of a key + previously used to tag key fields. + SYNOPSIS + check_fields_in_PF() + ptr Partition field array + all_fields Is all fields of partition field array used in key + some_fields Is some fields of partition field array used in key + RETURN VALUE + all_fields, some_fields +*/ + +static void check_fields_in_PF(Field **ptr, bool *all_fields, + bool *some_fields) +{ + DBUG_ENTER("check_fields_in_PF"); + *all_fields= TRUE; + *some_fields= FALSE; + do + { + /* Check if the field of the PF is part of the current key investigated */ + if ((*ptr)->flags & GET_FIXED_FIELDS_FLAG) + *some_fields= TRUE; + else + *all_fields= FALSE; + } while (*(++ptr)); + DBUG_VOID_RETURN; +} + + +/* + Clear flag GET_FIXED_FIELDS_FLAG in all fields of the table. + This routine is used for error handling purposes. 
+ SYNOPSIS + clear_field_flag() + table TABLE object for which partition fields are set-up +*/ + +static void clear_field_flag(TABLE *table) +{ + Field **ptr; + DBUG_ENTER("clear_field_flag"); + + for (ptr= table->field; *ptr; ptr++) + (*ptr)->flags&= (~GET_FIXED_FIELDS_FLAG); + DBUG_VOID_RETURN; +} + + +/* + This routine sets-up the partition field array for KEY partitioning, it + also verifies that all fields in the list of fields is actually a part of + the table. + SYNOPSIS + handle_list_of_fields() + it A list of field names for the partition function + table TABLE object for which partition fields are set-up + part_info Reference to partitioning data structure + sub_part Is the table subpartitioned as well + RETURN VALUE + TRUE Fields in list of fields not part of table + FALSE All fields ok and array created + DESCRIPTION + find_field_in_table_sef finds the field given its name. All fields get + GET_FIXED_FIELDS_FLAG set. +*/ + +static bool handle_list_of_fields(List_iterator<char> it, + TABLE *table, + partition_info *part_info, + bool sub_part) +{ + Field *field; + bool result; + char *field_name; + DBUG_ENTER("handle_list_of_fields"); + + while ((field_name= it++)) + { + field= find_field_in_table_sef(table, field_name); + if (likely(field != 0)) + field->flags|= GET_FIXED_FIELDS_FLAG; + else + { + my_error(ER_FIELD_NOT_FOUND_PART_ERROR, MYF(0)); + clear_field_flag(table); + result= TRUE; + goto end; + } + } + result= set_up_field_array(table, sub_part); +end: + DBUG_RETURN(result); +} + + +/* + This function is used to build an array of partition fields for the + partitioning function and subpartitioning function. The partitioning + function is an item tree that must reference at least one field in the + table. This is checked first in the parser that the function doesn't + contain non-cacheable parts (like a random function) and by checking + here that the function isn't a constant function. 
+ SYNOPSIS + fix_fields_part_func() + thd The thread object + tables A list of one table, the partitioned table + func_expr The item tree reference of the partition function + part_info Reference to partitioning data structure + sub_part Is the table subpartitioned as well + RETURN VALUE + TRUE An error occurred, something was wrong with the + partition function. + FALSE Ok, a partition field array was created + DESCRIPTION + The function uses a new feature in fix_fields where the flag + GET_FIXED_FIELDS_FLAG is set for all fields in the item tree. + This field must always be reset before returning from the function + since it is used for other purposes as well. +*/ + +static bool fix_fields_part_func(THD *thd, TABLE_LIST *tables, + Item* func_expr, partition_info *part_info, + bool sub_part) +{ + /* + Calculate the number of fields in the partition function. + Use it allocate memory for array of Field pointers. + Initialise array of field pointers. Use information set when + calling fix_fields and reset it immediately after. + The get_fields_in_item_tree activates setting of bit in flags + on the field object. 
+ */ + + bool result= TRUE; + TABLE *table= tables->table; + TABLE_LIST *save_list; + int error; + Name_resolution_context *context= &thd->lex->current_select->context; + DBUG_ENTER("fix_fields_part_func"); + + table->map= 1; //To ensure correct calculation of const item + table->get_fields_in_item_tree= TRUE; + save_list= context->table_list; + context->table_list= tables; + thd->where= "partition function"; + error= func_expr->fix_fields(thd, (Item**)0); + context->table_list= save_list; + if (unlikely(error)) + { + DBUG_PRINT("info", ("Field in partition function not part of table")); + clear_field_flag(table); + goto end; + } + if (unlikely(func_expr->const_item())) + { + my_error(ER_CONST_EXPR_IN_PARTITION_FUNC_ERROR, MYF(0)); + clear_field_flag(table); + goto end; + } + result= set_up_field_array(table, sub_part); +end: + table->get_fields_in_item_tree= FALSE; + table->map= 0; //Restore old value + DBUG_RETURN(result); +} + + +/* + This function verifies that if there is a primary key that it contains + all the fields of the partition function. + This is a temporary limitation that will hopefully be removed after a + while. 
+ SYNOPSIS + check_primary_key() + table TABLE object for which partition fields are set-up + RETURN VALUES + TRUE Not all fields in partitioning function was part + of primary key + FALSE Ok, all fields of partitioning function were part + of primary key +*/ + +static bool check_primary_key(TABLE *table) +{ + uint primary_key= table->s->primary_key; + bool all_fields, some_fields, result= FALSE; + DBUG_ENTER("check_primary_key"); + + if (primary_key < MAX_KEY) + { + set_indicator_in_key_fields(table->key_info+primary_key); + check_fields_in_PF(table->s->part_info->full_part_field_array, + &all_fields, &some_fields); + clear_indicator_in_key_fields(table->key_info+primary_key); + if (unlikely(!all_fields)) + { + my_error(ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF,MYF(0),"PRIMARY KEY"); + result= TRUE; + } + } + DBUG_RETURN(result); +} + + +/* + This function verifies that if there is a unique index that it contains + all the fields of the partition function. + This is a temporary limitation that will hopefully be removed after a + while. 
+ SYNOPSIS + check_unique_keys() + table TABLE object for which partition fields are set-up + RETURN VALUES + TRUE Not all fields in partitioning function was part + of all unique keys + FALSE Ok, all fields of partitioning function were part + of unique keys +*/ + +static bool check_unique_keys(TABLE *table) +{ + bool all_fields, some_fields, result= FALSE; + uint keys= table->s->keys, i; + DBUG_ENTER("check_unique_keys"); + for (i= 0; i < keys; i++) + { + if (table->key_info[i].flags & HA_NOSAME) //Unique index + { + set_indicator_in_key_fields(table->key_info+i); + check_fields_in_PF(table->s->part_info->full_part_field_array, + &all_fields, &some_fields); + clear_indicator_in_key_fields(table->key_info+i); + if (unlikely(!all_fields)) + { + my_error(ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF,MYF(0),"UNIQUE INDEX"); + result= TRUE; + break; + } + } + } + DBUG_RETURN(result); +} + + +/* + An important optimisation is whether a range on a field can select a subset + of the partitions. + A prerequisite for this to happen is that the PF is a growing function OR + a shrinking function. + This can never happen for a multi-dimensional PF. Thus this can only happen + with PF with at most one field involved in the PF. + The idea is that if the function is a growing function and you know that + the field of the PF is 4 <= A <= 6 then we can convert this to a range + in the PF instead by setting the range to PF(4) <= PF(A) <= PF(6). In the + case of RANGE PARTITIONING and LIST PARTITIONING this can be used to + calculate a set of partitions rather than scanning all of them. + Thus the following prerequisites are there to check if sets of partitions + can be found. 
+ 1) Only possible for RANGE and LIST partitioning (not for subpartitioning) + 2) Only possible if PF only contains 1 field + 3) Possible if PF is a growing function of the field + 4) Possible if PF is a shrinking function of the field + OBSERVATION: + 1) IF f1(A) is a growing function AND f2(A) is a growing function THEN + f1(A) + f2(A) is a growing function + f1(A) * f2(A) is a growing function if f1(A) >= 0 and f2(A) >= 0 + 2) IF f1(A) is a growing function and f2(A) is a shrinking function THEN + f1(A) / f2(A) is a growing function if f1(A) >= 0 and f2(A) > 0 + 3) IF A is a growing function then a function f(A) that removes the + least significant portion of A is a growing function + E.g. DATE(datetime) is a growing function + MONTH(datetime) is not a growing/shrinking function + 4) IF f1(A) is a growing function and f2(A) is a growing function THEN + f1(f2(A)) and f2(f1(A)) are also growing functions + 5) IF f1(A) is a shrinking function and f2(A) is a growing function THEN + f1(f2(A)) is a shrinking function and f2(f1(A)) is a shrinking function + 6) f1(A) = A is a growing function + 7) f1(A) = A*a + b (where a and b are constants and a > 0) is a growing + function + + By analysing the item tree of the PF we can use these deductions and + derive whether the PF is a growing function or a shrinking function or + neither. + + If the PF is range capable then a flag is set on the table object + indicating this to notify that we can use also ranges on the field + of the PF to deduce a set of partitions if the fields of the PF were + not all fully bound. + SYNOPSIS + check_range_capable_PF() + table TABLE object for which partition fields are set-up + DESCRIPTION + Support for this is not implemented yet.
+*/ + +void check_range_capable_PF(TABLE *table) +{ + DBUG_ENTER("check_range_capable_PF"); + DBUG_VOID_RETURN; +} + + +/* + Set up partition key maps + SYNOPSIS + set_up_partition_key_maps() + table TABLE object for which partition fields are set-up + part_info Reference to partitioning data structure + RETURN VALUES + None + DESCRIPTION + This function sets up a couple of key maps to be able to quickly check + if an index ever can be used to deduce the partition fields or even + a part of the fields of the partition function. + We set up the following key_map's. + PF = Partition Function + 1) All fields of the PF is set even by equal on the first fields in the + key + 2) All fields of the PF is set if all fields of the key is set + 3) At least one field in the PF is set if all fields is set + 4) At least one field in the PF is part of the key +*/ + +static void set_up_partition_key_maps(TABLE *table, + partition_info *part_info) +{ + uint keys= table->s->keys, i; + bool all_fields, some_fields; + DBUG_ENTER("set_up_partition_key_maps"); + + part_info->all_fields_in_PF.clear_all(); + part_info->all_fields_in_PPF.clear_all(); + part_info->all_fields_in_SPF.clear_all(); + part_info->some_fields_in_PF.clear_all(); + for (i= 0; i < keys; i++) + { + set_indicator_in_key_fields(table->key_info+i); + check_fields_in_PF(part_info->full_part_field_array, + &all_fields, &some_fields); + if (all_fields) + part_info->all_fields_in_PF.set_bit(i); + if (some_fields) + part_info->some_fields_in_PF.set_bit(i); + if (is_sub_partitioned(part_info)) + { + check_fields_in_PF(part_info->part_field_array, + &all_fields, &some_fields); + if (all_fields) + part_info->all_fields_in_PPF.set_bit(i); + check_fields_in_PF(part_info->subpart_field_array, + &all_fields, &some_fields); + if (all_fields) + part_info->all_fields_in_SPF.set_bit(i); + } + clear_indicator_in_key_fields(table->key_info+i); + } + DBUG_VOID_RETURN; +} + + +/* + Set-up all function pointers for calculation of partition 
id, + subpartition id and the upper part in subpartitioning. This is to speed up + execution of get_partition_id which is executed once every record to be + written and deleted and twice for updates. + SYNOPSIS + set_up_partition_function_pointers() + part_info Reference to partitioning data structure +*/ + +static void set_up_partition_func_pointers(partition_info *part_info) +{ + if (is_sub_partitioned(part_info)) + { + if (part_info->part_type == RANGE_PARTITION) + { + part_info->get_part_partition_id= get_partition_id_range; + if (part_info->list_of_subpart_fields) + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_range_sub_linear_key; + part_info->get_subpartition_id= get_partition_id_linear_key_sub; + } + else + { + part_info->get_partition_id= get_partition_id_range_sub_key; + part_info->get_subpartition_id= get_partition_id_key_sub; + } + } + else + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_range_sub_linear_hash; + part_info->get_subpartition_id= get_partition_id_linear_hash_sub; + } + else + { + part_info->get_partition_id= get_partition_id_range_sub_hash; + part_info->get_subpartition_id= get_partition_id_hash_sub; + } + } + } + else //LIST Partitioning + { + part_info->get_part_partition_id= get_partition_id_list; + if (part_info->list_of_subpart_fields) + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_list_sub_linear_key; + part_info->get_subpartition_id= get_partition_id_linear_key_sub; + } + else + { + part_info->get_partition_id= get_partition_id_list_sub_key; + part_info->get_subpartition_id= get_partition_id_key_sub; + } + } + else + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_list_sub_linear_hash; + part_info->get_subpartition_id= get_partition_id_linear_hash_sub; + } + else + { + part_info->get_partition_id= get_partition_id_list_sub_hash; + part_info->get_subpartition_id= 
get_partition_id_hash_sub; + } + } + } + } + else //No subpartitioning + { + part_info->get_part_partition_id= NULL; + part_info->get_subpartition_id= NULL; + if (part_info->part_type == RANGE_PARTITION) + part_info->get_partition_id= get_partition_id_range; + else if (part_info->part_type == LIST_PARTITION) + part_info->get_partition_id= get_partition_id_list; + else //HASH partitioning + { + if (part_info->list_of_part_fields) + { + if (part_info->linear_hash_ind) + part_info->get_partition_id= get_partition_id_linear_key_nosub; + else + part_info->get_partition_id= get_partition_id_key_nosub; + } + else + { + if (part_info->linear_hash_ind) + part_info->get_partition_id= get_partition_id_linear_hash_nosub; + else + part_info->get_partition_id= get_partition_id_hash_nosub; + } + } + } +} + + +/* + For linear hashing we need a mask which is on the form 2**n - 1 where + 2**n >= no_parts. Thus if no_parts is 6 then mask is 2**3 - 1 = 8 - 1 = 7. + SYNOPSIS + set_linear_hash_mask() + part_info Reference to partitioning data structure + no_parts Number of parts in linear hash partitioning +*/ + +static void set_linear_hash_mask(partition_info *part_info, uint no_parts) +{ + uint mask; + for (mask= 1; mask < no_parts; mask<<=1) + ; + part_info->linear_hash_mask= mask - 1; +} + + +/* + This function calculates the partition id provided the result of the hash + function using linear hashing parameters, mask and number of partitions. + SYNOPSIS + get_part_id_from_linear_hash() + hash_value Hash value calculated by HASH function or KEY function + mask Mask calculated previously by set_linear_hash_mask + no_parts Number of partitions in HASH partitioned part + RETURN VALUE + part_id The calculated partition identity (starting at 0) + DESCRIPTION + The partition is calculated according to the theory of linear hashing. + See e.g. Linear hashing: a new tool for file and table addressing, + Reprinted from VLDB-80 in Readings Database Systems, 2nd ed, M. 
Stonebraker + (ed.), Morgan Kaufmann 1994. +*/ + +static uint32 get_part_id_from_linear_hash(longlong hash_value, uint mask, + uint no_parts) +{ + uint32 part_id= (uint32)(hash_value & mask); + if (part_id >= no_parts) + { + uint new_mask= ((mask + 1) >> 1) - 1; + part_id= hash_value & new_mask; + } + return part_id; +} + +/* + This function is called as part of opening the table by opening the .frm + file. It is a part of CREATE TABLE to do this so it is quite permissible + that errors due to erroneus syntax isn't found until we come here. + If the user has used a non-existing field in the table is one such example + of an error that is not discovered until here. + SYNOPSIS + fix_partition_func() + thd The thread object + name The name of the partitioned table + table TABLE object for which partition fields are set-up + RETURN VALUE + TRUE + FALSE + DESCRIPTION + The name parameter contains the full table name and is used to get the + database name of the table which is used to set-up a correct + TABLE_LIST object for use in fix_fields. +*/ + +bool fix_partition_func(THD *thd, const char* name, TABLE *table) +{ + bool result= TRUE; + uint dir_length, home_dir_length; + TABLE_LIST tables; + TABLE_SHARE *share= table->s; + char db_name_string[FN_REFLEN]; + char* db_name; + partition_info *part_info= share->part_info; + ulong save_set_query_id= thd->set_query_id; + DBUG_ENTER("fix_partition_func"); + + thd->set_query_id= 0; + /* + Set-up the TABLE_LIST object to be a list with a single table + Set the object to zero to create NULL pointers and set alias + and real name to table name and get database name from file name. 
+ */ + + bzero((void*)&tables, sizeof(TABLE_LIST)); + tables.alias= tables.table_name= (char*)share->table_name; + tables.table= table; + strmov(db_name_string, name); + dir_length= dirname_length(db_name_string); + db_name_string[dir_length - 1]= 0; + home_dir_length= dirname_length(db_name_string); + db_name= &db_name_string[home_dir_length]; + tables.db= db_name; + + part_info->no_full_parts= part_info->no_parts; + if (is_sub_partitioned(part_info)) + { + DBUG_ASSERT(part_info->subpart_type == HASH_PARTITION); + part_info->no_full_parts= part_info->no_parts*part_info->no_subparts; + /* + Subpartition is defined. We need to verify that subpartitioning + function is correct. + */ + if (part_info->linear_hash_ind) + set_linear_hash_mask(part_info, part_info->no_subparts); + if (part_info->list_of_subpart_fields) + { + List_iterator<char> it(part_info->subpart_field_list); + if (unlikely(handle_list_of_fields(it, table, part_info, TRUE))) + goto end; + } + else + { + if (unlikely(fix_fields_part_func(thd, &tables, + part_info->subpart_expr, part_info, TRUE))) + goto end; + if (unlikely(part_info->subpart_expr->result_type() != INT_RESULT)) + { + my_error(ER_PARTITION_FUNC_NOT_ALLOWED_ERROR, MYF(0), + "SUBPARTITION"); + goto end; + } + } + } + DBUG_ASSERT(part_info->part_type != NOT_A_PARTITION); + /* + Partition is defined. We need to verify that partitioning + function is correct. 
+ */ + if (part_info->part_type == HASH_PARTITION) + { + if (part_info->linear_hash_ind) + set_linear_hash_mask(part_info, part_info->no_parts); + if (part_info->list_of_part_fields) + { + List_iterator<char> it(part_info->part_field_list); + if (unlikely(handle_list_of_fields(it, table, part_info, FALSE))) + goto end; + } + else + { + if (unlikely(fix_fields_part_func(thd, &tables, part_info->part_expr, + part_info, FALSE))) + goto end; + if (unlikely(part_info->part_expr->result_type() != INT_RESULT)) + { + my_error(ER_PARTITION_FUNC_NOT_ALLOWED_ERROR, MYF(0), part_str); + goto end; + } + part_info->part_result_type= INT_RESULT; + } + } + else + { + char *error_str; + if (part_info->part_type == RANGE_PARTITION) + { + error_str= range_str; + if (unlikely(check_range_constants(part_info))) + goto end; + } + else if (part_info->part_type == LIST_PARTITION) + { + error_str= list_str; + if (unlikely(check_list_constants(part_info))) + goto end; + } + else + { + DBUG_ASSERT(0); + my_error(ER_INCONSISTENT_PARTITION_INFO_ERROR, MYF(0)); + goto end; + } + if (unlikely(part_info->no_parts < 1)) + { + my_error(ER_PARTITIONS_MUST_BE_DEFINED_ERROR, MYF(0), error_str); + goto end; + } + if (unlikely(fix_fields_part_func(thd, &tables, part_info->part_expr, + part_info, FALSE))) + goto end; + if (unlikely(part_info->part_expr->result_type() != INT_RESULT)) + { + my_error(ER_PARTITION_FUNC_NOT_ALLOWED_ERROR, MYF(0), part_str); + goto end; + } + } + if (unlikely(create_full_part_field_array(table, part_info))) + goto end; + if (unlikely(check_primary_key(table))) + goto end; + if (unlikely((!table->file->partition_flags() & HA_CAN_PARTITION_UNIQUE) && + check_unique_keys(table))) + goto end; + check_range_capable_PF(table); + set_up_partition_key_maps(table, part_info); + set_up_partition_func_pointers(part_info); + result= FALSE; +end: + thd->set_query_id= save_set_query_id; + DBUG_RETURN(result); +} + + +/* + The code below is support routines for the reverse parsing of the + 
partitioning syntax. This feature is very useful to generate syntax for + all default values to avoid all default checking when opening the frm + file. It is also used when altering the partitioning by use of various + ALTER TABLE commands. Finally it is used for SHOW CREATE TABLES. +*/ + +static int add_write(File fptr, const char *buf, uint len) +{ + uint len_written= my_write(fptr, buf, len, MYF(0)); + if (likely(len == len_written)) + return 0; + else + return 1; +} + +static int add_string(File fptr, const char *string) +{ + return add_write(fptr, string, strlen(string)); +} + +static int add_string_len(File fptr, const char *string, uint len) +{ + return add_write(fptr, string, len); +} + +static int add_space(File fptr) +{ + return add_string(fptr, space_str); +} + +static int add_comma(File fptr) +{ + return add_string(fptr, comma_str); +} + +static int add_equal(File fptr) +{ + return add_string(fptr, equal_str); +} + +static int add_end_parenthesis(File fptr) +{ + return add_string(fptr, end_paren_str); +} + +static int add_begin_parenthesis(File fptr) +{ + return add_string(fptr, begin_paren_str); +} + +static int add_part_key_word(File fptr, const char *key_string) +{ + int err= add_string(fptr, key_string); + err+= add_space(fptr); + return err + add_begin_parenthesis(fptr); +} + +static int add_hash(File fptr) +{ + return add_part_key_word(fptr, hash_str); +} + +static int add_partition(File fptr) +{ + strxmov(buff, part_str, space_str, NullS); + return add_string(fptr, buff); +} + +static int add_subpartition(File fptr) +{ + int err= add_string(fptr, sub_str); + return err + add_partition(fptr); +} + +static int add_partition_by(File fptr) +{ + strxmov(buff, part_str, space_str, by_str, space_str, NullS); + return add_string(fptr, buff); +} + +static int add_subpartition_by(File fptr) +{ + int err= add_string(fptr, sub_str); + return err + add_partition_by(fptr); +} + +static int add_key_partition(File fptr, List<char> field_list) +{ + uint i, 
no_fields; + int err; + List_iterator<char> part_it(field_list); + err= add_part_key_word(fptr, key_str); + no_fields= field_list.elements; + i= 0; + do + { + const char *field_str= part_it++; + err+= add_string(fptr, field_str); + if (i != (no_fields-1)) + err+= add_comma(fptr); + } while (++i < no_fields); + return err; +} + +static int add_int(File fptr, longlong number) +{ + llstr(number, buff); + return add_string(fptr, buff); +} + +static int add_keyword_string(File fptr, const char *keyword, + const char *keystr) +{ + int err= add_string(fptr, keyword); + err+= add_space(fptr); + err+= add_equal(fptr); + err+= add_space(fptr); + err+= add_string(fptr, keystr); + return err + add_space(fptr); +} + +static int add_keyword_int(File fptr, const char *keyword, longlong num) +{ + int err= add_string(fptr, keyword); + err+= add_space(fptr); + err+= add_equal(fptr); + err+= add_space(fptr); + err+= add_int(fptr, num); + return err + add_space(fptr); +} + +static int add_engine(File fptr, enum db_type engine_type) +{ + const char *engine_str= ha_get_storage_engine(engine_type); + int err= add_string(fptr, "ENGINE = "); + return err + add_string(fptr, engine_str); + return err; +} + +static int add_partition_options(File fptr, partition_element *p_elem) +{ + int err= 0; + if (p_elem->tablespace_name) + err+= add_keyword_string(fptr,"TABLESPACE",p_elem->tablespace_name); + if (p_elem->nodegroup_id != UNDEF_NODEGROUP) + err+= add_keyword_int(fptr,"NODEGROUP",(longlong)p_elem->nodegroup_id); + if (p_elem->part_max_rows) + err+= add_keyword_int(fptr,"MAX_ROWS",(longlong)p_elem->part_max_rows); + if (p_elem->part_min_rows) + err+= add_keyword_int(fptr,"MIN_ROWS",(longlong)p_elem->part_min_rows); + if (p_elem->data_file_name) + err+= add_keyword_string(fptr,"DATA DIRECTORY",p_elem->data_file_name); + if (p_elem->index_file_name) + err+= add_keyword_string(fptr,"INDEX DIRECTORY",p_elem->index_file_name); + if (p_elem->part_comment) + err+= add_keyword_string(fptr, 
"COMMENT",p_elem->part_comment); + return err + add_engine(fptr,p_elem->engine_type); +} + +static int add_partition_values(File fptr, partition_info *part_info, + partition_element *p_elem) +{ + int err= 0; + if (part_info->part_type == RANGE_PARTITION) + { + err+= add_string(fptr, "VALUES LESS THAN "); + if (p_elem->range_expr) + { + err+= add_begin_parenthesis(fptr); + err+= add_int(fptr,p_elem->range_expr->val_int()); + err+= add_end_parenthesis(fptr); + } + else + err+= add_string(fptr, "MAXVALUE"); + } + else if (part_info->part_type == LIST_PARTITION) + { + uint i; + List_iterator<Item> list_expr_it(p_elem->list_expr_list); + err+= add_string(fptr, "VALUES IN "); + uint no_items= p_elem->list_expr_list.elements; + err+= add_begin_parenthesis(fptr); + i= 0; + do + { + Item *list_expr= list_expr_it++; + err+= add_int(fptr, list_expr->val_int()); + if (i != (no_items-1)) + err+= add_comma(fptr); + } while (++i < no_items); + err+= add_end_parenthesis(fptr); + } + return err + add_space(fptr); +} + +/* + Generate the partition syntax from the partition data structure. + Useful for support of generating defaults, SHOW CREATE TABLES + and easy partition management. + SYNOPSIS + generate_partition_syntax() + part_info The partitioning data structure + buf_length A pointer to the returned buffer length + use_sql_alloc Allocate buffer from sql_alloc if true + otherwise use my_malloc + RETURN VALUES + NULL error + buf, buf_length Buffer and its length + DESCRIPTION + Here we will generate the full syntax for the given command where all + defaults have been expanded. By so doing the it is also possible to + make lots of checks of correctness while at it. + This could will also be reused for SHOW CREATE TABLES and also for all + type ALTER TABLE commands focusing on changing the PARTITION structure + in any fashion. 
+ + The implementation writes the syntax to a temporary file (essentially + an abstraction of a dynamic array) and if all writes goes well it + allocates a buffer and writes the syntax into this one and returns it. + + As a security precaution the file is deleted before writing into it. This + means that no other processes on the machine can open and read the file + while this processing is ongoing. + + The code is optimised for minimal code size since it is not used in any + common queries. +*/ + +char *generate_partition_syntax(partition_info *part_info, + uint *buf_length, + bool use_sql_alloc) +{ + uint i,j, no_parts, no_subparts; + partition_element *part_elem; + ulonglong buffer_length; + char path[FN_REFLEN]; + int err= 0; + DBUG_ENTER("generate_partition_syntax"); + File fptr; + char *buf= NULL; //Return buffer + const char *file_name; + sprintf(path, "%s_%lx_%lx", "part_syntax", current_pid, + current_thd->thread_id); + fn_format(path,path,mysql_tmpdir,".psy", MY_REPLACE_EXT); + file_name= &path[0]; + DBUG_PRINT("info", ("File name = %s", file_name)); + if (unlikely(((fptr= my_open(file_name,O_CREAT|O_RDWR, MYF(MY_WME))) == -1))) + DBUG_RETURN(NULL); +#if defined(MSDOS) || defined(__WIN__) || defined(__EMX__) || defined(OS2) +#else + my_delete(file_name, MYF(0)); +#endif + err+= add_space(fptr); + err+= add_partition_by(fptr); + switch (part_info->part_type) + { + case RANGE_PARTITION: + err+= add_part_key_word(fptr, range_str); + break; + case LIST_PARTITION: + err+= add_part_key_word(fptr, list_str); + break; + case HASH_PARTITION: + if (part_info->linear_hash_ind) + err+= add_string(fptr, "LINEAR "); + if (part_info->list_of_part_fields) + err+= add_key_partition(fptr, part_info->part_field_list); + else + err+= add_hash(fptr); + break; + default: + DBUG_ASSERT(0); + /* We really shouldn't get here, no use in continuing from here */ + current_thd->fatal_error(); + DBUG_RETURN(NULL); + } + if (part_info->part_expr) + err+= add_string_len(fptr, 
part_info->part_func_string, + part_info->part_func_len); + err+= add_end_parenthesis(fptr); + err+= add_space(fptr); + if (is_sub_partitioned(part_info)) + { + err+= add_subpartition_by(fptr); + /* Must be hash partitioning for subpartitioning */ + if (part_info->list_of_subpart_fields) + err+= add_key_partition(fptr, part_info->subpart_field_list); + else + err+= add_hash(fptr); + if (part_info->subpart_expr) + err+= add_string_len(fptr, part_info->subpart_func_string, + part_info->subpart_func_len); + err+= add_end_parenthesis(fptr); + err+= add_space(fptr); + } + err+= add_begin_parenthesis(fptr); + List_iterator<partition_element> part_it(part_info->partitions); + no_parts= part_info->no_parts; + no_subparts= part_info->no_subparts; + i= 0; + do + { + part_elem= part_it++; + err+= add_partition(fptr); + err+= add_string(fptr, part_elem->partition_name); + err+= add_space(fptr); + err+= add_partition_values(fptr, part_info, part_elem); + if (!is_sub_partitioned(part_info)) + err+= add_partition_options(fptr, part_elem); + if (is_sub_partitioned(part_info)) + { + err+= add_space(fptr); + err+= add_begin_parenthesis(fptr); + List_iterator<partition_element> sub_it(part_elem->subpartitions); + j= 0; + do + { + part_elem= sub_it++; + err+= add_subpartition(fptr); + err+= add_string(fptr, part_elem->partition_name); + err+= add_space(fptr); + err+= add_partition_options(fptr, part_elem); + if (j != (no_subparts-1)) + { + err+= add_comma(fptr); + err+= add_space(fptr); + } + else + err+= add_end_parenthesis(fptr); + } while (++j < no_subparts); + } + if (i != (no_parts-1)) + { + err+= add_comma(fptr); + err+= add_space(fptr); + } + else + err+= add_end_parenthesis(fptr); + } while (++i < no_parts); + if (err) + goto close_file; + buffer_length= my_seek(fptr, 0L,MY_SEEK_END,MYF(0)); + if (unlikely(buffer_length == MY_FILEPOS_ERROR)) + goto close_file; + if (unlikely(my_seek(fptr, 0L, MY_SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR)) + goto close_file; + *buf_length= 
(uint)buffer_length; + if (use_sql_alloc) + buf= sql_alloc(*buf_length+1); + else + buf= my_malloc(*buf_length+1, MYF(MY_WME)); + if (!buf) + goto close_file; + + if (unlikely(my_read(fptr, buf, *buf_length, MYF(MY_FNABP)))) + { + if (!use_sql_alloc) + my_free(buf, MYF(0)); + else + buf= NULL; + } + else + buf[*buf_length]= 0; + +close_file: + /* + Delete the file before closing to ensure the file doesn't get synched + to disk unnecessary. We only used the file system as a dynamic array + implementation so we are not really interested in getting the file + present on disk. + This is not possible on Windows so here it has to be done after closing + the file. Also on Unix we delete immediately after opening to ensure no + other process can read the information written into the file. + */ + my_close(fptr, MYF(0)); +#if defined(MSDOS) || defined(__WIN__) || defined(__EMX__) || defined(OS2) + my_delete(file_name, MYF(0)); +#endif + DBUG_RETURN(buf); +} + + +/* + Check if partition key fields are modified and if it can be handled by the + underlying storage engine. + SYNOPSIS + partition_key_modified + table TABLE object for which partition fields are set-up + fields A list of the to be modifed + RETURN VALUES + TRUE Need special handling of UPDATE + FALSE Normal UPDATE handling is ok +*/ + +bool partition_key_modified(TABLE *table, List<Item> &fields) +{ + List_iterator_fast<Item> f(fields); + partition_info *part_info= table->s->part_info; + Item_field *item_field; + DBUG_ENTER("partition_key_modified"); + if (!part_info) + DBUG_RETURN(FALSE); + if (table->file->partition_flags() & HA_CAN_UPDATE_PARTITION_KEY) + DBUG_RETURN(FALSE); + f.rewind(); + while ((item_field=(Item_field*) f++)) + if (item_field->field->flags & FIELD_IN_PART_FUNC_FLAG) + DBUG_RETURN(TRUE); + DBUG_RETURN(FALSE); +} + + +/* + The next set of functions are used to calculate the partition identity. 
+ A handler sets up a variable that corresponds to one of these functions + to be able to quickly call it whenever the partition id needs to calculated + based on the record in table->record[0] (or set up to fake that). + There are 4 functions for hash partitioning and 2 for RANGE/LIST partitions. + In addition there are 4 variants for RANGE subpartitioning and 4 variants + for LIST subpartitioning thus in total there are 14 variants of this + function. + + We have a set of support functions for these 14 variants. There are 4 + variants of hash functions and there is a function for each. The KEY + partitioning uses the function calculate_key_value to calculate the hash + value based on an array of fields. The linear hash variants uses the + method get_part_id_from_linear_hash to get the partition id using the + hash value and some parameters calculated from the number of partitions. +*/ + +/* + Calculate hash value for KEY partitioning using an array of fields. + SYNOPSIS + calculate_key_value() + field_array An array of the fields in KEY partitioning + RETURN VALUE + hash_value calculated + DESCRIPTION + Uses the hash function on the character set of the field. Integer and + floating point fields use the binary character set by default. +*/ + +static uint32 calculate_key_value(Field **field_array) +{ + uint32 hashnr= 0; + ulong nr2= 4; + do + { + Field *field= *field_array; + if (field->is_null()) + { + hashnr^= (hashnr << 1) | 1; + } + else + { + uint len= field->pack_length(); + ulong nr1= 1; + CHARSET_INFO *cs= field->charset(); + cs->coll->hash_sort(cs, (uchar*)field->ptr, len, &nr1, &nr2); + hashnr^= (uint32)nr1; + } + } while (*(++field_array)); + return hashnr; +} + + +/* + A simple support function to calculate part_id given local part and + sub part. 
+ SYNOPSIS + get_part_id_for_sub() + loc_part_id Local partition id + sub_part_id Subpartition id + no_subparts Number of subparts +*/ + +inline +static uint32 get_part_id_for_sub(uint32 loc_part_id, uint32 sub_part_id, + uint no_subparts) +{ + return (uint32)((loc_part_id * no_subparts) + sub_part_id); +} + + +/* + Calculate part_id for (SUB)PARTITION BY HASH + SYNOPSIS + get_part_id_hash() + no_parts Number of hash partitions + part_expr Item tree of hash function + RETURN VALUE + Calculated partition id +*/ + +inline +static uint32 get_part_id_hash(uint no_parts, + Item *part_expr) +{ + DBUG_ENTER("get_part_id_hash"); + DBUG_RETURN((uint32)(part_expr->val_int() % no_parts)); +} + + +/* + Calculate part_id for (SUB)PARTITION BY LINEAR HASH + SYNOPSIS + get_part_id_linear_hash() + part_info A reference to the partition_info struct where all the + desired information is given + no_parts Number of hash partitions + part_expr Item tree of hash function + RETURN VALUE + Calculated partition id +*/ + +inline +static uint32 get_part_id_linear_hash(partition_info *part_info, + uint no_parts, + Item *part_expr) +{ + DBUG_ENTER("get_part_id_linear_hash"); + DBUG_RETURN(get_part_id_from_linear_hash(part_expr->val_int(), + part_info->linear_hash_mask, + no_parts)); +} + + +/* + Calculate part_id for (SUB)PARTITION BY KEY + SYNOPSIS + get_part_id_key() + field_array Array of fields for PARTTION KEY + no_parts Number of KEY partitions + RETURN VALUE + Calculated partition id +*/ + +inline +static uint32 get_part_id_key(Field **field_array, + uint no_parts) +{ + DBUG_ENTER("get_part_id_key"); + DBUG_RETURN(calculate_key_value(field_array) & no_parts); +} + + +/* + Calculate part_id for (SUB)PARTITION BY LINEAR KEY + SYNOPSIS + get_part_id_linear_key() + part_info A reference to the partition_info struct where all the + desired information is given + field_array Array of fields for PARTTION KEY + no_parts Number of KEY partitions + RETURN VALUE + Calculated partition id +*/ + 
+inline +static uint32 get_part_id_linear_key(partition_info *part_info, + Field **field_array, + uint no_parts) +{ + DBUG_ENTER("get_partition_id_linear_key"); + DBUG_RETURN(get_part_id_from_linear_hash(calculate_key_value(field_array), + part_info->linear_hash_mask, + no_parts)); +} + +/* + This function is used to calculate the partition id where all partition + fields have been prepared to point to a record where the partition field + values are bound. + SYNOPSIS + get_partition_id() + part_info A reference to the partition_info struct where all the + desired information is given + part_id The partition id is returned through this pointer + RETURN VALUE + part_id + return TRUE means that the fields of the partition function didn't fit + into any partition and thus the values of the PF-fields are not allowed. + DESCRIPTION + A routine used from write_row, update_row and delete_row from any + handler supporting partitioning. It is also a support routine for + get_partition_set used to find the set of partitions needed to scan + for a certain index scan or full table scan. + + It is actually 14 different variants of this function which are called + through a function pointer. + + get_partition_id_list + get_partition_id_range + get_partition_id_hash_nosub + get_partition_id_key_nosub + get_partition_id_linear_hash_nosub + get_partition_id_linear_key_nosub + get_partition_id_range_sub_hash + get_partition_id_range_sub_key + get_partition_id_range_sub_linear_hash + get_partition_id_range_sub_linear_key + get_partition_id_list_sub_hash + get_partition_id_list_sub_key + get_partition_id_list_sub_linear_hash + get_partition_id_list_sub_linear_key +*/ + +/* + This function is used to calculate the main partition to use in the case of + subpartitioning and we don't know enough to get the partition identity in + total. 
+ SYNOPSIS + get_part_partition_id() + part_info A reference to the partition_info struct where all the + desired information is given + part_id The partition id is returned through this pointer + RETURN VALUE + part_id + return TRUE means that the fields of the partition function didn't fit + into any partition and thus the values of the PF-fields are not allowed. + DESCRIPTION + + It is actually 6 different variants of this function which are called + through a function pointer. + + get_partition_id_list + get_partition_id_range + get_partition_id_hash_nosub + get_partition_id_key_nosub + get_partition_id_linear_hash_nosub + get_partition_id_linear_key_nosub +*/ + + +bool get_partition_id_list(partition_info *part_info, + uint32 *part_id) +{ + DBUG_ENTER("get_partition_id_list"); + LIST_PART_ENTRY *list_array= part_info->list_array; + uint list_index; + longlong list_value; + uint min_list_index= 0, max_list_index= part_info->no_list_values - 1; + longlong part_func_value= part_info->part_expr->val_int(); + while (max_list_index >= min_list_index) + { + list_index= (max_list_index + min_list_index) >> 1; + list_value= list_array[list_index].list_value; + if (list_value < part_func_value) + min_list_index= list_index + 1; + else if (list_value > part_func_value) + max_list_index= list_index - 1; + else { + *part_id= (uint32)list_array[list_index].partition_id; + DBUG_RETURN(FALSE); + } + } + *part_id= 0; + DBUG_RETURN(TRUE); +} + + +bool get_partition_id_range(partition_info *part_info, + uint32 *part_id) +{ + DBUG_ENTER("get_partition_id_int_range"); + longlong *range_array= part_info->range_int_array; + uint max_partition= part_info->no_parts - 1; + uint min_part_id= 0, max_part_id= max_partition, loc_part_id; + longlong part_func_value= part_info->part_expr->val_int(); + while (max_part_id > min_part_id) + { + loc_part_id= (max_part_id + min_part_id + 1) >> 1; + if (range_array[loc_part_id] < part_func_value) + min_part_id= loc_part_id + 1; + else + 
max_part_id= loc_part_id - 1; + } + loc_part_id= max_part_id; + if (part_func_value >= range_array[loc_part_id]) + if (loc_part_id != max_partition) + loc_part_id++; + *part_id= (uint32)loc_part_id; + if (loc_part_id == max_partition) + if (range_array[loc_part_id] != LONGLONG_MAX) + if (part_func_value >= range_array[loc_part_id]) + DBUG_RETURN(TRUE); + DBUG_RETURN(FALSE); +} + +bool get_partition_id_hash_nosub(partition_info *part_info, + uint32 *part_id) +{ + *part_id= get_part_id_hash(part_info->no_parts, part_info->part_expr); + return FALSE; +} + + +bool get_partition_id_linear_hash_nosub(partition_info *part_info, + uint32 *part_id) +{ + *part_id= get_part_id_linear_hash(part_info, part_info->no_parts, + part_info->part_expr); + return FALSE; +} + + +bool get_partition_id_key_nosub(partition_info *part_info, + uint32 *part_id) +{ + *part_id= get_part_id_key(part_info->part_field_array, part_info->no_parts); + return FALSE; +} + + +bool get_partition_id_linear_key_nosub(partition_info *part_info, + uint32 *part_id) +{ + *part_id= get_part_id_linear_key(part_info, + part_info->part_field_array, + part_info->no_parts); + return FALSE; +} + + +bool get_partition_id_range_sub_hash(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_range_sub_hash"); + if (unlikely(get_partition_id_range(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_hash(no_subparts, part_info->subpart_expr); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_range_sub_linear_hash(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_range_sub_linear_hash"); + if (unlikely(get_partition_id_range(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + 
sub_part_id= get_part_id_linear_hash(part_info, no_subparts, + part_info->subpart_expr); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_range_sub_key(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_range_sub_key"); + if (unlikely(get_partition_id_range(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_key(part_info->subpart_field_array, no_subparts); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_range_sub_linear_key(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_range_sub_linear_key"); + if (unlikely(get_partition_id_range(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_linear_key(part_info, + part_info->subpart_field_array, + no_subparts); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_list_sub_hash(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_list_sub_hash"); + if (unlikely(get_partition_id_list(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_hash(no_subparts, part_info->subpart_expr); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_list_sub_linear_hash(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_list_sub_linear_hash"); + if (unlikely(get_partition_id_list(part_info, &loc_part_id))) + { + 
DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_hash(no_subparts, part_info->subpart_expr); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_list_sub_key(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_range_sub_key"); + if (unlikely(get_partition_id_list(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_key(part_info->subpart_field_array, no_subparts); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_list_sub_linear_key(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_list_sub_linear_key"); + if (unlikely(get_partition_id_list(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_linear_key(part_info, + part_info->subpart_field_array, + no_subparts); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +/* + This function is used to calculate the subpartition id + SYNOPSIS + get_subpartition_id() + part_info A reference to the partition_info struct where all the + desired information is given + RETURN VALUE + part_id + The subpartition identity + DESCRIPTION + A routine used in some SELECT's when only partial knowledge of the + partitions is known. + + It is actually 4 different variants of this function which are called + through a function pointer. 
+ + get_partition_id_hash_sub + get_partition_id_key_sub + get_partition_id_linear_hash_sub + get_partition_id_linear_key_sub +*/ + +uint32 get_partition_id_hash_sub(partition_info *part_info) +{ + return get_part_id_hash(part_info->no_subparts, part_info->subpart_expr); +} + + +uint32 get_partition_id_linear_hash_sub(partition_info *part_info) +{ + return get_part_id_linear_hash(part_info, part_info->no_subparts, + part_info->subpart_expr); +} + + +uint32 get_partition_id_key_sub(partition_info *part_info) +{ + return get_part_id_key(part_info->subpart_field_array, + part_info->no_subparts); +} + + +uint32 get_partition_id_linear_key_sub(partition_info *part_info) +{ + return get_part_id_linear_key(part_info, + part_info->subpart_field_array, + part_info->no_subparts); +} + + +/* + Set an indicator on all partition fields that are set by the key + SYNOPSIS + set_PF_fields_in_key() + key_info Information about the index + key_length Length of key + RETURN VALUE + TRUE Found partition field set by key + FALSE No partition field set by key + DESCRIPTION + Walks the key parts for as long as key_length covers them and sets + GET_FIXED_FIELDS_FLAG on every field that has FIELD_IN_PART_FUNC_FLAG + set. The flags stay set when this function returns. +*/ + +static bool set_PF_fields_in_key(KEY *key_info, uint key_length) +{ + KEY_PART_INFO *key_part; + bool found_part_field= FALSE; + DBUG_ENTER("set_PF_fields_in_key"); + + /* key_length is the number of key bytes not yet accounted for */ + for (key_part= key_info->key_part; (int)key_length > 0; key_part++) + { + /* A nullable key part uses one extra byte (presumably the NULL marker) */ + if (key_part->null_bit) + key_length--; + if (key_part->type == HA_KEYTYPE_BIT) + { + if (((Field_bit*)key_part->field)->bit_len) + key_length--; + } + if (key_part->key_part_flag & (HA_BLOB_PART + HA_VAR_LENGTH_PART)) + { + key_length-= HA_KEY_BLOB_LENGTH; + } + /* Stop at the first key part that is not completely covered */ + if (key_length < key_part->length) + break; + key_length-= key_part->length; + if (key_part->field->flags & FIELD_IN_PART_FUNC_FLAG) + { + found_part_field= TRUE; + key_part->field->flags|= GET_FIXED_FIELDS_FLAG; + } + } + DBUG_RETURN(found_part_field); +} + + +/* + We have found that at least one partition field was set by a key, now + check if a partition function has all its fields bound or not.
+ SYNOPSIS + check_part_func_bound() + ptr Array of fields NULL terminated (partition fields) + RETURN VALUE + TRUE All fields in partition function are set + FALSE Not all fields in partition function are set + DESCRIPTION + A field counts as set when its GET_FIXED_FIELDS_FLAG is set. An empty + array yields TRUE. +*/ + +static bool check_part_func_bound(Field **ptr) +{ + bool result= TRUE; + DBUG_ENTER("check_part_func_bound"); + + for (; *ptr; ptr++) + { + if (!((*ptr)->flags & GET_FIXED_FIELDS_FLAG)) + { + result= FALSE; + break; + } + } + DBUG_RETURN(result); +} + + +/* + Get the id of the subpartitioning part by using the key buffer of the + index scan. + SYNOPSIS + get_sub_part_id_from_key() + table The table object + buf A buffer that can be used to evaluate the partition function + key_info The index object + key_spec A key_range containing key and key length + RETURN VALUES + part_id Subpartition id to use + DESCRIPTION + Use key buffer to set-up record in buf, move field pointers and + get the partition identity and restore field pointers afterwards. +*/ + +static uint32 get_sub_part_id_from_key(const TABLE *table,byte *buf, + KEY *key_info, + const key_range *key_spec) +{ + byte *rec0= table->record[0]; + partition_info *part_info= table->s->part_info; + uint32 part_id; + DBUG_ENTER("get_sub_part_id_from_key"); + + key_restore(buf, (byte*)key_spec->key, key_info, key_spec->length); + /* buf is table->record[0]: no field pointers need to be moved */ + if (likely(rec0 == buf)) + part_id= part_info->get_subpartition_id(part_info); + else + { + /* Point the subpartition fields at buf, evaluate, then point them back */ + Field **part_field_array= part_info->subpart_field_array; + set_field_ptr(part_field_array, buf, rec0); + part_id= part_info->get_subpartition_id(part_info); + set_field_ptr(part_field_array, rec0, buf); + } + DBUG_RETURN(part_id); +} + +/* + Get the id of the partitioning part by using the key buffer of the + index scan.
+ SYNOPSIS + get_part_id_from_key() + table The table object + buf A buffer that can be used to evaluate the partition function + key_info The index object + key_spec A key_range containing key and key length + part_id Partition to use + RETURN VALUES + TRUE Partition to use not found + FALSE Ok, part_id indicates partition to use + DESCRIPTION + Use key buffer to set-up record in buf, move field pointers and + get the partition identity and restore field pointers afterwards. +*/ +bool get_part_id_from_key(const TABLE *table, byte *buf, KEY *key_info, + const key_range *key_spec, uint32 *part_id) +{ + bool result; + byte *rec0= table->record[0]; + partition_info *part_info= table->s->part_info; + DBUG_ENTER("get_part_id_from_key"); + + key_restore(buf, (byte*)key_spec->key, key_info, key_spec->length); + /* buf is table->record[0]: no field pointers need to be moved */ + if (likely(rec0 == buf)) + result= part_info->get_part_partition_id(part_info, part_id); + else + { + /* Point the partition fields at buf, evaluate, then point them back */ + Field **part_field_array= part_info->part_field_array; + set_field_ptr(part_field_array, buf, rec0); + result= part_info->get_part_partition_id(part_info, part_id); + set_field_ptr(part_field_array, rec0, buf); + } + DBUG_RETURN(result); +} + +/* + Get the partitioning id of the full PF by using the key buffer of the + index scan. + SYNOPSIS + get_full_part_id_from_key() + table The table object + buf A buffer that is used to evaluate the partition function + key_info The index object + key_spec A key_range containing key and key length + part_spec A partition id containing start part and end part + RETURN VALUES + part_spec + No partitions to scan is indicated by start_part > end_part when returning + DESCRIPTION + Use key buffer to set-up record in buf, move field pointers if needed and + get the partition identity and restore field pointers afterwards.
+*/ + +void get_full_part_id_from_key(const TABLE *table, byte *buf, + KEY *key_info, + const key_range *key_spec, + part_id_range *part_spec) +{ + bool result; + partition_info *part_info= table->s->part_info; + byte *rec0= table->record[0]; + DBUG_ENTER("get_full_part_id_from_key"); + + key_restore(buf, (byte*)key_spec->key, key_info, key_spec->length); + /* buf is table->record[0]: no field pointers need to be moved */ + if (likely(rec0 == buf)) + result= part_info->get_partition_id(part_info, &part_spec->start_part); + else + { + /* Point all PF fields at buf, evaluate, then point them back */ + Field **part_field_array= part_info->full_part_field_array; + set_field_ptr(part_field_array, buf, rec0); + result= part_info->get_partition_id(part_info, &part_spec->start_part); + set_field_ptr(part_field_array, rec0, buf); + } + part_spec->end_part= part_spec->start_part; + /* On failure make start_part > end_part, i.e. an empty range */ + if (unlikely(result)) + part_spec->start_part++; + DBUG_VOID_RETURN; +} + +/* + Get the set of partitions to use in query. + SYNOPSIS + get_partition_set() + table The table object + buf A buffer that can be used to evaluate the partition function + index The index of the key used, if MAX_KEY no index used + key_spec A key_range containing key and key length + part_spec Contains start part, end part and indicator if bitmap is + used for which partitions to scan + DESCRIPTION + This function is called to discover which partitions to use in an index + scan or a full table scan. + It returns a range of partitions to scan. If there are holes in this + range with partitions that are not needed to scan a bit array is used + to signal which partitions to use and which not to use. + If start_part > end_part at return it means no partition needs to be + scanned. If start_part == end_part it always means a single partition + needs to be scanned.
+ RETURN VALUE + part_spec +*/ +void get_partition_set(const TABLE *table, byte *buf, const uint index, + const key_range *key_spec, part_id_range *part_spec) +{ + partition_info *part_info= table->s->part_info; + uint no_parts= part_info->no_full_parts, i, part_id; + /* no_parts is used as the "not known" value for sub_part and part_part */ + uint sub_part= no_parts, part_part= no_parts; + KEY *key_info= NULL; + bool found_part_field= FALSE; + DBUG_ENTER("get_partition_set"); + + /* Default: scan all partitions */ + part_spec->use_bit_array= FALSE; + part_spec->start_part= 0; + part_spec->end_part= no_parts - 1; + if ((index < MAX_KEY) && + key_spec->flag == (uint)HA_READ_KEY_EXACT && + part_info->some_fields_in_PF.is_set(index)) + { + key_info= table->key_info+index; + /* + The index can potentially provide at least one PF-field (field in the + partition function). Thus it is interesting to continue our probe. + */ + if (key_spec->length == key_info->key_length) + { + /* + The entire key is set so we can check whether we can immediately + derive either the complete PF or if we can derive either + the top PF or the subpartitioning PF. This can be established by + checking precalculated bits on each index. + */ + if (part_info->all_fields_in_PF.is_set(index)) + { + /* + We can derive the exact partition to use, no more than this one + is needed. + */ + get_full_part_id_from_key(table,buf,key_info,key_spec,part_spec); + DBUG_VOID_RETURN; + } + else if (is_sub_partitioned(part_info)) + { + if (part_info->all_fields_in_SPF.is_set(index)) + sub_part= get_sub_part_id_from_key(table, buf, key_info, key_spec); + else if (part_info->all_fields_in_PPF.is_set(index)) + { + if (get_part_id_from_key(table,buf,key_info,key_spec,&part_part)) + { + /* + The value of the RANGE or LIST partitioning was outside of + allowed values. Thus it is certain that the result of this + scan will be empty. + */ + part_spec->start_part= no_parts; + DBUG_VOID_RETURN; + } + } + } + } + else + { + /* + Set an indicator on all partition fields that are bound.
+ If at least one PF-field was bound it pays off to check whether + the PF or PPF or SPF has been bound. + (PF = Partition Function, SPF = Subpartition Function and + PPF = Partition Function part of subpartitioning) + */ + if ((found_part_field= set_PF_fields_in_key(key_info, + key_spec->length))) + { + if (check_part_func_bound(part_info->full_part_field_array)) + { + /* + We were able to bind all fields in the partition function even + by using only a part of the key. Calculate the partition to use. + */ + get_full_part_id_from_key(table,buf,key_info,key_spec,part_spec); + clear_indicator_in_key_fields(key_info); + DBUG_VOID_RETURN; + } + else if (is_sub_partitioned(part_info)) + { + /* + Same order as in the full key case above: all subpartition + fields bound gives the subpartition id, all partition fields + bound gives the partition id. + */ + if (check_part_func_bound(part_info->subpart_field_array)) + sub_part= get_sub_part_id_from_key(table, buf, key_info, key_spec); + else if (check_part_func_bound(part_info->part_field_array)) + { + if (get_part_id_from_key(table,buf,key_info,key_spec,&part_part)) + { + /* + Value outside of the RANGE or LIST partitions, the result + of this scan will be empty. + */ + part_spec->start_part= no_parts; + clear_indicator_in_key_fields(key_info); + DBUG_VOID_RETURN; + } + } + } + } + } + } + { + /* + The next step is to analyse the table condition to see whether any + information about which partitions to scan can be derived from there. + Currently not implemented. + */ + } + /* + If we come here we have found a range of sorts we have either discovered + nothing or we have discovered a range of partitions with possible holes + in it. We need a bitvector to further the work here. + */ + if (!(part_part == no_parts && sub_part == no_parts)) + { + /* + We can only arrive here if we are using subpartitioning. + */ + if (part_part != no_parts) + { + /* + We know the top partition and need to scan all underlying + subpartitions. This is a range without holes.
+ Each partition owns no_subparts consecutive ids, so the first id + is part_part * no_subparts. + */ + DBUG_ASSERT(sub_part == no_parts); + part_spec->start_part= part_part * part_info->no_subparts; + part_spec->end_part= part_spec->start_part+part_info->no_subparts - 1; + } + else + { + /* + We know the subpartition but not the partition: one id in every + partition, no_subparts apart. + */ + DBUG_ASSERT(sub_part != no_parts); + part_spec->use_bit_array= TRUE; + part_spec->start_part= sub_part; + part_spec->end_part=sub_part+ + (part_info->no_subparts*(part_info->no_parts-1)); + /* The bit array is not filled in yet, this loop has an empty body */ + for (i= 0, part_id= sub_part; i < part_info->no_parts; + i++, part_id+= part_info->no_subparts) + ; //Set bit part_id in bit array + } + } + if (found_part_field) + clear_indicator_in_key_fields(key_info); + DBUG_VOID_RETURN; +} + + +/* + If the table is partitioned we will read the partition info from the + .frm file here. + ------------------------------- + | Fileinfo 64 bytes | + ------------------------------- + | Formnames 7 bytes | + ------------------------------- + | Not used 4021 bytes | + ------------------------------- + | Keyinfo + record | + ------------------------------- + | Padded to next multiple | + | of IO_SIZE | + ------------------------------- + | Forminfo 288 bytes | + ------------------------------- + | Screen buffer, to make | + | field names readable | + ------------------------------- + | Packed field info | + | 17 + 1 + strlen(field_name) | + | + 1 end of file character | + ------------------------------- + | Partition info | + ------------------------------- + We provide the length of the partition info in Fileinfo[55-58]. + + Read the partition syntax from the frm file and parse it to get the + data structures of the partitioning. + SYNOPSIS + mysql_unpack_partition() + file File reference of frm file + thd Thread object + part_info_len Length of partition syntax + table Table object of partitioned table + RETURN VALUE + TRUE Error + FALSE Success + DESCRIPTION + Read the partition syntax from the current position in the frm file. + Initiate a LEX object, save the list of item tree objects to free after + the query is done.
Set-up partition info object such that parser knows + it is called from internally. Call parser to create data structures + (best possible recreation of item trees and so forth since there is no + serialisation of these objects other than in parseable text format). + We need to save the text of the partition functions since it is not + possible to retrace this given an item tree. +*/ + +bool mysql_unpack_partition(File file, THD *thd, uint part_info_len, + TABLE* table) +{ + Item *thd_free_list= thd->free_list; + bool result= TRUE; + uchar* part_buf= NULL; + partition_info *part_info; + LEX *old_lex= thd->lex, lex; + DBUG_ENTER("mysql_unpack_partition"); + /* Nothing in thd has been changed yet, so a plain return is enough here */ + if (read_string(file, (gptr*)&part_buf, part_info_len)) + DBUG_RETURN(result); + thd->lex= &lex; + lex_start(thd, part_buf, part_info_len); + /* + We need to use the current SELECT_LEX since we need to keep the + Name_resolution_context object which is referenced from the + Item_field objects. + This is not a nice solution since if the parser uses current_select + for anything else it will corrupt the current LEX object. + */ + thd->lex->current_select= old_lex->current_select; + /* + All Items created are put into a free list on the THD object. This list + is used to free all Item objects after completing a query. We don't + want that to happen with the Item tree created as part of the partition + info. This should be attached to the table object and remain so until + the table object is released. + Thus we move away the current list temporarily and start a new list that + we then save in the partition info structure.
+ */ + thd->free_list= NULL; + lex.part_info= (partition_info*)1; //Indicate yyparse from this place + if (yyparse((void*)thd) || thd->is_fatal_error) + { + free_items(thd->free_list); + goto end; + } + part_info= lex.part_info; + table->s->part_info= part_info; + part_info->item_free_list= thd->free_list; + + { + /* + This code part allocates memory for the serialised item information for + the partition functions. In most cases this is not needed but if the + table is used for SHOW CREATE TABLE or ALTER TABLE that modifies + partition information it is needed and the info is lost if we don't + save it here so unfortunately we have to do it here even if in most + cases it is not needed. This is a consequence of that item trees are + not serialisable. + */ + uint part_func_len= part_info->part_func_len; + uint subpart_func_len= part_info->subpart_func_len; + char *part_func_string, *subpart_func_string= NULL; + if (!((part_func_string= sql_alloc(part_func_len))) || + (subpart_func_len && + !((subpart_func_string= sql_alloc(subpart_func_len))))) + { + /* + NOTE(review): table->s->part_info still refers to part_info after + its items have been freed here - confirm that callers do not use + it when TRUE is returned. + */ + my_error(ER_OUTOFMEMORY, MYF(0), part_func_len); + free_items(thd->free_list); + part_info->item_free_list= 0; + goto end; + } + memcpy(part_func_string, part_info->part_func_string, part_func_len); + if (subpart_func_len) + memcpy(subpart_func_string, part_info->subpart_func_string, + subpart_func_len); + part_info->part_func_string= part_func_string; + part_info->subpart_func_string= subpart_func_string; + } + + result= FALSE; +end: + /* Put back the free list and LEX saved on entry and release the buffer */ + thd->free_list= thd_free_list; + x_free((gptr)part_buf); + thd->lex= old_lex; + DBUG_RETURN(result); +} +#endif + +/* + Prepare for calling val_int on partition function by setting fields to + point to the record where the values of the PF-fields are stored.
+ SYNOPSIS + set_field_ptr() + ptr NULL terminated array of fields to change ptr + new_buf New record pointer + old_buf Old record pointer + DESCRIPTION + Set ptr in field objects of field array to refer to new_buf record + instead of previously old_buf. Used before calling val_int and after + it is used to restore pointers to table->record[0]. + Every field is moved by the distance (new_buf - old_buf), so calling + the function again with the two buffers swapped undoes the move. + This routine is placed outside of partition code since it can be useful + also for other programs. +*/ + +void set_field_ptr(Field **ptr, const byte *new_buf, + const byte *old_buf) +{ + my_ptrdiff_t diff= (new_buf - old_buf); + DBUG_ENTER("set_field_ptr"); + + /* Check for the NULL terminator first so an empty array is also safe */ + for (; *ptr; ptr++) + (*ptr)->move_field(diff); + DBUG_VOID_RETURN; +} + + +/* + Prepare for calling val_int on partition function by setting fields to + point to the record where the values of the PF-fields are stored. + This variant works on a key_part reference. + It is not required that all fields are NOT NULL fields. + SYNOPSIS + set_key_field_ptr() + key_info Key whose key part fields get their ptr changed + new_buf New record pointer + old_buf Old record pointer + DESCRIPTION + Set ptr in field objects of field array to refer to new_buf record + instead of previously old_buf. Used before calling val_int and after + it is used to restore pointers to table->record[0]. + This routine is placed outside of partition code since it can be useful + also for other programs.
+*/ + +void set_key_field_ptr(KEY *key_info, const byte *new_buf, + const byte *old_buf) +{ + KEY_PART_INFO *key_part= key_info->key_part; + uint key_parts= key_info->key_parts, i= 0; + my_ptrdiff_t diff= (new_buf - old_buf); + DBUG_ENTER("set_key_field_ptr"); + + /* do-while: the first key part is moved even if key_parts is 0 */ + do + { + key_part->field->move_field(diff); + key_part++; + } while (++i < key_parts); + DBUG_VOID_RETURN; +} + diff --git a/sql/sql_select.cc b/sql/sql_select.cc index bc45c5fa3be..c54efe531ad 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -1277,6 +1277,9 @@ JOIN::exec() /* Copy data to the temporary table */ thd->proc_info= "Copying to tmp table"; DBUG_PRINT("info", ("%s", thd->proc_info)); + if (!curr_join->sort_and_group && + curr_join->const_tables != curr_join->tables) + curr_join->join_tab[curr_join->const_tables].sorted= 0; if ((tmp_error= do_select(curr_join, (List<Item> *) 0, curr_tmp_table, 0))) { error= tmp_error; @@ -1423,6 +1426,9 @@ JOIN::exec() 1, TRUE)) DBUG_VOID_RETURN; curr_join->group_list= 0; + if (!curr_join->sort_and_group && + curr_join->const_tables != curr_join->tables) + curr_join->join_tab[curr_join->const_tables].sorted= 0; if (setup_sum_funcs(curr_join->thd, curr_join->sum_funcs) || (tmp_error= do_select(curr_join, (List<Item> *) 0, curr_tmp_table, 0))) @@ -1608,6 +1614,16 @@ JOIN::exec() (select_options & OPTION_FOUND_ROWS ? HA_POS_ERROR : unit->select_limit_cnt))) DBUG_VOID_RETURN; + if (curr_join->const_tables != curr_join->tables && + !curr_join->join_tab[curr_join->const_tables].table->sort.io_cache) + { + /* + If no IO cache exists for the first table then we are using an + INDEX SCAN and no filesort. Thus we should not remove the sorted + attribute on the INDEX SCAN. + */ + skip_sort_order= 1; + } } } /* XXX: When can we have here thd->net.report_error not zero?
*/ @@ -5659,6 +5675,7 @@ make_join_readinfo(JOIN *join, uint options) uint i; bool statistics= test(!(join->select_options & SELECT_DESCRIBE)); + bool sorted= 1; DBUG_ENTER("make_join_readinfo"); for (i=join->const_tables ; i < join->tables ; i++) @@ -5668,6 +5685,8 @@ make_join_readinfo(JOIN *join, uint options) tab->read_record.table= table; tab->read_record.file=table->file; tab->next_select=sub_select; /* normal select */ + tab->sorted= sorted; + sorted= 0; // only first must be sorted switch (tab->type) { case JT_SYSTEM: // Only happens with left join table->status=STATUS_NO_RECORD; @@ -8915,7 +8934,12 @@ bool create_myisam_from_heap(THD *thd, TABLE *table, TMP_TABLE_PARAM *param, new_table.file->extra(HA_EXTRA_WRITE_CACHE); #endif - /* copy all old rows */ + /* + copy all old rows from heap table to MyISAM table + This is the only code that uses record[1] to read/write but this + is safe as this is a temporary MyISAM table without timestamp/autoincrement + or partitioning. + */ while (!table->file->rnd_next(new_table.record[1])) { if ((write_err=new_table.file->write_row(new_table.record[1]))) @@ -9046,7 +9070,7 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure) empty_record(table); if (table->group && join->tmp_table_param.sum_func_count && table->s->keys && !table->file->inited) - table->file->ha_index_init(0); + table->file->ha_index_init(0, 0); } /* Set up select_end */ join->join_tab[join->tables-1].next_select= setup_end_select_func(join); @@ -9660,7 +9684,13 @@ join_read_const_table(JOIN_TAB *tab, POSITION *pos) table->file->extra(HA_EXTRA_KEYREAD); tab->index= tab->ref.key; } - if ((error=join_read_const(tab))) + error=join_read_const(tab); + if (table->key_read) + { + table->key_read=0; + table->file->extra(HA_EXTRA_NO_KEYREAD); + } + if (error) { tab->info="unique row not found"; /* Mark for EXPLAIN that the row was not found */ @@ -9668,11 +9698,6 @@ join_read_const_table(JOIN_TAB *tab, POSITION *pos) if 
(!table->maybe_null || error > 0) DBUG_RETURN(error); } - if (table->key_read) - { - table->key_read=0; - table->file->extra(HA_EXTRA_NO_KEYREAD); - } } if (*tab->on_expr_ref && !table->null_row) { @@ -9744,7 +9769,7 @@ join_read_const(JOIN_TAB *tab) table->status= STATUS_NOT_FOUND; mark_as_null_row(tab->table); empty_record(table); - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); return -1; } @@ -9767,7 +9792,9 @@ join_read_key(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + { + table->file->ha_index_init(tab->ref.key, tab->sorted); + } if (cmp_buffer_with_ref(tab) || (table->status & (STATUS_GARBAGE | STATUS_NO_PARENT | STATUS_NULL_ROW))) { @@ -9779,7 +9806,7 @@ join_read_key(JOIN_TAB *tab) error=table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT); - if (error && error != HA_ERR_KEY_NOT_FOUND) + if (error && error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); } table->null_row=0; @@ -9794,14 +9821,16 @@ join_read_always_key(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + { + table->file->ha_index_init(tab->ref.key, tab->sorted); + } if (cp_buffer_from_ref(tab->join->thd, &tab->ref)) return -1; if ((error=table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT))) { - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); return -1; /* purecov: inspected */ } @@ -9821,14 +9850,14 @@ join_read_last_key(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, tab->sorted); if (cp_buffer_from_ref(tab->join->thd, &tab->ref)) return -1; if 
((error=table->file->index_read_last(table->record[0], tab->ref.key_buff, tab->ref.key_length))) { - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); return -1; /* purecov: inspected */ } @@ -9931,7 +9960,7 @@ join_read_first(JOIN_TAB *tab) tab->read_record.index=tab->index; tab->read_record.record=table->record[0]; if (!table->file->inited) - table->file->ha_index_init(tab->index); + table->file->ha_index_init(tab->index, tab->sorted); if ((error=tab->table->file->index_first(tab->table->record[0]))) { if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) @@ -9970,7 +9999,7 @@ join_read_last(JOIN_TAB *tab) tab->read_record.index=tab->index; tab->read_record.record=table->record[0]; if (!table->file->inited) - table->file->ha_index_init(tab->index); + table->file->ha_index_init(tab->index, 1); if ((error= tab->table->file->index_last(tab->table->record[0]))) return report_error(table, error); return 0; @@ -9994,7 +10023,7 @@ join_ft_read_first(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, 1); #if NOT_USED_YET if (cp_buffer_from_ref(tab->join->thd, &tab->ref)) // as ft-key doesn't use store_key's return -1; // see also FT_SELECT::init() @@ -10380,7 +10409,7 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), error, 0)) DBUG_RETURN(NESTED_LOOP_ERROR); // Not a table_is_full error /* Change method to update rows */ - table->file->ha_index_init(0); + table->file->ha_index_init(0, 0); join->join_tab[join->tables-1].next_select=end_unique_update; } join->send_records++; diff --git a/sql/sql_select.h b/sql/sql_select.h index 9285e33be33..ec424c48366 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -133,6 +133,7 @@ typedef struct st_join_table { uint used_fields,used_fieldlength,used_blobs; enum join_type type; bool 
cached_eq_ref_table,eq_ref_table,not_used_in_distinct; + bool sorted; TABLE_REF ref; JOIN_CACHE cache; JOIN *join; diff --git a/sql/sql_show.cc b/sql/sql_show.cc index 8343f9ec582..e786b70114d 100644 --- a/sql/sql_show.cc +++ b/sql/sql_show.cc @@ -963,11 +963,16 @@ store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) packet->append("\n)", 2); if (!(thd->variables.sql_mode & MODE_NO_TABLE_OPTIONS) && !foreign_db_mode) { - if (thd->variables.sql_mode & (MODE_MYSQL323 | MODE_MYSQL40)) - packet->append(" TYPE=", 6); - else - packet->append(" ENGINE=", 8); - packet->append(file->table_type()); +#ifdef HAVE_PARTITION_DB + if (!table->s->part_info) +#endif + { + if (thd->variables.sql_mode & (MODE_MYSQL323 | MODE_MYSQL40)) + packet->append(" TYPE=", 6); + else + packet->append(" ENGINE=", 8); + packet->append(file->table_type()); + } if (share->table_charset && !(thd->variables.sql_mode & MODE_MYSQL323) && @@ -1034,6 +1039,23 @@ store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) append_directory(thd, packet, "DATA", create_info.data_file_name); append_directory(thd, packet, "INDEX", create_info.index_file_name); } +#ifdef HAVE_PARTITION_DB + { + /* + Partition syntax for CREATE TABLE is at the end of the syntax. + */ + uint part_syntax_len; + char *part_syntax; + if (table->s->part_info && + ((part_syntax= generate_partition_syntax(table->s->part_info, + &part_syntax_len, + FALSE)))) + { + packet->append(part_syntax, part_syntax_len); + my_free(part_syntax, MYF(0)); + } + } +#endif DBUG_RETURN(0); } @@ -2728,7 +2750,7 @@ int fill_schema_proc(THD *thd, TABLE_LIST *tables, COND *cond) { DBUG_RETURN(1); } - proc_table->file->ha_index_init(0); + proc_table->file->ha_index_init(0, 1); if ((res= proc_table->file->index_first(proc_table->record[0]))) { res= (res == HA_ERR_END_OF_FILE) ? 
0 : 1; diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 1d7f3ca87be..21348908bf3 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -28,6 +28,7 @@ #include <io.h> #endif + const char *primary_key_name="PRIMARY"; static bool check_if_keyname_exists(const char *name,KEY *start, KEY *end); @@ -1513,7 +1514,66 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, if (create_info->row_type == ROW_TYPE_DYNAMIC) db_options|=HA_OPTION_PACK_RECORD; alias= table_case_name(create_info, table_name); - file=get_new_handler((TABLE*) 0, create_info->db_type); + if (!(file=get_new_handler((TABLE*) 0, create_info->db_type))) + { + my_error(ER_OUTOFMEMORY, MYF(0), 128);//128 bytes invented + DBUG_RETURN(TRUE); + } +#ifdef HAVE_PARTITION_DB + partition_info *part_info= thd->lex->part_info; + if (part_info) + { + /* + The table has been specified as a partitioned table. + If this is part of an ALTER TABLE the handler will be the partition + handler but we need to specify the default handler to use for + partitions also in the call to check_partition_info. We transport + this information in the default_db_type variable, it is either + DB_TYPE_DEFAULT or the engine set in the ALTER TABLE command. + */ + enum db_type part_engine_type= create_info->db_type; + char *part_syntax_buf; + uint syntax_len; + if (part_engine_type == DB_TYPE_PARTITION_DB) + { + /* + This only happens at ALTER TABLE. + default_engine_type was assigned from the engine set in the ALTER + TABLE command. + */ + part_engine_type= ha_checktype(thd, + part_info->default_engine_type, 0, 0); + } + if (check_partition_info(part_info, part_engine_type, + file, create_info->max_rows)) + DBUG_RETURN(TRUE); + /* + We reverse the partitioning parser and generate a standard format + for syntax stored in frm file. 
+ */ + if (!(part_syntax_buf= generate_partition_syntax(part_info, + &syntax_len, + TRUE))) + DBUG_RETURN(TRUE); + part_info->part_info_string= part_syntax_buf; + part_info->part_info_len= syntax_len; + if ((!(file->partition_flags() & HA_CAN_PARTITION)) || + create_info->db_type == DB_TYPE_PARTITION_DB) + { + /* + The handler assigned to the table cannot handle partitioning. + Assign the partition handler as the handler of the table. + */ + DBUG_PRINT("info", ("db_type= %d, part_flag= %d", create_info->db_type,file->partition_flags())); + delete file; + create_info->db_type= DB_TYPE_PARTITION_DB; + if (!(file= get_ha_partition(part_info))) + { + DBUG_RETURN(TRUE); + } + } + } +#endif #ifdef NOT_USED /* @@ -1527,7 +1587,7 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, (file->table_flags() & HA_NO_TEMP_TABLES)) { my_error(ER_ILLEGAL_HA, MYF(0), table_name); - DBUG_RETURN(TRUE); + goto err; } #endif @@ -1550,7 +1610,7 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, &keys, internal_tmp_table, &db_options, file, &key_info_buffer, &key_count, select_field_count)) - DBUG_RETURN(TRUE); + goto err; /* Check if table exists */ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) @@ -1572,13 +1632,13 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS) { create_info->table_existed= 1; // Mark that table existed - DBUG_RETURN(FALSE); + goto no_err; } my_error(ER_TABLE_EXISTS_ERROR, MYF(0), alias); - DBUG_RETURN(TRUE); + goto err; } if (wait_if_global_read_lock(thd, 0, 1)) - DBUG_RETURN(error); + goto err; VOID(pthread_mutex_lock(&LOCK_open)); if (!internal_tmp_table && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) { @@ -1631,7 +1691,7 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, create_info->table_options=db_options; if (rea_create_table(thd, path, create_info, fields, key_count, - key_info_buffer)) + 
key_info_buffer, file)) { /* my_error(ER_CANT_CREATE_TABLE,MYF(0),table_name,my_errno); */ goto end; @@ -1660,6 +1720,13 @@ end: delete file; thd->proc_info="After create"; DBUG_RETURN(error); + +err: + delete file; + DBUG_RETURN(TRUE); +no_err: + delete file; + DBUG_RETURN(FALSE); } /* @@ -3138,6 +3205,59 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, old_db_type= table->s->db_type; if (create_info->db_type == DB_TYPE_DEFAULT) create_info->db_type= old_db_type; +#ifdef HAVE_PARTITION_DB + /* + When thd->lex->part_info has a reference to a partition_info the + ALTER TABLE contained a definition of a partitioning. + + Case I: + If there was a partition before and there is a new one defined. + We use the new partitioning. The new partitioning is already + defined in the correct variable so no work is needed to + accomplish this. + + Case IIa: + There was a partitioning before and there is no new one defined. + Also the user has not specified an explicit engine to use. + + We use the old partitioning also for the new table. We do this + by assigning the partition_info from the table loaded in + open_ltable to the partition_info struct used by mysql_create_table + later in this method. + + Case IIb: + There was a partitioning before and there is no new one defined. + The user has specified an explicit engine to use. + + Since the user has specified an explicit engine to use we override + the old partitioning info and create a new table using the specified + engine. This is the reason for the extra check if old and new engine + is equal. + + Case III: + There was no partitioning before altering the table, there is + partitioning defined in the altered table. Use the new partitioning. + No work needed since the partitioning info is already in the + correct variable. + + Case IV: + There was no partitioning before and no partitioning defined. Obviously + no work needed. 
+ */ + if (table->s->part_info) + if (!thd->lex->part_info && + create_info->db_type == old_db_type) + thd->lex->part_info= table->s->part_info; + if (thd->lex->part_info) + { + /* + Need to cater for engine types that can handle partition without + using the partition handler. + */ + thd->lex->part_info->default_engine_type= create_info->db_type; + create_info->db_type= DB_TYPE_PARTITION_DB; + } +#endif if (check_engine(thd, new_name, &create_info->db_type)) DBUG_RETURN(TRUE); new_db_type= create_info->db_type; diff --git a/sql/sql_update.cc b/sql/sql_update.cc index 5c6324e15fd..95d0f500df8 100644 --- a/sql/sql_update.cc +++ b/sql/sql_update.cc @@ -148,7 +148,7 @@ int mysql_update(THD *thd, /* pass counter value */ thd->lex->table_count= table_count; /* convert to multiupdate */ - return 2; + DBUG_RETURN(2); } if (lock_tables(thd, table_list, table_count) || @@ -265,7 +265,12 @@ int mysql_update(THD *thd, else used_key_is_modified=0; +#ifdef HAVE_PARTITION_DB + if (used_key_is_modified || order || + partition_key_modified(table, fields)) +#else if (used_key_is_modified || order) +#endif { /* We can't update table directly; We must first search after all @@ -452,8 +457,8 @@ int mysql_update(THD *thd, call then it should be included in the count of dup_key_found and error should be set to 0 (only if these errors are ignored). 
*/ - error= table->file->bulk_update_row(table->record[0], - table->record[1], + error= table->file->bulk_update_row(table->record[1], + table->record[0], &dup_key_found); limit+= dup_key_found; updated-= dup_key_found; diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index 92680e48945..e4259197292 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -356,13 +356,16 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token LEAVES %token LEAVE_SYM %token LEFT +%token LESS_SYM %token LEVEL_SYM %token LEX_HOSTNAME %token LIKE %token LIMIT +%token LINEAR_SYM %token LINEFROMTEXT %token LINES %token LINESTRING +%token LIST_SYM %token LOAD %token LOCAL_SYM %token LOCATE @@ -402,6 +405,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token MAX_SYM %token MAX_UPDATES_PER_HOUR %token MAX_USER_CONNECTIONS_SYM +%token MAX_VALUE_SYM %token MEDIUMBLOB %token MEDIUMINT %token MEDIUMTEXT @@ -436,6 +440,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token NE %token NEW_SYM %token NEXT_SYM +%token NODEGROUP_SYM %token NONE_SYM %token NOT2_SYM %token NOT_SYM @@ -464,6 +469,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token OUT_SYM %token PACK_KEYS_SYM %token PARTIAL +%token PARTITION_SYM +%token PARTITIONS_SYM %token PASSWORD %token PARAM_MARKER %token PHASE_SYM @@ -490,6 +497,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token RAID_STRIPED_SYM %token RAID_TYPE %token RAND +%token RANGE_SYM %token READS_SYM %token READ_SYM %token REAL @@ -575,6 +583,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token STRING_SYM %token SUBDATE_SYM %token SUBJECT_SYM +%token SUBPARTITION_SYM +%token SUBPARTITIONS_SYM %token SUBSTRING %token SUBSTRING_INDEX %token SUM_SYM @@ -595,6 +605,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token TINYBLOB %token TINYINT %token TINYTEXT +%token THAN_SYM %token TO_SYM %token TRAILING %token TRANSACTION_SYM @@ 
-618,11 +629,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token UNIX_TIMESTAMP %token UNKNOWN_SYM %token UNLOCK_SYM -%token UNLOCK_SYM %token UNSIGNED %token UNTIL_SYM -%token UNTIL_SYM -%token UPDATE_SYM %token UPDATE_SYM %token USAGE %token USER @@ -723,6 +731,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); sp_opt_default simple_ident_nospvar simple_ident_q field_or_var limit_option + part_bit_expr part_func_expr %type <item_num> NUM_literal @@ -821,6 +830,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); statement sp_suid opt_view_list view_list or_replace algorithm sp_c_chistics sp_a_chistics sp_chistic sp_c_chistic xa load_data opt_field_or_var_spec fields_or_vars opt_load_data_set_spec + partition_entry END_OF_INPUT %type <NONE> call sp_proc_stmts sp_proc_stmts1 sp_proc_stmt @@ -886,6 +896,7 @@ statement: | lock | optimize | keycache + | partition_entry | preload | prepare | purge @@ -2538,7 +2549,9 @@ trg_event: create2: '(' create2a {} - | opt_create_table_options create3 {} + | opt_create_table_options + opt_partitioning {} + create3 {} | LIKE table_ident { LEX *lex=Lex; @@ -2554,8 +2567,12 @@ create2: ; create2a: - field_list ')' opt_create_table_options create3 {} - | create_select ')' { Select->set_braces(1);} union_opt {} + field_list ')' opt_create_table_options + opt_partitioning {} + create3 {} + | opt_partitioning {} + create_select ')' + { Select->set_braces(1);} union_opt {} ; create3: @@ -2566,6 +2583,411 @@ create3: { Select->set_braces(1);} union_opt {} ; +/* + This part of the parser is about handling of the partition information. + + It's first version was written by Mikael Ronström with lots of answers to + questions provided by Antony Curtis. + + The partition grammar can be called from three places. + 1) CREATE TABLE ... PARTITION .. + 2) ALTER TABLE table_name PARTITION ... + 3) PARTITION ... + + The first place is called when a new table is created from a MySQL client. 
+ The second place is called when a table is altered with the ALTER TABLE + command from a MySQL client. + The third place is called when opening an frm file and finding partition + info in the .frm file. It is necessary to avoid allowing PARTITION to be + an allowed entry point for SQL client queries. This is arranged by setting + some state variables before arriving here. + + To be able to handle errors we will only set error code in this code + and handle the error condition in the function calling the parser. This + is necessary to ensure we can also handle errors when calling the parser + from the openfrm function. +*/ +opt_partitioning: + /* empty */ {} + | partitioning + ; + +partitioning: + PARTITION_SYM + { Lex->part_info= new partition_info(); } + partition + ; + +partition_entry: + PARTITION_SYM + { + LEX *lex= Lex; + if (lex->part_info) + { + /* + We enter here when opening the frm file to translate + partition info string into part_info data structure. + */ + lex->part_info= new partition_info(); + } + else + { + yyerror(ER(ER_PARTITION_ENTRY_ERROR)); + YYABORT; + } + } + partition {}; + +partition: + BY part_type_def opt_no_parts {} opt_sub_part {} part_defs; + +part_type_def: + opt_linear KEY_SYM '(' part_field_list ')' + { + LEX *lex= Lex; + lex->part_info->list_of_part_fields= TRUE; + lex->part_info->part_type= HASH_PARTITION; + } + | opt_linear HASH_SYM + { Lex->part_info->part_type= HASH_PARTITION; } + part_func {} + | RANGE_SYM + { Lex->part_info->part_type= RANGE_PARTITION; } + part_func {} + | LIST_SYM + { Lex->part_info->part_type= LIST_PARTITION; } + part_func {}; + +opt_linear: + /* empty */ {} + | LINEAR_SYM + { Lex->part_info->linear_hash_ind= TRUE;}; + +part_field_list: + part_field_item {} + | part_field_list ',' part_field_item {}; + +part_field_item: + ident + { + Lex->part_info->part_field_list.push_back($1.str); + }; + +part_func: + '(' remember_name part_func_expr remember_end ')' + { + LEX *lex= Lex; + uint expr_len= (uint)($4 - 
$2) - 1; + lex->part_info->list_of_part_fields= FALSE; + lex->part_info->part_expr= $3; + lex->part_info->part_func_string= $2+1; + lex->part_info->part_func_len= expr_len; + }; + +sub_part_func: + '(' remember_name part_func_expr remember_end ')' + { + LEX *lex= Lex; + uint expr_len= (uint)($4 - $2) - 1; + lex->part_info->list_of_subpart_fields= FALSE; + lex->part_info->subpart_expr= $3; + lex->part_info->subpart_func_string= $2+1; + lex->part_info->subpart_func_len= expr_len; + }; + + +opt_no_parts: + /* empty */ {} + | PARTITIONS_SYM ulong_num + { + uint no_parts= $2; + if (no_parts == 0) + { + my_error(ER_NO_PARTS_ERROR, MYF(0), "partitions"); + YYABORT; + } + Lex->part_info->no_parts= no_parts; + }; + +opt_sub_part: + /* empty */ {} + | SUBPARTITION_SYM BY opt_linear HASH_SYM sub_part_func + { Lex->part_info->subpart_type= HASH_PARTITION; } + opt_no_subparts {} + | SUBPARTITION_SYM BY opt_linear KEY_SYM + '(' sub_part_field_list ')' + { + LEX *lex= Lex; + lex->part_info->subpart_type= HASH_PARTITION; + lex->part_info->list_of_subpart_fields= TRUE; + } + opt_no_subparts {}; + +sub_part_field_list: + sub_part_field_item {} + | sub_part_field_list ',' sub_part_field_item {}; + +sub_part_field_item: + ident + { Lex->part_info->subpart_field_list.push_back($1.str); }; + +part_func_expr: + bit_expr + { + LEX *lex= Lex; + bool not_corr_func; + not_corr_func= !lex->safe_to_cache_query; + lex->safe_to_cache_query= 1; + if (not_corr_func) + { + yyerror(ER(ER_CONST_EXPR_IN_PARTITION_FUNC_ERROR)); + YYABORT; + } + $$=$1; + } + +opt_no_subparts: + /* empty */ {} + | SUBPARTITIONS_SYM ulong_num + { + uint no_parts= $2; + if (no_parts == 0) + { + my_error(ER_NO_PARTS_ERROR, MYF(0), "subpartitions"); + YYABORT; + } + Lex->part_info->no_subparts= no_parts; + }; + +part_defs: + /* empty */ + {} + | '(' part_def_list ')' + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + if (part_info->no_parts != 0) + { + if (part_info->no_parts != + 
part_info->count_curr_parts) + { + yyerror(ER(ER_PARTITION_WRONG_NO_PART_ERROR)); + YYABORT; + } + } + else if (part_info->count_curr_parts > 0) + { + part_info->no_parts= part_info->count_curr_parts; + } + part_info->count_curr_subparts= 0; + part_info->count_curr_parts= 0; + }; + +part_def_list: + part_definition {} + | part_def_list ',' part_definition {}; + +part_definition: + PARTITION_SYM + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + partition_element *p_elem= new partition_element(); + if (!p_elem) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + YYABORT; + } + part_info->curr_part_elem= p_elem; + part_info->current_partition= p_elem; + part_info->use_default_partitions= FALSE; + part_info->partitions.push_back(p_elem); + p_elem->engine_type= DB_TYPE_UNKNOWN; + part_info->count_curr_parts++; + } + part_name {} + opt_part_values {} + opt_part_options {} + opt_sub_partition {}; + +part_name: + ident_or_text + { Lex->part_info->curr_part_elem->partition_name= $1.str; }; + +opt_part_values: + /* empty */ + { + LEX *lex= Lex; + if (lex->part_info->part_type == RANGE_PARTITION) + { + my_error(ER_PARTITION_REQUIRES_VALUES_ERROR, MYF(0), + "RANGE", "LESS THAN"); + YYABORT; + } + if (lex->part_info->part_type == LIST_PARTITION) + { + my_error(ER_PARTITION_REQUIRES_VALUES_ERROR, MYF(0), + "LIST", "IN"); + YYABORT; + } + } + | VALUES LESS_SYM THAN_SYM part_func_max + { + if (Lex->part_info->part_type != RANGE_PARTITION) + { + my_error(ER_PARTITION_WRONG_VALUES_ERROR, MYF(0), + "RANGE", "LESS THAN"); + YYABORT; + } + } + | VALUES IN_SYM '(' part_list_func ')' + { + if (Lex->part_info->part_type != LIST_PARTITION) + { + my_error(ER_PARTITION_WRONG_VALUES_ERROR, MYF(0), + "LIST", "IN"); + YYABORT; + } + }; + +part_func_max: + MAX_VALUE_SYM + { + LEX *lex= Lex; + if (lex->part_info->defined_max_value) + { + yyerror(ER(ER_PARTITION_MAXVALUE_ERROR)); + YYABORT; + } + lex->part_info->defined_max_value= TRUE; + } + | part_range_func + 
{ + if (Lex->part_info->defined_max_value) + { + yyerror(ER(ER_PARTITION_MAXVALUE_ERROR)); + YYABORT; + } + }; + +part_range_func: + '(' part_bit_expr ')' + { + Lex->part_info->curr_part_elem->range_expr= $2; + }; + +part_list_func: + part_list_item {} + | part_list_func ',' part_list_item {}; + +part_list_item: + part_bit_expr + { + Lex->part_info->curr_part_elem->list_expr_list.push_back($1); + }; + +part_bit_expr: + bit_expr + { + Item *part_expr= $1; + bool not_corr_func; + LEX *lex= Lex; + Name_resolution_context *context= &lex->current_select->context; + TABLE_LIST *save_list= context->table_list; + + context->table_list= 0; + part_expr->fix_fields(YYTHD, (Item**)0); + context->table_list= save_list; + not_corr_func= !part_expr->const_item() || + !lex->safe_to_cache_query; + lex->safe_to_cache_query= 1; + if (not_corr_func) + { + yyerror(ER(ER_NO_CONST_EXPR_IN_RANGE_OR_LIST_ERROR)); + YYABORT; + } + $$= part_expr; + } + +opt_sub_partition: + /* empty */ {} + | '(' sub_part_list ')' + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + if (part_info->no_subparts != 0) + { + if (part_info->no_subparts != + part_info->count_curr_subparts) + { + yyerror(ER(ER_PARTITION_WRONG_NO_SUBPART_ERROR)); + YYABORT; + } + } + else if (part_info->count_curr_subparts > 0) + { + part_info->no_subparts= part_info->count_curr_subparts; + } + part_info->count_curr_subparts= 0; + }; + +sub_part_list: + sub_part_definition {} + | sub_part_list ',' sub_part_definition {}; + +sub_part_definition: + SUBPARTITION_SYM + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + partition_element *p_elem= new partition_element(); + if (!p_elem) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + YYABORT; + } + part_info->curr_part_elem= p_elem; + part_info->current_partition->subpartitions.push_back(p_elem); + part_info->use_default_subpartitions= FALSE; + part_info->count_curr_subparts++; + p_elem->engine_type= DB_TYPE_UNKNOWN; + } + sub_name 
opt_part_options {}; + +sub_name: + ident_or_text + { Lex->part_info->curr_part_elem->partition_name= $1.str; }; + +opt_part_options: + /* empty */ {} + | opt_part_option_list {}; + +opt_part_option_list: + opt_part_option_list opt_part_option {} + | opt_part_option {}; + +opt_part_option: + TABLESPACE opt_equal ident_or_text + { Lex->part_info->curr_part_elem->tablespace_name= $3.str; } + | opt_storage ENGINE_SYM opt_equal storage_engines + { Lex->part_info->curr_part_elem->engine_type= $4; } + | NODEGROUP_SYM opt_equal ulong_num + { Lex->part_info->curr_part_elem->nodegroup_id= $3; } + | MAX_ROWS opt_equal ulonglong_num + { Lex->part_info->curr_part_elem->part_max_rows= $3; } + | MIN_ROWS opt_equal ulonglong_num + { Lex->part_info->curr_part_elem->part_min_rows= $3; } + | DATA_SYM DIRECTORY_SYM opt_equal TEXT_STRING_sys + { Lex->part_info->curr_part_elem->data_file_name= $4.str; } + | INDEX_SYM DIRECTORY_SYM opt_equal TEXT_STRING_sys + { Lex->part_info->curr_part_elem->index_file_name= $4.str; } + | COMMENT_SYM opt_equal TEXT_STRING_sys + { Lex->part_info->curr_part_elem->part_comment= $3.str; }; + +/* + End of partition parser part +*/ + create_select: SELECT_SYM { @@ -3338,7 +3760,7 @@ alter: lex->alter_info.reset(); lex->alter_info.flags= 0; } - alter_list + alter_commands {} | ALTER DATABASE ident_or_empty { @@ -3404,11 +3826,18 @@ ident_or_empty: /* empty */ { $$= 0; } | ident { $$= $1.str; }; -alter_list: +alter_commands: | DISCARD TABLESPACE { Lex->alter_info.tablespace_op= DISCARD_TABLESPACE; } | IMPORT TABLESPACE { Lex->alter_info.tablespace_op= IMPORT_TABLESPACE; } - | alter_list_item - | alter_list ',' alter_list_item; + | alter_list + opt_partitioning + | partitioning + ; + +alter_list: + alter_list_item + | alter_list ',' alter_list_item + ; add_column: ADD opt_column @@ -7363,6 +7792,7 @@ keyword: | LANGUAGE_SYM {} | NO_SYM {} | OPEN_SYM {} + | PARTITION_SYM {} | PREPARE_SYM {} | REPAIR {} | RESET_SYM {} @@ -7463,8 +7893,10 @@ keyword_sp: | 
RELAY_THREAD {} | LAST_SYM {} | LEAVES {} + | LESS_SYM {} | LEVEL_SYM {} | LINESTRING {} + | LIST_SYM {} | LOCAL_SYM {} | LOCKS_SYM {} | LOGS_SYM {} @@ -7488,6 +7920,7 @@ keyword_sp: | MAX_QUERIES_PER_HOUR {} | MAX_UPDATES_PER_HOUR {} | MAX_USER_CONNECTIONS_SYM {} + | MAX_VALUE_SYM {} | MEDIUM_SYM {} | MERGE_SYM {} | MICROSECOND_SYM {} @@ -7508,6 +7941,7 @@ keyword_sp: | NDBCLUSTER_SYM {} | NEXT_SYM {} | NEW_SYM {} + | NODEGROUP_SYM {} | NONE_SYM {} | NVARCHAR_SYM {} | OFFSET_SYM {} @@ -7516,6 +7950,7 @@ keyword_sp: | ONE_SYM {} | PACK_KEYS_SYM {} | PARTIAL {} + | PARTITIONS_SYM {} | PASSWORD {} | PHASE_SYM {} | POINT_SYM {} @@ -7566,6 +8001,8 @@ keyword_sp: | STRING_SYM {} | SUBDATE_SYM {} | SUBJECT_SYM {} + | SUBPARTITION_SYM {} + | SUBPARTITIONS_SYM {} | SUPER_SYM {} | SUSPEND_SYM {} | TABLES {} @@ -7573,6 +8010,7 @@ keyword_sp: | TEMPORARY {} | TEMPTABLE_SYM {} | TEXT_SYM {} + | THAN_SYM {} | TRANSACTION_SYM {} | TIMESTAMP {} | TIMESTAMP_ADD {} diff --git a/sql/table.cc b/sql/table.cc index bbf34ae2c25..8852e1fa9dd 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -70,7 +70,7 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, int j,error, errarg= 0; uint rec_buff_length,n_length,int_length,records,key_parts,keys, interval_count,interval_parts,read_length,db_create_options; - uint key_info_length, com_length; + uint key_info_length, com_length, part_info_len, extra_rec_buf_length; ulong pos; char index_file[FN_REFLEN], *names, *keynames, *comment_pos; uchar head[288],*disk_buff,new_field_pack_flag; @@ -153,6 +153,7 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, goto err; /* purecov: inspected */ *fn_ext(index_file)='\0'; // Remove .frm extension + part_info_len= uint4korr(head+55); share->frm_version= head[2]; /* Check if .frm file created by MySQL 5.0. 
In this case we want to @@ -300,10 +301,6 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, } #endif - /* Allocate handler */ - if (!(outparam->file= get_new_handler(outparam, share->db_type))) - goto err; - error=4; outparam->reginfo.lock_type= TL_UNLOCK; outparam->current_lock=F_UNLCK; @@ -314,8 +311,9 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (prgflag & (READ_ALL+EXTRA_RECORD)) records++; /* QQ: TODO, remove the +1 from below */ + extra_rec_buf_length= uint2korr(head+59); rec_buff_length= ALIGN_SIZE(share->reclength + 1 + - outparam->file->extra_rec_buf_length()); + extra_rec_buf_length); share->rec_buff_length= rec_buff_length; if (!(record= (char *) alloc_root(&outparam->mem_root, rec_buff_length * records))) @@ -435,9 +433,22 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (keynames) fix_type_pointers(&int_array, &share->keynames, 1, &keynames); + if (part_info_len > 0) + { +#ifdef HAVE_PARTITION_DB + if (mysql_unpack_partition(file, thd, part_info_len, outparam)) + goto err; +#else + goto err; +#endif + } VOID(my_close(file,MYF(MY_WME))); file= -1; + /* Allocate handler */ + if (!(outparam->file= get_new_handler(outparam, share->db_type))) + goto err; + record= (char*) outparam->record[0]-1; /* Fieldstart = 1 */ if (null_field_first) { @@ -859,6 +870,13 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (outparam->file->ha_allocate_read_write_set(share->fields)) goto err; + /* Fix the partition functions and ensure they are not constant functions*/ + if (part_info_len > 0) +#ifdef HAVE_PARTITION_DB + if (fix_partition_func(thd,name,outparam)) +#endif + goto err; + /* The table struct is now initialized; Open the table */ error=2; if (db_stat) @@ -916,6 +934,13 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (! 
error_reported) frm_error(error,outparam,name,ME_ERROR+ME_WAITTANG, errarg); delete outparam->file; +#ifdef HAVE_PARTITION_DB + if (outparam->s->part_info) + { + free_items(outparam->s->part_info->item_free_list); + outparam->s->part_info->item_free_list= 0; + } +#endif outparam->file=0; // For easier errorchecking outparam->db_stat=0; hash_free(&share->name_hash); @@ -942,6 +967,13 @@ int closefrm(register TABLE *table) table->field= 0; } delete table->file; +#ifdef HAVE_PARTITION_DB + if (table->s->part_info) + { + free_items(table->s->part_info->item_free_list); + table->s->part_info->item_free_list= 0; + } +#endif table->file= 0; /* For easier errorchecking */ hash_free(&table->s->name_hash); free_root(&table->mem_root, MYF(0)); diff --git a/sql/table.h b/sql/table.h index e5653a1f213..6ba9453b2f0 100644 --- a/sql/table.h +++ b/sql/table.h @@ -21,6 +21,7 @@ class Item; /* Needed by ORDER */ class GRANT_TABLE; class st_select_lex_unit; class st_select_lex; +class partition_info; class COND_EQUAL; /* Order clause list element */ @@ -96,6 +97,9 @@ class Table_triggers_list; typedef struct st_table_share { +#ifdef HAVE_PARTITION_DB + partition_info *part_info; /* Partition related information */ +#endif /* hash of field names (contains pointers to elements of field array) */ HASH name_hash; /* hash of field names */ MEM_ROOT mem_root; @@ -203,6 +207,8 @@ struct st_table { ORDER *group; const char *alias; /* alias or table name */ uchar *null_flags; + MY_BITMAP *read_set; + MY_BITMAP *write_set; query_id_t query_id; ha_rows quick_rows[MAX_KEY]; @@ -256,6 +262,7 @@ struct st_table { my_bool auto_increment_field_not_null; my_bool insert_or_update; /* Can be used by the handler */ my_bool alias_name_used; /* true if table_name is alias */ + my_bool get_fields_in_item_tree; /* Signal to fix_field */ REGINFO reginfo; /* field connections */ MEM_ROOT mem_root; diff --git a/sql/tztime.cc b/sql/tztime.cc index f5111459da2..bb516731440 100644 --- a/sql/tztime.cc +++ 
b/sql/tztime.cc @@ -1623,7 +1623,7 @@ my_tz_init(THD *org_thd, const char *default_tzname, my_bool bootstrap) mysql.time_zone* tables are MyISAM and these operations always succeed for MyISAM. */ - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); tz_leapcnt= 0; res= table->file->index_first(table->record[0]); @@ -1800,7 +1800,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) mysql.time_zone* tables are MyISAM and these operations always succeed for MyISAM. */ - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); if (table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, 0, HA_READ_KEY_EXACT)) @@ -1827,7 +1827,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) table= tz_tables->table; tz_tables= tz_tables->next_local; table->field[0]->store((longlong)tzid); - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); if (table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, 0, HA_READ_KEY_EXACT)) @@ -1854,7 +1854,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) table= tz_tables->table; tz_tables= tz_tables->next_local; table->field[0]->store((longlong)tzid); - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); // FIXME Is there any better approach than explicitly specifying 4 ??? res= table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, @@ -1926,7 +1926,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) */ table= tz_tables->table; table->field[0]->store((longlong)tzid); - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); // FIXME Is there any better approach than explicitly specifying 4 ??? 
res= table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, diff --git a/sql/unireg.cc b/sql/unireg.cc index 2ea79d92e37..cdbae4f1eb9 100644 --- a/sql/unireg.cc +++ b/sql/unireg.cc @@ -46,7 +46,8 @@ static bool pack_fields(File file, List<create_field> &create_fields, static bool make_empty_rec(THD *thd, int file, enum db_type table_type, uint table_options, List<create_field> &create_fields, - uint reclength, ulong data_offset); + uint reclength, ulong data_offset, + handler *handler); /* Create a frm (table definition) file @@ -79,13 +80,18 @@ bool mysql_create_frm(THD *thd, my_string file_name, uchar fileinfo[64],forminfo[288],*keybuff; TYPELIB formnames; uchar *screen_buff; +#ifdef HAVE_PARTITION_DB + partition_info *part_info= thd->lex->part_info; +#endif DBUG_ENTER("mysql_create_frm"); +#ifdef HAVE_PARTITION_DB + thd->lex->part_info= NULL; +#endif formnames.type_names=0; if (!(screen_buff=pack_screens(create_fields,&info_length,&screens,0))) DBUG_RETURN(1); - if (db_file == NULL) - db_file= get_new_handler((TABLE*) 0, create_info->db_type); + DBUG_ASSERT(db_file != NULL); /* If fixed row records, we need one bit to check for deleted rows */ if (!(create_info->table_options & HA_OPTION_PACK_RECORD)) @@ -136,6 +142,13 @@ bool mysql_create_frm(THD *thd, my_string file_name, 60); forminfo[46]=(uchar) strlen((char*)forminfo+47); // Length of comment +#ifdef HAVE_PARTITION_DB + if (part_info) + { + int4store(fileinfo+55,part_info->part_info_len); + } +#endif + int2store(fileinfo+59,db_file->extra_rec_buf_length()); if (my_pwrite(file,(byte*) fileinfo,64,0L,MYF_RW) || my_pwrite(file,(byte*) keybuff,key_info_length, (ulong) uint2korr(fileinfo+6),MYF_RW)) @@ -144,7 +157,7 @@ bool mysql_create_frm(THD *thd, my_string file_name, (ulong) uint2korr(fileinfo+6)+ (ulong) key_buff_length, MY_SEEK_SET,MYF(0))); if (make_empty_rec(thd,file,create_info->db_type,create_info->table_options, - create_fields,reclength, data_offset)) + create_fields,reclength, 
data_offset, db_file)) goto err; VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0))); @@ -153,6 +166,14 @@ bool mysql_create_frm(THD *thd, my_string file_name, pack_fields(file, create_fields, data_offset)) goto err; +#ifdef HAVE_PARTITION_DB + if (part_info) + { + if (my_write(file, (byte*) part_info->part_info_string, + part_info->part_info_len, MYF_RW)) + goto err; + } +#endif #ifdef HAVE_CRYPTED_FRM if (create_info->password) { @@ -211,15 +232,14 @@ err3: Create a frm (table definition) file and the tables SYNOPSIS - mysql_create_frm() + rea_create_table() thd Thread handler file_name Name of file (including database and .frm) create_info create info parameters create_fields Fields to create keys number of keys to create key_info Keys to create - db_file Handler to use. May be zero, in which case we use - create_info->db_type + file Handler to use. RETURN 0 ok 1 error @@ -228,19 +248,21 @@ err3: int rea_create_table(THD *thd, my_string file_name, HA_CREATE_INFO *create_info, List<create_field> &create_fields, - uint keys, KEY *key_info) + uint keys, KEY *key_info, handler *file) { DBUG_ENTER("rea_create_table"); if (mysql_create_frm(thd, file_name, create_info, - create_fields, keys, key_info, NULL)) + create_fields, keys, key_info, file)) DBUG_RETURN(1); + if (file->create_handler_files(file_name)) + goto err_handler; if (!create_info->frm_only && ha_create_table(file_name,create_info,0)) - { - my_delete(file_name,MYF(0)); - DBUG_RETURN(1); - } + goto err_handler; DBUG_RETURN(0); +err_handler: + my_delete(file_name, MYF(0)); + DBUG_RETURN(1); } /* rea_create_table */ @@ -664,7 +686,8 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, uint table_options, List<create_field> &create_fields, uint reclength, - ulong data_offset) + ulong data_offset, + handler *handler) { int error; Field::utype type; @@ -672,19 +695,15 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, uchar *buff,*null_pos; TABLE table; create_field 
*field; - handler *handler; enum_check_fields old_count_cuted_fields= thd->count_cuted_fields; DBUG_ENTER("make_empty_rec"); /* We need a table to generate columns for default values */ bzero((char*) &table,sizeof(table)); table.s= &table.share_not_to_be_used; - handler= get_new_handler((TABLE*) 0, table_type); - if (!handler || - !(buff=(uchar*) my_malloc((uint) reclength,MYF(MY_WME | MY_ZEROFILL)))) + if (!(buff=(uchar*) my_malloc((uint) reclength,MYF(MY_WME | MY_ZEROFILL)))) { - delete handler; DBUG_RETURN(1); } @@ -771,7 +790,6 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, err: my_free((gptr) buff,MYF(MY_FAE)); - delete handler; thd->count_cuted_fields= old_count_cuted_fields; DBUG_RETURN(error); } /* make_empty_rec */ diff --git a/sql/unireg.h b/sql/unireg.h index 8d88683241b..aafb96ef7c3 100644 --- a/sql/unireg.h +++ b/sql/unireg.h @@ -80,6 +80,7 @@ #define PSEUDO_TABLE_BITS (PARAM_TABLE_BIT | OUTER_REF_TABLE_BIT | \ RAND_TABLE_BIT) #define MAX_FIELDS 4096 /* Limit in the .frm file */ +#define MAX_PARTITIONS 1024 #define MAX_SORT_MEMORY (2048*1024-MALLOC_OVERHEAD) #define MIN_SORT_MEMORY (32*1024-MALLOC_OVERHEAD) |