author      Sergei Golubchik <sergii@pisem.net>    2013-07-21 16:39:19 +0200
committer   Sergei Golubchik <sergii@pisem.net>    2013-07-21 16:39:19 +0200
commit      b7b5f6f1ab49948b0e15b762266d4640b3d6b7fb (patch)
tree        7c302c2025184dbd053aa6135f0ff28c8ce6f359 /storage/innobase/row
parent      5f6380adde2dac3f32b40339b9b702c0135eb7d6 (diff)
parent      c1d6a2d7e194225ccc19a68ea5d0f368632620d0 (diff)
download    mariadb-git-b7b5f6f1ab49948b0e15b762266d4640b3d6b7fb.tar.gz
10.0-monty merge
includes:
* remove some remnants of "Bug#14521864: MYSQL 5.1 TO 5.5 BUGS PARTITIONING"
* introduce LOCK_share, now LOCK_ha_data is strictly for engines
* rea_create_table() always creates .par file (even in "frm-only" mode)
* fix a 5.6 bug, temp file leak on dummy ALTER TABLE
Diffstat (limited to 'storage/innobase/row')
-rw-r--r--  storage/innobase/row/row0ext.cc        2
-rw-r--r--  storage/innobase/row/row0ftsort.cc     216
-rw-r--r--  storage/innobase/row/row0import.cc     3806
-rw-r--r--  storage/innobase/row/row0ins.cc        1234
-rw-r--r--  storage/innobase/row/row0log.cc        3219
-rw-r--r--  storage/innobase/row/row0merge.cc      2358
-rw-r--r--  storage/innobase/row/row0mysql.cc      1670
-rw-r--r--  storage/innobase/row/row0purge.cc      443
-rw-r--r--  storage/innobase/row/row0quiesce.cc    702
-rw-r--r--  storage/innobase/row/row0row.cc        199
-rw-r--r--  storage/innobase/row/row0sel.cc        348
-rw-r--r--  storage/innobase/row/row0uins.cc       194
-rw-r--r--  storage/innobase/row/row0umod.cc       508
-rw-r--r--  storage/innobase/row/row0undo.cc       21
-rw-r--r--  storage/innobase/row/row0upd.cc        507
-rw-r--r--  storage/innobase/row/row0vers.cc       88
16 files changed, 12762 insertions, 2753 deletions
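Note on the row0ftsort.cc changes below: the new code in fts_parallel_tokenization() only writes the final batch of tokenized rows to the temporary merge file when earlier batches have already been flushed (merge_file->offset > 0); if nothing has spilled yet, the last batch stays in the in-memory sort block and the temp file is never touched. A minimal stand-alone sketch of that spill rule, using hypothetical names (Batch, MergeFile, spill) rather than the real InnoDB structures:

    // Sketch only: models the "flush last batch only if we already spilled"
    // rule described in the row0ftsort.cc pseudo-code comment further down.
    #include <cstdio>
    #include <vector>

    struct Batch     { std::vector<int> rows; };          // stand-in for a sort buffer
    struct MergeFile { std::FILE* fd; unsigned offset; }; // offset = blocks written so far

    // Write one sorted batch to the temporary file and advance the block offset.
    static void spill(MergeFile& f, const Batch& b)
    {
        std::fwrite(b.rows.data(), sizeof(int), b.rows.size(), f.fd);
        ++f.offset;
    }

    // Called once after the tokenize/sort loop: if nothing was ever spilled
    // (offset == 0) the whole result still fits in memory, so the reader can
    // consume it directly from the in-memory block and the temp file stays empty.
    static void flush_last_batch(MergeFile& f, const Batch& last)
    {
        if (f.offset > 0 && !last.rows.empty()) {
            spill(f, last);
        }
    }

    int main()
    {
        MergeFile f = { std::tmpfile(), 0 };
        Batch last = { {3, 1, 2} };
        flush_last_batch(f, last);              // offset == 0: batch kept in memory
        std::printf("blocks on disk: %u\n", f.offset);
        std::fclose(f.fd);
    }

The matching read side (in row_fts_merge_insert()) applies the same test: it only calls row_merge_read() when merge_file->offset > 0, otherwise it reads the records straight from the block.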
diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc index 8d4da9f034b..f084fa09c5a 100644 --- a/storage/innobase/row/row0ext.cc +++ b/storage/innobase/row/row0ext.cc @@ -95,6 +95,8 @@ row_ext_create( row_ext_t* ret; + ut_ad(n_ext > 0); + ret = static_cast<row_ext_t*>( mem_heap_alloc(heap, (sizeof *ret) + (n_ext - 1) * sizeof ret->len)); diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc index 50b681361d8..9a6af50e09d 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,6 +23,7 @@ Create Full Text Index with (parallel) merge sort Created 10/13/2010 Jimmy Yang *******************************************************/ +#include "dict0dict.h" /* dict_table_stats_lock() */ #include "row0merge.h" #include "pars0pars.h" #include "row0ftsort.h" @@ -47,9 +48,6 @@ Created 10/13/2010 Jimmy Yang /** Parallel sort degree */ UNIV_INTERN ulong fts_sort_pll_degree = 2; -/** Parallel sort buffer size */ -UNIV_INTERN ulong srv_sort_buf_size = 1048576; - /*********************************************************************//** Create a temporary "fts sort index" used to merge sort the tokenized doc string. The index has three "fields": @@ -124,7 +122,7 @@ row_merge_create_fts_sort_index( if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { /* If Doc ID column is being added by this create index, then just check the number of rows in the table */ - if (table->stat_n_rows < MAX_DOC_ID_OPT_VAL) { + if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) { *opt_doc_id_size = TRUE; } } else { @@ -173,10 +171,10 @@ ibool row_fts_psort_info_init( /*====================*/ trx_t* trx, /*!< in: transaction */ - struct TABLE* table, /*!< in: MySQL table object */ + row_merge_dup_t* dup, /*!< in,own: descriptor of + FTS index being created */ const dict_table_t* new_table,/*!< in: table on which indexes are created */ - dict_index_t* index, /*!< in: FTS index to be created */ ibool opt_doc_id_size, /*!< in: whether to use 4 bytes instead of 8 bytes integer to @@ -192,7 +190,6 @@ row_fts_psort_info_init( fts_psort_t* psort_info = NULL; fts_psort_t* merge_info = NULL; ulint block_size; - os_event_t sort_event; ibool ret = TRUE; block_size = 3 * srv_sort_buf_size; @@ -201,28 +198,28 @@ row_fts_psort_info_init( fts_sort_pll_degree * sizeof *psort_info)); if (!psort_info) { - return FALSE; + ut_free(dup); + return(FALSE); } - sort_event = os_event_create(NULL); - /* Common Info for all sort threads */ common_info = static_cast<fts_psort_common_t*>( mem_alloc(sizeof *common_info)); - common_info->table = table; + if (!common_info) { + ut_free(dup); + mem_free(psort_info); + return(FALSE); + } + + common_info->dup = dup; common_info->new_table = (dict_table_t*) new_table; common_info->trx = trx; - common_info->sort_index = index; common_info->all_info = psort_info; - common_info->sort_event = sort_event; + common_info->sort_event = os_event_create(); + common_info->merge_event = os_event_create(); common_info->opt_doc_id_size = opt_doc_id_size; - if (!common_info) { - mem_free(psort_info); - return FALSE; - } - /* There 
will be FTS_NUM_AUX_INDEX number of "sort buckets" for each parallel sort thread. Each "sort bucket" holds records for a particular "FTS index partition" */ @@ -242,9 +239,12 @@ row_fts_psort_info_init( } psort_info[j].merge_buf[i] = row_merge_buf_create( - index); + dup->index); - row_merge_file_create(psort_info[j].merge_file[i]); + if (row_merge_file_create(psort_info[j].merge_file[i]) + < 0) { + goto func_exit; + } /* Need to align memory for O_DIRECT write */ psort_info[j].block_alloc[i] = @@ -314,6 +314,9 @@ row_fts_psort_info_destroy( } } + os_event_free(merge_info[0].psort_common->sort_event); + os_event_free(merge_info[0].psort_common->merge_event); + ut_free(merge_info[0].psort_common->dup); mem_free(merge_info[0].psort_common); mem_free(psort_info); } @@ -433,12 +436,11 @@ row_merge_fts_doc_tokenize( ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX); idx = t_ctx->buf_used; - buf->tuples[buf->n_tuples + n_tuple[idx]] = field = - static_cast<dfield_t*>(mem_heap_alloc( - buf->heap, - FTS_NUM_FIELDS_SORT * sizeof *field)); + mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]]; - ut_a(field); + field = mtuple->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, + FTS_NUM_FIELDS_SORT * sizeof *field)); /* The first field is the tokenized word */ dfield_set_data(field, t_str.f_str, t_str.f_len); @@ -522,6 +524,10 @@ row_merge_fts_doc_tokenize( /* Update the data length and the number of new word tuples added in this round of tokenization */ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { + /* The computation of total_size below assumes that no + delete-mark flags will be stored and that all fields + are NOT NULL and fixed-length. */ + sort_buf[i]->total_size += data_size[i]; sort_buf[i]->n_tuples += n_tuple[i]; @@ -560,7 +566,7 @@ fts_parallel_tokenization( ulint mycount[FTS_NUM_AUX_INDEX]; ib_uint64_t total_rec = 0; ulint num_doc_processed = 0; - doc_id_t last_doc_id; + doc_id_t last_doc_id = 0; ulint zip_size; mem_heap_t* blob_heap = NULL; fts_doc_t doc; @@ -581,10 +587,10 @@ fts_parallel_tokenization( memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(int)); doc.charset = fts_index_get_charset( - psort_info->psort_common->sort_index); + psort_info->psort_common->dup->index); idx_field = dict_index_get_nth_field( - psort_info->psort_common->sort_index, 0); + psort_info->psort_common->dup->index, 0); word_dtype.prtype = idx_field->col->prtype; word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen; word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0) @@ -742,7 +748,12 @@ loop: } if (doc_item) { - prev_doc_item = doc_item; + prev_doc_item = doc_item; + + if (last_doc_id != doc_item->doc_id) { + t_ctx.init_pos = 0; + } + retried = 0; } else if (psort_info->state == FTS_PARENT_COMPLETE) { retried++; @@ -751,16 +762,51 @@ loop: goto loop; exit: + /* Do a final sort of the last (or latest) batch of records + in block memory. 
Flush them to temp file if records cannot + be hold in one block memory */ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) { if (t_ctx.rows_added[i]) { row_merge_buf_sort(buf[i], NULL); row_merge_buf_write( - buf[i], (const merge_file_t*) merge_file[i], - block[i]); - row_merge_write(merge_file[i]->fd, - merge_file[i]->offset++, block[i]); + buf[i], merge_file[i], block[i]); + + /* Write to temp file, only if records have + been flushed to temp file before (offset > 0): + The pseudo code for sort is following: + + while (there are rows) { + tokenize rows, put result in block[] + if (block[] runs out) { + sort rows; + write to temp file with + row_merge_write(); + offset++; + } + } + + # write out the last batch + if (offset > 0) { + row_merge_write(); + offset++; + } else { + # no need to write anything + offset stay as 0 + } + + so if merge_file[i]->offset is 0 when we come to + here as the last batch, this means rows have + never flush to temp file, it can be held all in + memory */ + if (merge_file[i]->offset != 0) { + row_merge_write(merge_file[i]->fd, + merge_file[i]->offset++, + block[i]); + + UNIV_MEM_INVALID(block[i][0], + srv_sort_buf_size); + } - UNIV_MEM_INVALID(block[i][0], srv_sort_buf_size); buf[i] = row_merge_buf_empty(buf[i]); t_ctx.rows_added[i] = 0; } @@ -776,16 +822,19 @@ exit: continue; } - tmpfd[i] = innobase_mysql_tmpfile(); + tmpfd[i] = row_merge_file_create_low(); + if (tmpfd[i] < 0) { + goto func_exit; + } + row_merge_sort(psort_info->psort_common->trx, - psort_info->psort_common->sort_index, - merge_file[i], - (row_merge_block_t*) block[i], &tmpfd[i], - psort_info->psort_common->table); + psort_info->psort_common->dup, + merge_file[i], block[i], &tmpfd[i]); total_rec += merge_file[i]->n_rec; close(tmpfd[i]); } +func_exit: if (fts_enable_diag_print) { DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n"); } @@ -794,8 +843,14 @@ exit: psort_info->child_status = FTS_CHILD_COMPLETE; os_event_set(psort_info->psort_common->sort_event); + psort_info->child_status = FTS_CHILD_EXITING; + +#ifdef __WIN__ + CloseHandle(psort_info->thread_hdl); +#endif /*__WIN__ */ os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; } @@ -812,8 +867,9 @@ row_fts_start_psort( for (i = 0; i < fts_sort_pll_degree; i++) { psort_info[i].psort_id = i; - os_thread_create(fts_parallel_tokenization, - (void*) &psort_info[i], &thd_id); + psort_info[i].thread_hdl = os_thread_create( + fts_parallel_tokenization, + (void*) &psort_info[i], &thd_id); } } @@ -833,14 +889,20 @@ fts_parallel_merge( id = psort_info->psort_id; - row_fts_merge_insert(psort_info->psort_common->sort_index, + row_fts_merge_insert(psort_info->psort_common->dup->index, psort_info->psort_common->new_table, psort_info->psort_common->all_info, id); psort_info->child_status = FTS_CHILD_COMPLETE; - os_event_set(psort_info->psort_common->sort_event); + os_event_set(psort_info->psort_common->merge_event); + psort_info->child_status = FTS_CHILD_EXITING; + +#ifdef __WIN__ + CloseHandle(psort_info->thread_hdl); +#endif /*__WIN__ */ os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; } @@ -860,16 +922,16 @@ row_fts_start_parallel_merge( merge_info[i].psort_id = i; merge_info[i].child_status = 0; - os_thread_create(fts_parallel_merge, - (void*) &merge_info[i], &thd_id); + merge_info[i].thread_hdl = os_thread_create( + fts_parallel_merge, (void*) &merge_info[i], &thd_id); } } /********************************************************************//** Insert processed FTS data to auxillary index tables. 
@return DB_SUCCESS if insertion runs fine */ -UNIV_INTERN -ulint +static __attribute__((nonnull)) +dberr_t row_merge_write_fts_word( /*=====================*/ trx_t* trx, /*!< in: transaction */ @@ -880,15 +942,15 @@ row_merge_write_fts_word( CHARSET_INFO* charset) /*!< in: charset */ { ulint selected; - ulint ret = DB_SUCCESS; + dberr_t ret = DB_SUCCESS; selected = fts_select_index( charset, word->text.f_str, word->text.f_len); fts_table->suffix = fts_get_suffix(selected); /* Pop out each fts_node in word->nodes write them to auxiliary table */ - while(ib_vector_size(word->nodes) > 0) { - ulint error; + while (ib_vector_size(word->nodes) > 0) { + dberr_t error; fts_node_t* fts_node; fts_node = static_cast<fts_node_t*>(ib_vector_pop(word->nodes)); @@ -900,8 +962,8 @@ row_merge_write_fts_word( if (error != DB_SUCCESS) { fprintf(stderr, "InnoDB: failed to write" " word %s to FTS auxiliary index" - " table, error (%lu) \n", - word->text.f_str, error); + " table, error (%s) \n", + word->text.f_str, ut_strerr(error)); ret = error; } @@ -1064,7 +1126,6 @@ row_fts_sel_tree_propagate( int child_left; int child_right; int selected; - ibool null_eq = FALSE; /* Find which parent this value will be propagated to */ parent = (propogated - 1) / 2; @@ -1083,10 +1144,10 @@ row_fts_sel_tree_propagate( } else if (child_right == -1 || mrec[child_right] == NULL) { selected = child_left; - } else if (row_merge_cmp(mrec[child_left], mrec[child_right], - offsets[child_left], - offsets[child_right], - index, &null_eq) < 0) { + } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right], + offsets[child_left], + offsets[child_right], + index, NULL) < 0) { selected = child_left; } else { selected = child_right; @@ -1143,8 +1204,6 @@ row_fts_build_sel_tree_level( num_item = (1 << level); for (i = 0; i < num_item; i++) { - ibool null_eq = FALSE; - child_left = sel_tree[(start + i) * 2 + 1]; child_right = sel_tree[(start + i) * 2 + 2]; @@ -1174,14 +1233,12 @@ row_fts_build_sel_tree_level( } /* Select the smaller one to set parent pointer */ - if (row_merge_cmp(mrec[child_left], mrec[child_right], - offsets[child_left], - offsets[child_right], - index, &null_eq) < 0) { - sel_tree[start + i] = child_left; - } else { - sel_tree[start + i] = child_right; - } + int cmp = cmp_rec_rec_simple( + mrec[child_left], mrec[child_right], + offsets[child_left], offsets[child_right], + index, NULL); + + sel_tree[start + i] = cmp < 0 ? child_left : child_right; } } @@ -1231,7 +1288,7 @@ Read sorted file containing index data tuples and insert these data tuples to the index @return DB_SUCCESS or error number */ UNIV_INTERN -ulint +dberr_t row_fts_merge_insert( /*=================*/ dict_index_t* index, /*!< in: index */ @@ -1243,7 +1300,7 @@ row_fts_merge_insert( const byte** b; mem_heap_t* tuple_heap; mem_heap_t* heap; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; ulint* foffs; ulint** offsets; fts_tokenizer_word_t new_word; @@ -1317,7 +1374,7 @@ row_fts_merge_insert( count_diag += (int) psort_info[i].merge_file[id]->n_rec; } - if (fts_enable_diag_print) { + if (fts_enable_diag_print) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB_FTS: to inserted %lu records\n", (ulong) count_diag); @@ -1349,8 +1406,13 @@ row_fts_merge_insert( /* No Rows to read */ mrec[i] = b[i] = NULL; } else { - if (!row_merge_read(fd[i], foffs[i], - (row_merge_block_t*) block[i])) { + /* Read from temp file only if it has been + written to. 
Otherwise, block memory holds + all the sorted records */ + if (psort_info[i].merge_file[id]->offset > 0 + && (!row_merge_read( + fd[i], foffs[i], + (row_merge_block_t*) block[i]))) { error = DB_CORRUPTION; goto exit; } @@ -1386,14 +1448,14 @@ row_fts_merge_insert( } for (i = min_rec + 1; i < fts_sort_pll_degree; i++) { - ibool null_eq = FALSE; if (!mrec[i]) { continue; } - if (row_merge_cmp(mrec[i], mrec[min_rec], - offsets[i], offsets[min_rec], - index, &null_eq) < 0) { + if (cmp_rec_rec_simple( + mrec[i], mrec[min_rec], + offsets[i], offsets[min_rec], + index, NULL) < 0) { min_rec = i; } } diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc new file mode 100644 index 00000000000..f5eb31191a5 --- /dev/null +++ b/storage/innobase/row/row0import.cc @@ -0,0 +1,3806 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0import.cc +Import a tablespace to a running instance. + +Created 2012-02-08 by Sunny Bains. +*******************************************************/ + +#include "row0import.h" + +#ifdef UNIV_NONINL +#include "row0import.ic" +#endif + +#include "btr0pcur.h" +#include "que0que.h" +#include "dict0boot.h" +#include "ibuf0ibuf.h" +#include "pars0pars.h" +#include "row0upd.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "srv0start.h" +#include "row0quiesce.h" + +#include <vector> + +/** The size of the buffer to use for IO. Note: os_file_read() doesn't expect +reads to fail. If you set the buffer size to be greater than a multiple of the +file size then it will assert. TODO: Fix this limitation of the IO functions. +@param n - page size of the tablespace. +@retval number of pages */ +#define IO_BUFFER_SIZE(n) ((1024 * 1024) / n) + +/** For gathering stats on records during phase I */ +struct row_stats_t { + ulint m_n_deleted; /*!< Number of deleted records + found in the index */ + + ulint m_n_purged; /*!< Number of records purged + optimisatically */ + + ulint m_n_rows; /*!< Number of rows */ + + ulint m_n_purge_failed; /*!< Number of deleted rows + that could not be purged */ +}; + +/** Index information required by IMPORT. 
*/ +struct row_index_t { + index_id_t m_id; /*!< Index id of the table + in the exporting server */ + byte* m_name; /*!< Index name */ + + ulint m_space; /*!< Space where it is placed */ + + ulint m_page_no; /*!< Root page number */ + + ulint m_type; /*!< Index type */ + + ulint m_trx_id_offset; /*!< Relevant only for clustered + indexes, offset of transaction + id system column */ + + ulint m_n_user_defined_cols; /*!< User defined columns */ + + ulint m_n_uniq; /*!< Number of columns that can + uniquely identify the row */ + + ulint m_n_nullable; /*!< Number of nullable + columns */ + + ulint m_n_fields; /*!< Total number of fields */ + + dict_field_t* m_fields; /*!< Index fields */ + + const dict_index_t* + m_srv_index; /*!< Index instance in the + importing server */ + + row_stats_t m_stats; /*!< Statistics gathered during + the import phase */ + +}; + +/** Meta data required by IMPORT. */ +struct row_import { + row_import() UNIV_NOTHROW + : + m_table(), + m_version(), + m_hostname(), + m_table_name(), + m_autoinc(), + m_page_size(), + m_flags(), + m_n_cols(), + m_cols(), + m_col_names(), + m_n_indexes(), + m_indexes(), + m_missing(true) { } + + ~row_import() UNIV_NOTHROW; + + /** + Find the index entry in in the indexes array. + @param name - index name + @return instance if found else 0. */ + row_index_t* get_index(const char* name) const UNIV_NOTHROW; + + /** + Get the number of rows in the index. + @param name - index name + @return number of rows (doesn't include delete marked rows). */ + ulint get_n_rows(const char* name) const UNIV_NOTHROW; + + /** + Find the ordinal value of the column name in the cfg table columns. + @param name - of column to look for. + @return ULINT_UNDEFINED if not found. */ + ulint find_col(const char* name) const UNIV_NOTHROW; + + /** + Find the index field entry in in the cfg indexes fields. + @name - of the index to look for + @return instance if found else 0. */ + const dict_field_t* find_field( + const row_index_t* cfg_index, + const char* name) const UNIV_NOTHROW; + + /** + Get the number of rows for which purge failed during the convert phase. + @param name - index name + @return number of rows for which purge failed. */ + ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW; + + /** + Check if the index is clean. ie. no delete-marked records + @param name - index name + @return true if index needs to be purged. */ + bool requires_purge(const char* name) const UNIV_NOTHROW + { + return(get_n_purge_failed(name) > 0); + } + + /** + Set the index root <space, pageno> using the index name */ + void set_root_by_name() UNIV_NOTHROW; + + /** + Set the index root <space, pageno> using a heuristic + @return DB_SUCCESS or error code */ + dberr_t set_root_by_heuristic() UNIV_NOTHROW; + + /** Check if the index schema that was read from the .cfg file + matches the in memory index definition. + Note: It will update row_import_t::m_srv_index to map the meta-data + read from the .cfg file to the server index instance. + @return DB_SUCCESS or error code. */ + dberr_t match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW; + + /** + Check if the table schema that was read from the .cfg file matches the + in memory table definition. + @param thd - MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_table_columns( + THD* thd) UNIV_NOTHROW; + + /** + Check if the table (and index) schema that was read from the .cfg file + matches the in memory table definition. 
+ @param thd - MySQL session variable + @return DB_SUCCESS or error code. */ + dberr_t match_schema( + THD* thd) UNIV_NOTHROW; + + dict_table_t* m_table; /*!< Table instance */ + + ulint m_version; /*!< Version of config file */ + + byte* m_hostname; /*!< Hostname where the + tablespace was exported */ + byte* m_table_name; /*!< Exporting instance table + name */ + + ib_uint64_t m_autoinc; /*!< Next autoinc value */ + + ulint m_page_size; /*!< Tablespace page size */ + + ulint m_flags; /*!< Table flags */ + + ulint m_n_cols; /*!< Number of columns in the + meta-data file */ + + dict_col_t* m_cols; /*!< Column data */ + + byte** m_col_names; /*!< Column names, we store the + column naems separately becuase + there is no field to store the + value in dict_col_t */ + + ulint m_n_indexes; /*!< Number of indexes, + including clustered index */ + + row_index_t* m_indexes; /*!< Index meta data */ + + bool m_missing; /*!< true if a .cfg file was + found and was readable */ +}; + +/** Use the page cursor to iterate over records in a block. */ +class RecIterator { +public: + /** + Default constructor */ + RecIterator() UNIV_NOTHROW + { + memset(&m_cur, 0x0, sizeof(m_cur)); + } + + /** + Position the cursor on the first user record. */ + void open(buf_block_t* block) UNIV_NOTHROW + { + page_cur_set_before_first(block, &m_cur); + + if (!end()) { + next(); + } + } + + /** + Move to the next record. */ + void next() UNIV_NOTHROW + { + page_cur_move_to_next(&m_cur); + } + + /** + @return the current record */ + rec_t* current() UNIV_NOTHROW + { + ut_ad(!end()); + return(page_cur_get_rec(&m_cur)); + } + + /** + @return true if cursor is at the end */ + bool end() UNIV_NOTHROW + { + return(page_cur_is_after_last(&m_cur) == TRUE); + } + + /** Remove the current record + @return true on success */ + bool remove( + const dict_index_t* index, + page_zip_des_t* page_zip, + ulint* offsets) UNIV_NOTHROW + { + /* We can't end up with an empty page unless it is root. */ + if (page_get_n_recs(m_cur.block->frame) <= 1) { + return(false); + } + + return(page_delete_rec(index, &m_cur, page_zip, offsets)); + } + +private: + page_cur_t m_cur; +}; + +/** Class that purges delete marked reocords from indexes, both secondary +and cluster. It does a pessimistic delete. This should only be done if we +couldn't purge the delete marked reocrds during Phase I. */ +class IndexPurge { +public: + /** Constructor + @param trx - the user transaction covering the import tablespace + @param index - to be imported + @param space_id - space id of the tablespace */ + IndexPurge( + trx_t* trx, + dict_index_t* index) UNIV_NOTHROW + : + m_trx(trx), + m_index(index), + m_n_rows(0) + { + ib_logf(IB_LOG_LEVEL_INFO, + "Phase II - Purge records from index %s", + index->name); + } + + /** Descructor */ + ~IndexPurge() UNIV_NOTHROW { } + + /** Purge delete marked records. + @return DB_SUCCESS or error code. */ + dberr_t garbage_collect() UNIV_NOTHROW; + + /** The number of records that are not delete marked. + @return total records in the index after purge */ + ulint get_n_rows() const UNIV_NOTHROW + { + return(m_n_rows); + } + +private: + /** + Begin import, position the cursor on the first record. */ + void open() UNIV_NOTHROW; + + /** + Close the persistent curosr and commit the mini-transaction. */ + void close() UNIV_NOTHROW; + + /** + Position the cursor on the next record. 
+ @return DB_SUCCESS or error code */ + dberr_t next() UNIV_NOTHROW; + + /** + Store the persistent cursor position and reopen the + B-tree cursor in BTR_MODIFY_TREE mode, because the + tree structure may be changed during a pessimistic delete. */ + void purge_pessimistic_delete() UNIV_NOTHROW; + + /** + Purge delete-marked records. + @param offsets - current row offsets. */ + void purge() UNIV_NOTHROW; + +protected: + // Disable copying + IndexPurge(); + IndexPurge(const IndexPurge&); + IndexPurge &operator=(const IndexPurge&); + +private: + trx_t* m_trx; /*!< User transaction */ + mtr_t m_mtr; /*!< Mini-transaction */ + btr_pcur_t m_pcur; /*!< Persistent cursor */ + dict_index_t* m_index; /*!< Index to be processed */ + ulint m_n_rows; /*!< Records in index */ +}; + +/** Functor that is called for each physical page that is read from the +tablespace file. */ +class AbstractCallback : public PageCallback { +public: + /** Constructor + @param trx - covering transaction */ + AbstractCallback(trx_t* trx) + : + m_trx(trx), + m_space(ULINT_UNDEFINED), + m_xdes(), + m_xdes_page_no(ULINT_UNDEFINED), + m_space_flags(ULINT_UNDEFINED), + m_table_flags(ULINT_UNDEFINED) UNIV_NOTHROW { } + + /** + Free any extent descriptor instance */ + virtual ~AbstractCallback() + { + delete [] m_xdes; + } + + /** Determine the page size to use for traversing the tablespace + @param file_size - size of the tablespace file in bytes + @param block - contents of the first page in the tablespace file. + @retval DB_SUCCESS or error code. */ + virtual dberr_t init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW; + + /** @return true if compressed table. */ + bool is_compressed_table() const UNIV_NOTHROW + { + return(get_zip_size() > 0); + } + +protected: + /** + Get the data page depending on the table type, compressed or not. + @param block - block read from disk + @retval the buffer frame */ + buf_frame_t* get_frame(buf_block_t* block) const UNIV_NOTHROW + { + if (is_compressed_table()) { + return(block->page.zip.data); + } + + return(buf_block_get_frame(block)); + } + + /** Check for session interrupt. If required we could + even flush to disk here every N pages. + @retval DB_SUCCESS or error code */ + dberr_t periodic_check() UNIV_NOTHROW + { + if (trx_is_interrupted(m_trx)) { + return(DB_INTERRUPTED); + } + + return(DB_SUCCESS); + } + + /** + Get the physical offset of the extent descriptor within the page. + @param page_no - page number of the extent descriptor + @param page - contents of the page containing the extent descriptor. + @return the start of the xdes array in a page */ + const xdes_t* xdes( + ulint page_no, + const page_t* page) const UNIV_NOTHROW + { + ulint offset; + + offset = xdes_calc_descriptor_index(get_zip_size(), page_no); + + return(page + XDES_ARR_OFFSET + XDES_SIZE * offset); + } + + /** + Set the current page directory (xdes). If the extent descriptor is + marked as free then free the current extent descriptor and set it to + 0. This implies that all pages that are covered by this extent + descriptor are also freed. + + @param page_no - offset of page within the file + @param page - page contents + @return DB_SUCCESS or error code. 
*/ + dberr_t set_current_xdes( + ulint page_no, + const page_t* page) UNIV_NOTHROW + { + m_xdes_page_no = page_no; + + delete[] m_xdes; + + m_xdes = 0; + + ulint state; + const xdes_t* xdesc = page + XDES_ARR_OFFSET; + + state = mach_read_ulint(xdesc + XDES_STATE, MLOG_4BYTES); + + if (state != XDES_FREE) { + + m_xdes = new(std::nothrow) xdes_t[m_page_size]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_13", + delete [] m_xdes; m_xdes = 0;); + + if (m_xdes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(m_xdes, page, m_page_size); + } + + return(DB_SUCCESS); + } + + /** + @return true if it is a root page */ + bool is_root_page(const page_t* page) const UNIV_NOTHROW + { + ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX); + + return(mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL + && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL); + } + + /** + Check if the page is marked as free in the extent descriptor. + @param page_no - page number to check in the extent descriptor. + @return true if the page is marked as free */ + bool is_free(ulint page_no) const UNIV_NOTHROW + { + ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no) + == m_xdes_page_no); + + if (m_xdes != 0) { + const xdes_t* xdesc = xdes(page_no, m_xdes); + ulint pos = page_no % FSP_EXTENT_SIZE; + + return(xdes_get_bit(xdesc, XDES_FREE_BIT, pos)); + } + + /* If the current xdes was free, the page must be free. */ + return(true); + } + +protected: + /** Covering transaction. */ + trx_t* m_trx; + + /** Space id of the file being iterated over. */ + ulint m_space; + + /** Minimum page number for which the free list has not been + initialized: the pages >= this limit are, by definition, free; + note that in a single-table tablespace where size < 64 pages, + this number is 64, i.e., we have initialized the space about + the first extent, but have not physically allocted those pages + to the file. @see FSP_LIMIT. */ + ulint m_free_limit; + + /** Current size of the space in pages */ + ulint m_size; + + /** Current extent descriptor page */ + xdes_t* m_xdes; + + /** Physical page offset in the file of the extent descriptor */ + ulint m_xdes_page_no; + + /** Flags value read from the header page */ + ulint m_space_flags; + + /** Derived from m_space_flags and row format type, the row format + type is determined from the page header. */ + ulint m_table_flags; +}; + +/** Determine the page size to use for traversing the tablespace +@param file_size - size of the tablespace file in bytes +@param block - contents of the first page in the tablespace file. +@retval DB_SUCCESS or error code. */ +dberr_t +AbstractCallback::init( + os_offset_t file_size, + const buf_block_t* block) UNIV_NOTHROW +{ + const page_t* page = block->frame; + + m_space_flags = fsp_header_get_flags(page); + + /* Since we don't know whether it is a compressed table + or not, the data is always read into the block->frame. */ + + dberr_t err = set_zip_size(block->frame); + + if (err != DB_SUCCESS) { + return(DB_CORRUPTION); + } + + /* Set the page size used to traverse the tablespace. */ + + m_page_size = (is_compressed_table()) + ? 
get_zip_size() : fsp_flags_get_page_size(m_space_flags); + + if (m_page_size == 0) { + ib_logf(IB_LOG_LEVEL_ERROR, "Page size is 0"); + return(DB_CORRUPTION); + } else if (!is_compressed_table() && m_page_size != UNIV_PAGE_SIZE) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Page size %lu of ibd file is not the same " + "as the server page size %lu", + m_page_size, UNIV_PAGE_SIZE); + + return(DB_CORRUPTION); + + } else if ((file_size % m_page_size)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "File size " UINT64PF " is not a multiple " + "of the page size %lu", + (ib_uint64_t) file_size, (ulong) m_page_size); + + return(DB_CORRUPTION); + } + + ut_a(m_space == ULINT_UNDEFINED); + + m_size = mach_read_from_4(page + FSP_SIZE); + m_free_limit = mach_read_from_4(page + FSP_FREE_LIMIT); + m_space = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID); + + if ((err = set_current_xdes(0, page)) != DB_SUCCESS) { + return(err); + } + + return(DB_SUCCESS); +} + +/** +Try and determine the index root pages by checking if the next/prev +pointers are both FIL_NULL. We need to ensure that skip deleted pages. */ +struct FetchIndexRootPages : public AbstractCallback { + + /** Index information gathered from the .ibd file. */ + struct Index { + + Index(index_id_t id, ulint page_no) + : + m_id(id), + m_page_no(page_no) { } + + index_id_t m_id; /*!< Index id */ + ulint m_page_no; /*!< Root page number */ + }; + + typedef std::vector<Index> Indexes; + + /** Constructor + @param trx - covering (user) transaction + @param table - table definition in server .*/ + FetchIndexRootPages(const dict_table_t* table, trx_t* trx) + : + AbstractCallback(trx), + m_table(table) UNIV_NOTHROW { } + + /** Destructor */ + virtual ~FetchIndexRootPages() UNIV_NOTHROW { } + + /** + @retval the space id of the tablespace being iterated over */ + virtual ulint get_space_id() const UNIV_NOTHROW + { + return(m_space); + } + + /** + Check if the .ibd file row format is the same as the table's. + @param ibd_table_flags - determined from space and page. + @return DB_SUCCESS or error code. */ + dberr_t check_row_format(ulint ibd_table_flags) UNIV_NOTHROW + { + dberr_t err; + rec_format_t ibd_rec_format; + rec_format_t table_rec_format; + + if (!dict_tf_is_valid(ibd_table_flags)) { + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + ".ibd file has invlad table flags: %lx", + ibd_table_flags); + + return(DB_CORRUPTION); + } + + ibd_rec_format = dict_tf_get_rec_format(ibd_table_flags); + table_rec_format = dict_tf_get_rec_format(m_table->flags); + + if (table_rec_format != ibd_rec_format) { + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Table has %s row format, .ibd " + "file has %s row format.", + dict_tf_to_row_format_string(m_table->flags), + dict_tf_to_row_format_string(ibd_table_flags)); + + err = DB_CORRUPTION; + } else { + err = DB_SUCCESS; + } + + return(err); + } + + /** + Called for each block as it is read from the file. + @param offset - physical offset in the file + @param block - block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW; + + /** Update the import configuration that will be used to import + the tablespace. */ + dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW; + + /** Table definition in server. 
*/ + const dict_table_t* m_table; + + /** Index information */ + Indexes m_indexes; +}; + +/** +Called for each block as it is read from the file. Check index pages to +determine the exact row format. We can't get that from the tablespace +header flags alone. + +@param offset - physical offset in the file +@param block - block to convert, it is not from the buffer pool. +@retval DB_SUCCESS or error code. */ +dberr_t +FetchIndexRootPages::operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = periodic_check()) != DB_SUCCESS) { + return(err); + } + + const page_t* page = get_frame(block); + + ulint page_type = fil_page_get_type(page); + + if (block->page.offset * m_page_size != offset) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Page offset doesn't match file offset: " + "page offset: %lu, file offset: %lu", + (ulint) block->page.offset, + (ulint) (offset / m_page_size)); + + err = DB_CORRUPTION; + } else if (page_type == FIL_PAGE_TYPE_XDES) { + err = set_current_xdes(block->page.offset, page); + } else if (page_type == FIL_PAGE_INDEX + && !is_free(block->page.offset) + && is_root_page(page)) { + + index_id_t id = btr_page_get_index_id(page); + ulint page_no = buf_block_get_page_no(block); + + m_indexes.push_back(Index(id, page_no)); + + if (m_indexes.size() == 1) { + + m_table_flags = dict_sys_tables_type_to_tf( + m_space_flags, + page_is_comp(page) ? DICT_N_COLS_COMPACT : 0); + + err = check_row_format(m_table_flags); + } + } + + return(err); +} + +/** +Update the import configuration that will be used to import the tablespace. +@return error code or DB_SUCCESS */ +dberr_t +FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW +{ + Indexes::const_iterator end = m_indexes.end(); + + ut_a(cfg->m_table == m_table); + cfg->m_page_size = m_page_size; + cfg->m_n_indexes = m_indexes.size(); + + if (cfg->m_n_indexes == 0) { + + ib_logf(IB_LOG_LEVEL_ERROR, "No B+Tree found in tablespace"); + + return(DB_CORRUPTION); + } + + cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_11", + delete [] cfg->m_indexes; cfg->m_indexes = 0;); + + if (cfg->m_indexes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + row_index_t* cfg_index = cfg->m_indexes; + + for (Indexes::const_iterator it = m_indexes.begin(); + it != end; + ++it, ++cfg_index) { + + char name[BUFSIZ]; + + ut_snprintf(name, sizeof(name), "index" IB_ID_FMT, it->m_id); + + ulint len = strlen(name) + 1; + + cfg_index->m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_12", + delete [] cfg_index->m_name; + cfg_index->m_name = 0;); + + if (cfg_index->m_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + memcpy(cfg_index->m_name, name, len); + + cfg_index->m_id = it->m_id; + + cfg_index->m_space = m_space; + + cfg_index->m_page_no = it->m_page_no; + } + + return(DB_SUCCESS); +} + +/* Functor that is called for each physical page that is read from the +tablespace file. + + 1. Check each page for corruption. + + 2. Update the space id and LSN on every page + * For the header page + - Validate the flags + - Update the LSN + + 3. 
On Btree pages + * Set the index id + * Update the max trx id + * In a cluster index, update the system columns + * In a cluster index, update the BLOB ptr, set the space id + * Purge delete marked records, but only if they can be easily + removed from the page + * Keep a counter of number of rows, ie. non-delete-marked rows + * Keep a counter of number of delete marked rows + * Keep a counter of number of purge failure + * If a page is stamped with an index id that isn't in the .cfg file + we assume it is deleted and the page can be ignored. + + 4. Set the page state to dirty so that it will be written to disk. +*/ +class PageConverter : public AbstractCallback { +public: + /** Constructor + * @param cfg - config of table being imported. + * @param trx - transaction covering the import */ + PageConverter(row_import* cfg, trx_t* trx) UNIV_NOTHROW; + + virtual ~PageConverter() UNIV_NOTHROW + { + if (m_heap != 0) { + mem_heap_free(m_heap); + } + } + + /** + @retval the server space id of the tablespace being iterated over */ + virtual ulint get_space_id() const UNIV_NOTHROW + { + return(m_cfg->m_table->space); + } + + /** + Called for each block as it is read from the file. + @param offset - physical offset in the file + @param block - block to convert, it is not from the buffer pool. + @retval DB_SUCCESS or error code. */ + virtual dberr_t operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW; +private: + + /** Status returned by PageConverter::validate() */ + enum import_page_status_t { + IMPORT_PAGE_STATUS_OK, /*!< Page is OK */ + IMPORT_PAGE_STATUS_ALL_ZERO, /*!< Page is all zeros */ + IMPORT_PAGE_STATUS_CORRUPTED /*!< Page is corrupted */ + }; + + /** + Update the page, set the space id, max trx id and index id. + @param block - block read from file + @param page_type - type of the page + @retval DB_SUCCESS or error code */ + dberr_t update_page( + buf_block_t* block, + ulint& page_type) UNIV_NOTHROW; + +#if defined UNIV_DEBUG + /** + @return true error condition is enabled. */ + bool trigger_corruption() UNIV_NOTHROW + { + return(false); + } + #else +#define trigger_corruption() (false) +#endif /* UNIV_DEBUG */ + + /** + Update the space, index id, trx id. + @param block - block to convert + @return DB_SUCCESS or error code */ + dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW; + + /** Update the BLOB refrences and write UNDO log entries for + rows that can't be purged optimistically. + @param block - block to update + @retval DB_SUCCESS or error code */ + dberr_t update_records(buf_block_t* block) UNIV_NOTHROW; + + /** + Validate the page, check for corruption. + @param offset - physical offset within file. + @param page - page read from file. + @return 0 on success, 1 if all zero, 2 if corrupted */ + import_page_status_t validate( + os_offset_t offset, + buf_block_t* page) UNIV_NOTHROW; + + /** + Validate the space flags and update tablespace header page. + @param block - block read from file, not from the buffer pool. + @retval DB_SUCCESS or error code */ + dberr_t update_header(buf_block_t* block) UNIV_NOTHROW; + + /** + Adjust the BLOB reference for a single column that is externally stored + @param rec - record to update + @param offsets - column offsets for the record + @param i - column ordinal value + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_column( + rec_t* rec, + const ulint* offsets, + ulint i) UNIV_NOTHROW; + + /** + Adjusts the BLOB reference in the clustered index row for all + externally stored columns. 
+ @param rec - record to update + @param offsets - column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_columns( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW; + + /** + In the clustered index, adjist the BLOB pointers as needed. + Also update the BLOB reference, write the new space id. + @param rec - record to update + @param offsets - column offsets for the record + @return DB_SUCCESS or error code */ + dberr_t adjust_cluster_index_blob_ref( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW; + + /** + Purge delete-marked records, only if it is possible to do + so without re-organising the B+tree. + @param offsets - current row offsets. + @retval true if purged */ + bool purge(const ulint* offsets) UNIV_NOTHROW; + + /** + Adjust the BLOB references and sys fields for the current record. + @param index - the index being converted + @param rec - record to update + @param offsets - column offsets for the record + @param deleted - true if row is delete marked + @return DB_SUCCESS or error code. */ + dberr_t adjust_cluster_record( + const dict_index_t* index, + rec_t* rec, + const ulint* offsets, + bool deleted) UNIV_NOTHROW; + + /** + Find an index with the matching id. + @return row_index_t* instance or 0 */ + row_index_t* find_index(index_id_t id) UNIV_NOTHROW + { + row_index_t* index = &m_cfg->m_indexes[0]; + + for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) { + if (id == index->m_id) { + return(index); + } + } + + return(0); + + } +private: + /** Config for table that is being imported. */ + row_import* m_cfg; + + /** Current index whose pages are being imported */ + row_index_t* m_index; + + /** Current system LSN */ + lsn_t m_current_lsn; + + /** Alias for m_page_zip, only set for compressed pages. */ + page_zip_des_t* m_page_zip_ptr; + + /** Iterator over records in a block */ + RecIterator m_rec_iter; + + /** Record offset */ + ulint m_offsets_[REC_OFFS_NORMAL_SIZE]; + + /** Pointer to m_offsets_ */ + ulint* m_offsets; + + /** Memory heap for the record offsets */ + mem_heap_t* m_heap; + + /** Cluster index instance */ + dict_index_t* m_cluster_index; +}; + +/** +row_import destructor. */ +row_import::~row_import() UNIV_NOTHROW +{ + for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) { + delete [] m_indexes[i].m_name; + + if (m_indexes[i].m_fields == 0) { + continue; + } + + dict_field_t* fields = m_indexes[i].m_fields; + ulint n_fields = m_indexes[i].m_n_fields; + + for (ulint j = 0; j < n_fields; ++j) { + delete [] fields[j].name; + } + + delete [] fields; + } + + for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) { + delete [] m_col_names[i]; + } + + delete [] m_cols; + delete [] m_indexes; + delete [] m_col_names; + delete [] m_table_name; + delete [] m_hostname; +} + +/** +Find the index entry in in the indexes array. +@param name - index name +@return instance if found else 0. */ +row_index_t* +row_import::get_index( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_indexes; ++i) { + const char* index_name; + row_index_t* index = &m_indexes[i]; + + index_name = reinterpret_cast<const char*>(index->m_name); + + if (strcmp(index_name, name) == 0) { + + return(index); + } + } + + return(0); +} + +/** +Get the number of rows in the index. +@param name - index name +@return number of rows (doesn't include delete marked rows). 
*/ +ulint +row_import::get_n_rows( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_rows); +} + +/** +Get the number of rows for which purge failed uding the convert phase. +@param name - index name +@return number of rows for which purge failed. */ +ulint +row_import::get_n_purge_failed( + const char* name) const UNIV_NOTHROW +{ + const row_index_t* index = get_index(name); + + ut_a(name != 0); + + return(index->m_stats.m_n_purge_failed); +} + +/** +Find the ordinal value of the column name in the cfg table columns. +@param name - of column to look for. +@return ULINT_UNDEFINED if not found. */ +ulint +row_import::find_col( + const char* name) const UNIV_NOTHROW +{ + for (ulint i = 0; i < m_n_cols; ++i) { + const char* col_name; + + col_name = reinterpret_cast<const char*>(m_col_names[i]); + + if (strcmp(col_name, name) == 0) { + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/** +Find the index field entry in in the cfg indexes fields. +@name - of the index to look for +@return instance if found else 0. */ +const dict_field_t* +row_import::find_field( + const row_index_t* cfg_index, + const char* name) const UNIV_NOTHROW +{ + const dict_field_t* field = cfg_index->m_fields; + + for (ulint i = 0; i < cfg_index->m_n_fields; ++i, ++field) { + const char* field_name; + + field_name = reinterpret_cast<const char*>(field->name); + + if (strcmp(field_name, name) == 0) { + return(field); + } + } + + return(0); +} + +/** +Check if the index schema that was read from the .cfg file matches the +in memory index definition. +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_index_columns( + THD* thd, + const dict_index_t* index) UNIV_NOTHROW +{ + row_index_t* cfg_index; + dberr_t err = DB_SUCCESS; + + cfg_index = get_index(index->name); + + if (cfg_index == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s not found in tablespace meta-data file.", + index->name); + + return(DB_ERROR); + } + + cfg_index->m_srv_index = index; + + const dict_field_t* field = index->fields; + + for (ulint i = 0; i < index->n_fields; ++i, ++field) { + + const dict_field_t* cfg_field; + + cfg_field = find_field(cfg_index, field->name); + + if (cfg_field == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s not found in tablespace " + "meta-data file.", + index->name, field->name); + + err = DB_ERROR; + } else { + + if (cfg_field->prefix_len != field->prefix_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s prefix len %lu " + "doesn't match meta-data file value " + "%lu", + index->name, field->name, + (ulong) field->prefix_len, + (ulong) cfg_field->prefix_len); + + err = DB_ERROR; + } + + if (cfg_field->fixed_len != field->fixed_len) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Index %s field %s fixed len %lu " + "doesn't match meta-data file value " + "%lu", + index->name, field->name, + (ulong) field->fixed_len, + (ulong) cfg_field->fixed_len); + + err = DB_ERROR; + } + } + } + + return(err); +} + +/** +Check if the table schema that was read from the .cfg file matches the +in memory table definition. +@param thd - MySQL session variable +@return DB_SUCCESS or error code. 
*/ +dberr_t +row_import::match_table_columns( + THD* thd) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + const dict_col_t* col = m_table->cols; + + for (ulint i = 0; i < m_table->n_cols; ++i, ++col) { + + const char* col_name; + ulint cfg_col_index; + + col_name = dict_table_get_col_name( + m_table, dict_col_get_no(col)); + + cfg_col_index = find_col(col_name); + + if (cfg_col_index == ULINT_UNDEFINED) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s not found in tablespace.", + col_name); + + err = DB_ERROR; + } else if (cfg_col_index != col->ind) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordinal value mismatch, it's at " + "%lu in the table and %lu in the tablespace " + "meta-data file", + col_name, + (ulong) col->ind, (ulong) cfg_col_index); + + err = DB_ERROR; + } else { + const dict_col_t* cfg_col; + + cfg_col = &m_cols[cfg_col_index]; + ut_a(cfg_col->ind == cfg_col_index); + + if (cfg_col->prtype != col->prtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s precise type mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->mtype != col->mtype) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s main type mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->len != col->len) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s length mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->mbminmaxlen != col->mbminmaxlen) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s multi-byte len mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->ind != col->ind) { + err = DB_ERROR; + } + + if (cfg_col->ord_part != col->ord_part) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s ordering mismatch.", + col_name); + err = DB_ERROR; + } + + if (cfg_col->max_prefix != col->max_prefix) { + ib_errf(thd, + IB_LOG_LEVEL_ERROR, + ER_TABLE_SCHEMA_MISMATCH, + "Column %s max prefix mismatch.", + col_name); + err = DB_ERROR; + } + } + } + + return(err); +} + +/** +Check if the table (and index) schema that was read from the .cfg file +matches the in memory table definition. +@param thd - MySQL session variable +@return DB_SUCCESS or error code. */ +dberr_t +row_import::match_schema( + THD* thd) UNIV_NOTHROW +{ + /* Do some simple checks. */ + + if (m_flags != m_table->flags) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Table flags don't match, server table has 0x%lx " + "and the meta-data file has 0x%lx", + (ulong) m_table->n_cols, (ulong) m_flags); + + return(DB_ERROR); + } else if (m_table->n_cols != m_n_cols) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of columns don't match, table has %lu " + "columns but the tablespace meta-data file has " + "%lu columns", + (ulong) m_table->n_cols, (ulong) m_n_cols); + + return(DB_ERROR); + } else if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + /* If the number of indexes don't match then it is better + to abort the IMPORT. It is easy for the user to create a + table matching the IMPORT definition. 
*/ + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Number of indexes don't match, table has %lu " + "indexes but the tablespace meta-data file has " + "%lu indexes", + (ulong) UT_LIST_GET_LEN(m_table->indexes), + (ulong) m_n_indexes); + + return(DB_ERROR); + } + + dberr_t err = match_table_columns(thd); + + if (err != DB_SUCCESS) { + return(err); + } + + /* Check if the index definitions match. */ + + const dict_index_t* index; + + for (index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + dberr_t index_err; + + index_err = match_index_columns(thd, index); + + if (index_err != DB_SUCCESS) { + err = index_err; + } + } + + return(err); +} + +/** +Set the index root <space, pageno>, using index name. */ +void +row_import::set_root_by_name() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) { + dict_index_t* index; + + const char* index_name; + + index_name = reinterpret_cast<const char*>(cfg_index->m_name); + + index = dict_table_get_index_on_name(m_table, index_name); + + /* We've already checked that it exists. */ + ut_a(index != 0); + + /* Set the root page number and space id. */ + index->space = m_table->space; + index->page = cfg_index->m_page_no; + } +} + +/** +Set the index root <space, pageno>, using a heuristic. +@return DB_SUCCESS or error code */ +dberr_t +row_import::set_root_by_heuristic() UNIV_NOTHROW +{ + row_index_t* cfg_index = m_indexes; + + ut_a(m_n_indexes > 0); + + // TODO: For now use brute force, based on ordinality + + if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), m_table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_WARN, + "Table %s should have %lu indexes but the tablespace " + "has %lu indexes", + table_name, + UT_LIST_GET_LEN(m_table->indexes), + m_n_indexes); + } + + dict_mutex_enter_for_mysql(); + + ulint i = 0; + dberr_t err = DB_SUCCESS; + + for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + index->type |= DICT_CORRUPT; + ib_logf(IB_LOG_LEVEL_WARN, + "Skipping FTS index: %s", index->name); + } else if (i < m_n_indexes) { + + delete [] cfg_index[i].m_name; + + ulint len = strlen(index->name) + 1; + + cfg_index[i].m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_14", + delete[] cfg_index[i].m_name; + cfg_index[i].m_name = 0;); + + if (cfg_index[i].m_name == 0) { + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(cfg_index[i].m_name, index->name, len); + + cfg_index[i].m_srv_index = index; + + index->space = m_table->space; + index->page = cfg_index[i].m_page_no; + + ++i; + } + } + + dict_mutex_exit_for_mysql(); + + return(err); +} + +/** +Purge delete marked records. +@return DB_SUCCESS or error code. */ +dberr_t +IndexPurge::garbage_collect() UNIV_NOTHROW +{ + dberr_t err; + ibool comp = dict_table_is_comp(m_index->table); + + /* Open the persistent cursor and start the mini-transaction. */ + + open(); + + while ((err = next()) == DB_SUCCESS) { + + rec_t* rec = btr_pcur_get_rec(&m_pcur); + ibool deleted = rec_get_deleted_flag(rec, comp); + + if (!deleted) { + ++m_n_rows; + } else { + purge(); + } + } + + /* Close the persistent cursor and commit the mini-transaction. */ + + close(); + + return(err == DB_END_OF_INDEX ? 
DB_SUCCESS : err); +} + +/** +Begin import, position the cursor on the first record. */ +void +IndexPurge::open() UNIV_NOTHROW +{ + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_open_at_index_side( + true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr); +} + +/** +Close the persistent curosr and commit the mini-transaction. */ +void +IndexPurge::close() UNIV_NOTHROW +{ + btr_pcur_close(&m_pcur); + mtr_commit(&m_mtr); +} + +/** +Position the cursor on the next record. +@return DB_SUCCESS or error code */ +dberr_t +IndexPurge::next() UNIV_NOTHROW +{ + btr_pcur_move_to_next_on_page(&m_pcur); + + /* When switching pages, commit the mini-transaction + in order to release the latch on the old page. */ + + if (!btr_pcur_is_after_last_on_page(&m_pcur)) { + return(DB_SUCCESS); + } else if (trx_is_interrupted(m_trx)) { + /* Check after every page because the check + is expensive. */ + return(DB_INTERRUPTED); + } + + btr_pcur_store_position(&m_pcur, &m_mtr); + + mtr_commit(&m_mtr); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); + + if (!btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr)) { + + return(DB_END_OF_INDEX); + } + + return(DB_SUCCESS); +} + +/** +Store the persistent cursor position and reopen the +B-tree cursor in BTR_MODIFY_TREE mode, because the +tree structure may be changed during a pessimistic delete. */ +void +IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW +{ + dberr_t err; + + btr_pcur_restore_position(BTR_MODIFY_TREE, &m_pcur, &m_mtr); + + ut_ad(rec_get_deleted_flag( + btr_pcur_get_rec(&m_pcur), + dict_table_is_comp(m_index->table))); + + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, RB_NONE, &m_mtr); + + ut_a(err == DB_SUCCESS); + + /* Reopen the B-tree cursor in BTR_MODIFY_LEAF mode */ + mtr_commit(&m_mtr); +} + +/** +Purge delete-marked records. */ +void +IndexPurge::purge() UNIV_NOTHROW +{ + btr_pcur_store_position(&m_pcur, &m_mtr); + + purge_pessimistic_delete(); + + mtr_start(&m_mtr); + + mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO); + + btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr); +} + +/** +Constructor +* @param cfg - config of table being imported. 
+* @param trx - transaction covering the import */ +PageConverter::PageConverter( + row_import* cfg, + trx_t* trx) + : + AbstractCallback(trx), + m_cfg(cfg), + m_page_zip_ptr(0), + m_heap(0) UNIV_NOTHROW +{ + m_index = m_cfg->m_indexes; + + m_current_lsn = log_get_lsn(); + ut_a(m_current_lsn > 0); + + m_offsets = m_offsets_; + rec_offs_init(m_offsets_); + + m_cluster_index = dict_table_get_first_index(m_cfg->m_table); +} + +/** +Adjust the BLOB reference for a single column that is externally stored +@param rec - record to update +@param offsets - column offsets for the record +@param i - column ordinal value +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_column( + rec_t* rec, + const ulint* offsets, + ulint i) UNIV_NOTHROW +{ + ulint len; + byte* field; + + field = rec_get_nth_field(rec, offsets, i, &len); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_2", + len = BTR_EXTERN_FIELD_REF_SIZE - 1;); + + if (len < BTR_EXTERN_FIELD_REF_SIZE) { + + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), + m_cluster_index->name, TRUE); + + ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Externally stored column(%lu) has a reference " + "length of %lu in the cluster index %s", + (ulong) i, (ulong) len, index_name); + + return(DB_CORRUPTION); + } + + field += BTR_EXTERN_SPACE_ID - BTR_EXTERN_FIELD_REF_SIZE + len; + + if (is_compressed_table()) { + mach_write_to_4(field, get_space_id()); + + page_zip_write_blob_ptr( + m_page_zip_ptr, rec, m_cluster_index, offsets, i, 0); + } else { + mlog_write_ulint(field, get_space_id(), MLOG_4BYTES, 0); + } + + return(DB_SUCCESS); +} + +/** +Adjusts the BLOB reference in the clustered index row for all externally +stored columns. +@param rec - record to update +@param offsets - column offsets for the record +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_columns( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW +{ + ut_ad(rec_offs_any_extern(offsets)); + + /* Adjust the space_id in the BLOB pointers. */ + + for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) { + + /* Only if the column is stored "externally". */ + + if (rec_offs_nth_extern(offsets, i)) { + dberr_t err; + + err = adjust_cluster_index_blob_column(rec, offsets, i); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** +In the clustered index, adjust BLOB pointers as needed. Also update the +BLOB reference, write the new space id. +@param rec - record to update +@param offsets - column offsets for the record +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::adjust_cluster_index_blob_ref( + rec_t* rec, + const ulint* offsets) UNIV_NOTHROW +{ + if (rec_offs_any_extern(offsets)) { + dberr_t err; + + err = adjust_cluster_index_blob_columns(rec, offsets); + + if (err != DB_SUCCESS) { + return(err); + } + } + + return(DB_SUCCESS); +} + +/** +Purge delete-marked records, only if it is possible to do so without +re-organising the B+tree. +@param offsets - current row offsets. +@return true if purge succeeded */ +bool +PageConverter::purge(const ulint* offsets) UNIV_NOTHROW +{ + const dict_index_t* index = m_index->m_srv_index; + + /* We can't have a page that is empty and not root. 
*/ + if (m_rec_iter.remove(index, m_page_zip_ptr, m_offsets)) { + + ++m_index->m_stats.m_n_purged; + + return(true); + } else { + ++m_index->m_stats.m_n_purge_failed; + } + + return(false); +} + +/** +Adjust the BLOB references and sys fields for the current record. +@param rec - record to update +@param offsets - column offsets for the record +@param deleted - true if row is delete marked +@return DB_SUCCESS or error code. */ +dberr_t +PageConverter::adjust_cluster_record( + const dict_index_t* index, + rec_t* rec, + const ulint* offsets, + bool deleted) UNIV_NOTHROW +{ + dberr_t err; + + if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) { + + /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields + are only written in conjunction with other changes to the + record. */ + + row_upd_rec_sys_fields( + rec, m_page_zip_ptr, m_cluster_index, m_offsets, + m_trx, 0); + } + + return(err); +} + +/** +Update the BLOB refrences and write UNDO log entries for +rows that can't be purged optimistically. +@param block - block to update +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_records( + buf_block_t* block) UNIV_NOTHROW +{ + ibool comp = dict_table_is_comp(m_cfg->m_table); + bool clust_index = m_index->m_srv_index == m_cluster_index; + + /* This will also position the cursor on the first user record. */ + + m_rec_iter.open(block); + + while (!m_rec_iter.end()) { + + rec_t* rec = m_rec_iter.current(); + + /* FIXME: Move out of the loop */ + + if (rec_get_status(rec) == REC_STATUS_NODE_PTR) { + break; + } + + ibool deleted = rec_get_deleted_flag(rec, comp); + + /* For the clustered index we have to adjust the BLOB + reference and the system fields irrespective of the + delete marked flag. The adjustment of delete marked + cluster records is required for purge to work later. */ + + if (deleted || clust_index) { + m_offsets = rec_get_offsets( + rec, m_index->m_srv_index, m_offsets, + ULINT_UNDEFINED, &m_heap); + } + + if (clust_index) { + + dberr_t err = adjust_cluster_record( + m_index->m_srv_index, rec, m_offsets, + deleted); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* If it is a delete marked record then try an + optimistic delete. */ + + if (deleted) { + /* A successful purge will move the cursor to the + next record. */ + + if (!purge(m_offsets)) { + m_rec_iter.next(); + } + + ++m_index->m_stats.m_n_deleted; + } else { + ++m_index->m_stats.m_n_rows; + m_rec_iter.next(); + } + } + + return(DB_SUCCESS); +} + +/** +Update the space, index id, trx id. +@return DB_SUCCESS or error code */ +dberr_t +PageConverter::update_index_page( + buf_block_t* block) UNIV_NOTHROW +{ + index_id_t id; + buf_frame_t* page = block->frame; + + if (is_free(buf_block_get_page_no(block))) { + return(DB_SUCCESS); + } else if ((id = btr_page_get_index_id(page)) != m_index->m_id) { + + row_index_t* index = find_index(id); + + if (index == 0) { + m_index = 0; + return(DB_CORRUPTION); + } + + /* Update current index */ + m_index = index; + } + + /* If the .cfg file is missing and there is an index mismatch + then ignore the error. */ + if (m_cfg->m_missing && (m_index == 0 || m_index->m_srv_index == 0)) { + return(DB_SUCCESS); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!is_compressed_table() + || page_zip_validate(m_page_zip_ptr, page, m_index->m_srv_index)); +#endif /* UNIV_ZIP_DEBUG */ + + /* This has to be written to uncompressed index header. Set it to + the current index id. 
*/ + btr_page_set_index_id( + page, m_page_zip_ptr, m_index->m_srv_index->id, 0); + + page_set_max_trx_id(block, m_page_zip_ptr, m_trx->id, 0); + + if (page_get_n_recs(block->frame) == 0) { + + /* Only a root page can be empty. */ + if (!is_root_page(block->frame)) { + // TODO: We should relax this and skip secondary + // indexes. Mark them as corrupt because they can + // always be rebuilt. + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); + } + + return(update_records(block)); +} + +/** +Validate the space flags and update tablespace header page. +@param block - block read from file, not from the buffer pool. +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_header( + buf_block_t* block) UNIV_NOTHROW +{ + /* Check for valid header */ + switch(fsp_header_get_space_id(get_frame(block))) { + case 0: + return(DB_CORRUPTION); + case ULINT_UNDEFINED: + ib_logf(IB_LOG_LEVEL_WARN, + "Space id check in the header failed " + "- ignored"); + } + + ulint space_flags = fsp_header_get_flags(get_frame(block)); + + if (!fsp_flags_is_valid(space_flags)) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unsupported tablespace format %lu", + (ulong) space_flags); + + return(DB_UNSUPPORTED); + } + + mach_write_to_8( + get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN, m_current_lsn); + + /* Write space_id to the tablespace header, page 0. */ + mach_write_to_4( + get_frame(block) + FSP_HEADER_OFFSET + FSP_SPACE_ID, + get_space_id()); + + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, + get_space_id()); + + return(DB_SUCCESS); +} + +/** +Update the page, set the space id, max trx id and index id. +@param block - block read from file +@retval DB_SUCCESS or error code */ +dberr_t +PageConverter::update_page( + buf_block_t* block, + ulint& page_type) UNIV_NOTHROW +{ + dberr_t err = DB_SUCCESS; + + switch (page_type = fil_page_get_type(get_frame(block))) { + case FIL_PAGE_TYPE_FSP_HDR: + /* Work directly on the uncompressed page headers. */ + ut_a(buf_block_get_page_no(block) == 0); + return(update_header(block)); + + case FIL_PAGE_INDEX: + /* We need to decompress the contents into block->frame + before we can do any thing with Btree pages. */ + + if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) { + return(DB_CORRUPTION); + } + + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + /* Only update the Btree nodes. */ + return(update_index_page(block)); + + case FIL_PAGE_TYPE_SYS: + /* This is page 0 in the system tablespace. */ + return(DB_CORRUPTION); + + case FIL_PAGE_TYPE_XDES: + err = set_current_xdes( + buf_block_get_page_no(block), get_frame(block)); + case FIL_PAGE_INODE: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + + /* Work directly on the uncompressed page headers. */ + /* This is on every page in the tablespace. */ + mach_write_to_4( + get_frame(block) + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id()); + + return(err); + } + + ib_logf(IB_LOG_LEVEL_WARN, "Unknown page type (%lu)", page_type); + + return(DB_CORRUPTION); +} + +/** +Validate the page +@param offset - physical offset within file. +@param page - page read from file. 
+@return status */ +PageConverter::import_page_status_t +PageConverter::validate( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + buf_frame_t* page = get_frame(block); + + /* Check that the page number corresponds to the offset in + the file. Flag as corrupt if it doesn't. Disable the check + for LSN in buf_page_is_corrupted() */ + + if (buf_page_is_corrupted(false, page, get_zip_size()) + || (page_get_page_no(page) != offset / m_page_size + && page_get_page_no(page) != 0)) { + + return(IMPORT_PAGE_STATUS_CORRUPTED); + + } else if (offset > 0 && page_get_page_no(page) == 0) { + const byte* b = page; + const byte* e = b + m_page_size; + + /* If the page number is zero and offset > 0 then + the entire page MUST consist of zeroes. If not then + we flag it as corrupt. */ + + while (b != e) { + + if (*b++ && !trigger_corruption()) { + return(IMPORT_PAGE_STATUS_CORRUPTED); + } + } + + /* The page is all zero: do nothing. */ + return(IMPORT_PAGE_STATUS_ALL_ZERO); + } + + return(IMPORT_PAGE_STATUS_OK); +} + +/** +Called for every page in the tablespace. If the page was not +updated then its state must be set to BUF_PAGE_NOT_USED. +@param offset - physical offset within the file +@param block - block read from file, note it is not from the buffer pool +@retval DB_SUCCESS or error code. */ +dberr_t +PageConverter::operator() ( + os_offset_t offset, + buf_block_t* block) UNIV_NOTHROW +{ + ulint page_type; + dberr_t err = DB_SUCCESS; + + if ((err = periodic_check()) != DB_SUCCESS) { + return(err); + } + + if (is_compressed_table()) { + m_page_zip_ptr = &block->page.zip; + } else { + ut_ad(m_page_zip_ptr == 0); + } + + switch(validate(offset, block)) { + case IMPORT_PAGE_STATUS_OK: + + /* We have to decompress the compressed pages before + we can work on them */ + + if ((err = update_page(block, page_type)) != DB_SUCCESS) { + return(err); + } + + /* Note: For compressed pages this function will write to the + zip descriptor and for uncompressed pages it will write to + page (ie. the block->frame). Therefore the caller should write + out the descriptor contents and not block->frame for compressed + pages. */ + + if (!is_compressed_table() || page_type == FIL_PAGE_INDEX) { + + buf_flush_init_for_writing( + !is_compressed_table() + ? block->frame : block->page.zip.data, + !is_compressed_table() ? 0 : m_page_zip_ptr, + m_current_lsn); + } else { + /* Calculate and update the checksum of non-btree + pages for compressed tables explicitly here. */ + + buf_flush_update_zip_checksum( + get_frame(block), get_zip_size(), + m_current_lsn); + } + + break; + + case IMPORT_PAGE_STATUS_ALL_ZERO: + /* The page is all zero: leave it as is. */ + break; + + case IMPORT_PAGE_STATUS_CORRUPTED: + + ib_logf(IB_LOG_LEVEL_WARN, + "%s: Page %lu at offset " UINT64PF " looks corrupted.", + m_filepath, (ulong) (offset / m_page_size), offset); + + return(DB_CORRUPTION); + } + + return(err); +} + +/*****************************************************************//** +Clean up after import tablespace failure, this function will acquire +the dictionary latches on behalf of the transaction if the transaction +hasn't already acquired them. 
*/ +static __attribute__((nonnull)) +void +row_import_discard_changes( +/*=======================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + dict_table_t* table = prebuilt->table; + + ut_a(err != DB_SUCCESS); + + prebuilt->trx->error_info = NULL; + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Discarding tablespace of table %s: %s", + table_name, ut_strerr(err)); + + if (trx->dict_operation_lock_mode != RW_X_LATCH) { + ut_a(trx->dict_operation_lock_mode == 0); + row_mysql_lock_data_dictionary(trx); + } + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + /* Since we update the index root page numbers on disk after + we've done a successful import. The table will not be loadable. + However, we need to ensure that the in memory root page numbers + are reset to "NULL". */ + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + index->page = FIL_NULL; + index->space = FIL_NULL; + } + + table->ibd_file_missing = TRUE; + + fil_close_tablespace(trx, table->space); +} + +/*****************************************************************//** +Clean up after import tablespace. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_cleanup( +/*===============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + ut_a(prebuilt->trx != trx); + + if (err != DB_SUCCESS) { + row_import_discard_changes(prebuilt, trx, err); + } + + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + + DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE();); + + trx_commit_for_mysql(trx); + + row_mysql_unlock_data_dictionary(trx); + + trx_free_for_mysql(trx); + + prebuilt->trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE();); + + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + + return(err); +} + +/*****************************************************************//** +Report error during tablespace import. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_error( +/*=============*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ + trx_t* trx, /*!< in/out: transaction for import */ + dberr_t err) /*!< in: error code */ +{ + if (!trx_is_interrupted(trx)) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), + prebuilt->table->name, FALSE); + + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_INNODB_IMPORT_ERROR, + table_name, (ulong) err, ut_strerr(err)); + } + + return(row_import_cleanup(prebuilt, trx, err)); +} + +/*****************************************************************//** +Adjust the root page index node and leaf node segment headers, update +with the new space id. For all the table's secondary indexes. 
+@return error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_adjust_root_pages_of_secondary_indexes( +/*==============================================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + trx_t* trx, /*!< in: transaction used for + the import */ + dict_table_t* table, /*!< in: table the indexes + belong to */ + const row_import& cfg) /*!< Import context */ +{ + dict_index_t* index; + ulint n_rows_in_table; + dberr_t err = DB_SUCCESS; + + /* Skip the clustered index. */ + index = dict_table_get_first_index(table); + + n_rows_in_table = cfg.get_n_rows(index->name); + + DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure", + n_rows_in_table++;); + + /* Adjust the root pages of the secondary indexes only. */ + while ((index = dict_table_get_next_index(index)) != NULL) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), index->name, TRUE); + + ut_a(!dict_index_is_clust(index)); + + if (!(index->type & DICT_CORRUPT) + && index->space != FIL_NULL + && index->page != FIL_NULL) { + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + } else { + ib_logf(IB_LOG_LEVEL_WARN, + "Skip adjustment of root pages for " + "index %s.", index->name); + + err = DB_CORRUPTION; + } + + if (err != DB_SUCCESS) { + + if (index->type & DICT_CLUSTERED) { + break; + } + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' not found or corrupt, " + "you should recreate this index.", + index_name); + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + index->type |= DICT_CORRUPT; + continue; + } + + /* If we failed to purge any records in the index then + do it the hard way. + + TODO: We can do this in the first pass by generating UNDO log + records for the failed rows. */ + + if (!cfg.requires_purge(index->name)) { + continue; + } + + IndexPurge purge(trx, index); + + trx->op_info = "secondary: purge delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + break; + } else if (purge.get_n_rows() != n_rows_in_table) { + + ib_errf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' contains %lu entries, " + "should be %lu, you should recreate " + "this index.", index_name, + (ulong) purge.get_n_rows(), + (ulong) n_rows_in_table); + + index->type |= DICT_CORRUPT; + + /* Do not bail out, so that the data + can be recovered. */ + + err = DB_SUCCESS; + } + } + + return(err); +} + +/*****************************************************************//** +Ensure that dict_sys->row_id exceeds SELECT MAX(DB_ROW_ID). 
+@return error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_set_sys_max_row_id( +/*==========================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from + handler */ + const dict_table_t* table) /*!< in: table to import */ +{ + dberr_t err; + const rec_t* rec; + mtr_t mtr; + btr_pcur_t pcur; + row_id_t row_id = 0; + dict_index_t* index; + + index = dict_table_get_first_index(table); + ut_a(dict_index_is_clust(index)); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + btr_pcur_open_at_index_side( + false, // High end + index, + BTR_SEARCH_LEAF, + &pcur, + true, // Init cursor + 0, // Leaf level + &mtr); + + btr_pcur_move_to_prev_on_page(&pcur); + rec = btr_pcur_get_rec(&pcur); + + /* Check for empty table. */ + if (!page_rec_is_infimum(rec)) { + ulint len; + const byte* field; + mem_heap_t* heap = NULL; + ulint offsets_[1 + REC_OFFS_HEADER_SIZE]; + ulint* offsets; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); + + field = rec_get_nth_field( + rec, offsets, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), + &len); + + if (len == DATA_ROW_ID_LEN) { + row_id = mach_read_from_6(field); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + + if (heap != NULL) { + mem_heap_free(heap); + } + } else { + /* The table is empty. */ + err = DB_SUCCESS; + } + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + DBUG_EXECUTE_IF("ib_import_set_max_rowid_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), index->name, TRUE); + + ib_errf(prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, + ER_INNODB_INDEX_CORRUPT, + "Index '%s' corruption detected, invalid DB_ROW_ID " + "in index.", index_name); + + return(err); + + } else if (row_id > 0) { + + /* Update the system row id if the imported index row id is + greater than the max system row id. */ + + mutex_enter(&dict_sys->mutex); + + if (row_id >= dict_sys->row_id) { + dict_sys->row_id = row_id + 1; + dict_hdr_flush_row_id(); + } + + mutex_exit(&dict_sys->mutex); + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the a string from the meta data file. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_import_cfg_read_string( +/*=======================*/ + FILE* file, /*!< in/out: File to read from */ + byte* ptr, /*!< out: string to read */ + ulint max_len) /*!< in: maximum length of the output + buffer in bytes */ +{ + DBUG_EXECUTE_IF("ib_import_string_read_error", + errno = EINVAL; return(DB_IO_ERROR);); + + ulint len = 0; + + while (!feof(file)) { + int ch = fgetc(file); + + if (ch == EOF) { + break; + } else if (ch != 0) { + if (len < max_len) { + ptr[len++] = ch; + } else { + break; + } + /* max_len includes the NUL byte */ + } else if (len != max_len - 1) { + break; + } else { + ptr[len] = 0; + return(DB_SUCCESS); + } + } + + errno = EINVAL; + + return(DB_IO_ERROR); +} + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_cfg_read_index_fields( +/*=============================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_index_t* index, /*!< Index being read in */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t) * 3]; + ulint n_fields = index->m_n_fields; + + index->m_fields = new(std::nothrow) dict_field_t[n_fields]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_4", + delete [] index->m_fields; index->m_fields = 0;); + + if (index->m_fields == 0) { + return(DB_OUT_OF_MEMORY); + } + + dict_field_t* field = index->m_fields; + + memset(field, 0x0, sizeof(*field) * n_fields); + + for (ulint i = 0; i < n_fields; ++i, ++field) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_1", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading index fields."); + + return(DB_IO_ERROR); + } + + field->prefix_len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + field->fixed_len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* Include the NUL byte in the length. */ + ulint len = mach_read_from_4(ptr); + + byte* name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_5", delete [] name; name = 0;); + + if (name == 0) { + return(DB_OUT_OF_MEMORY); + } + + field->name = reinterpret_cast<const char*>(name); + + dberr_t err = row_import_cfg_read_string(file, name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the index names and root page numbers of the indexes and set the values. +Row format [root_page_no, len of str, str ... ] +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_index_data( +/*=======================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte* ptr; + row_index_t* cfg_index; + byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9]; + + /* FIXME: What is the max value? */ + ut_a(cfg->m_n_indexes > 0); + ut_a(cfg->m_n_indexes < 1024); + + cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_6", + delete [] cfg->m_indexes; cfg->m_indexes = 0;); + + if (cfg->m_indexes == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes); + + cfg_index = cfg->m_indexes; + + for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) { + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_2", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the index data. 
*/ + size_t n_bytes = fread(row, 1, sizeof(row), file); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error", + (void) fseek(file, 0L, SEEK_END);); + + if (n_bytes != sizeof(row)) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), + "while reading index meta-data, expected " + "to read %lu bytes but read only %lu " + "bytes", + (ulong) sizeof(row), (ulong) n_bytes); + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), msg); + + ib_logf(IB_LOG_LEVEL_ERROR, "IO Error: %s", msg); + + return(DB_IO_ERROR); + } + + ptr = row; + + cfg_index->m_id = mach_read_from_8(ptr); + ptr += sizeof(index_id_t); + + cfg_index->m_space = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_page_no = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_type = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_trx_id_offset = mach_read_from_4(ptr); + if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) { + ut_ad(0); + /* Overflow. Pretend that the clustered index + has a variable-length PRIMARY KEY. */ + cfg_index->m_trx_id_offset = 0; + } + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_uniq = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_nullable = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg_index->m_n_fields = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* The NUL byte is included in the name length. */ + ulint len = mach_read_from_4(ptr); + + if (len > OS_FILE_MAX_PATH) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Index name length (%lu) is too long, " + "the meta-data is corrupt", len); + + return(DB_CORRUPTION); + } + + cfg_index->m_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_7", + delete [] cfg_index->m_name; + cfg_index->m_name = 0;); + + if (cfg_index->m_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string(file, cfg_index->m_name, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing index name."); + + return(err); + } + + err = row_import_cfg_read_index_fields( + file, thd, cfg_index, cfg); + + if (err != DB_SUCCESS) { + return(err); + } + + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Set the index root page number for v1 format. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_import_read_indexes( +/*====================*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_3", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the number of indexes. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading number of indexes."); + + return(DB_IO_ERROR); + } + + cfg->m_n_indexes = mach_read_from_4(row); + + if (cfg->m_n_indexes == 0) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is 0"); + + return(DB_CORRUPTION); + + } else if (cfg->m_n_indexes > 1024) { + // FIXME: What is the upper limit? 
*/ + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Number of indexes in meta-data file is too high: %lu", + (ulong) cfg->m_n_indexes); + cfg->m_n_indexes = 0; + + return(DB_CORRUPTION); + } + + return(row_import_read_index_data(file, thd, cfg)); +} + +/*********************************************************************//** +Read the meta data (table columns) config file. Deserialise the contents of +dict_col_t structure, along with the column name. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_columns( +/*====================*/ + FILE* file, /*!< in: file to write to */ + THD* thd, /*!< in/out: session */ + row_import* cfg) /*!< in/out: meta-data read */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 8]; + + /* FIXME: What should the upper limit be? */ + ut_a(cfg->m_n_cols > 0); + ut_a(cfg->m_n_cols < 1024); + + cfg->m_cols = new(std::nothrow) dict_col_t[cfg->m_n_cols]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_8", + delete [] cfg->m_cols; cfg->m_cols = 0;); + + if (cfg->m_cols == 0) { + return(DB_OUT_OF_MEMORY); + } + + cfg->m_col_names = new(std::nothrow) byte* [cfg->m_n_cols]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_9", + delete [] cfg->m_col_names; cfg->m_col_names = 0;); + + if (cfg->m_col_names == 0) { + return(DB_OUT_OF_MEMORY); + } + + memset(cfg->m_cols, 0x0, sizeof(cfg->m_cols) * cfg->m_n_cols); + memset(cfg->m_col_names, 0x0, sizeof(cfg->m_col_names) * cfg->m_n_cols); + + col = cfg->m_cols; + + for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) { + byte* ptr = row; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_4", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading table column meta-data."); + + return(DB_IO_ERROR); + } + + col->prtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mtype = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->len = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->mbminmaxlen = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->ind = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->ord_part = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + col->max_prefix = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + /* Read in the column name as [len, byte array]. The len + includes the NUL byte. */ + + ulint len = mach_read_from_4(ptr); + + /* FIXME: What is the maximum column name length? */ + if (len == 0 || len > 128) { + ib_errf(thd, IB_LOG_LEVEL_ERROR, + ER_IO_READ_ERROR, + "Column name length %lu, is invalid", + (ulong) len); + + return(DB_CORRUPTION); + } + + cfg->m_col_names[i] = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_10", + delete [] cfg->m_col_names[i]; + cfg->m_col_names[i] = 0;); + + if (cfg->m_col_names[i] == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err; + + err = row_import_cfg_read_string( + file, cfg->m_col_names[i], len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table column name."); + + return(err); + } + } + + return(DB_SUCCESS); +} + +/*****************************************************************//** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. 
*/ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_v1( +/*===============*/ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import* cfg) /*!< out: meta data */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_5", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the hostname where the tablespace was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data export hostname length."); + + return(DB_IO_ERROR); + } + + ulint len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_hostname = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_1", + delete [] cfg->m_hostname; cfg->m_hostname = 0;); + + if (cfg->m_hostname == 0) { + return(DB_OUT_OF_MEMORY); + } + + dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len); + + if (err != DB_SUCCESS) { + + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing export hostname."); + + return(err); + } + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_6", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the table name of tablespace that was exported. */ + if (fread(value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data table name length."); + + return(DB_IO_ERROR); + } + + len = mach_read_from_4(value); + + /* NUL byte is part of name length. */ + cfg->m_table_name = new(std::nothrow) byte[len]; + + /* Trigger OOM */ + DBUG_EXECUTE_IF("ib_import_OOM_2", + delete [] cfg->m_table_name; cfg->m_table_name = 0;); + + if (cfg->m_table_name == 0) { + return(DB_OUT_OF_MEMORY); + } + + err = row_import_cfg_read_string(file, cfg->m_table_name, len); + + if (err != DB_SUCCESS) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while parsing table name."); + + return(err); + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Importing tablespace for table '%s' that was exported " + "from host '%s'", cfg->m_table_name, cfg->m_hostname); + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_7", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the autoinc value. */ + if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading autoinc value."); + + return(DB_IO_ERROR); + } + + cfg->m_autoinc = mach_read_from_8(row); + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_8", + (void) fseek(file, 0L, SEEK_END);); + + /* Read the tablespace page size. */ + if (fread(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data header."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + cfg->m_page_size = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + if (cfg->m_page_size != UNIV_PAGE_SIZE) { + + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH, + "Tablespace to be imported has a different " + "page size than this server. 
Server page size " + "is %lu, whereas tablespace page size is %lu", + UNIV_PAGE_SIZE, (ulong) cfg->m_page_size); + + return(DB_ERROR); + } + + cfg->m_flags = mach_read_from_4(ptr); + ptr += sizeof(ib_uint32_t); + + cfg->m_n_cols = mach_read_from_4(ptr); + + if (!dict_tf_is_valid(cfg->m_flags)) { + + return(DB_CORRUPTION); + + } else if ((err = row_import_read_columns(file, thd, cfg)) + != DB_SUCCESS) { + + return(err); + + } else if ((err = row_import_read_indexes(file, thd, cfg)) + != DB_SUCCESS) { + + return(err); + } + + ut_a(err == DB_SUCCESS); + return(err); +} + +/** +Read the contents of the <tablespace>.cfg file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_meta_data( +/*======================*/ + dict_table_t* table, /*!< in: table */ + FILE* file, /*!< in: File to read from */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + byte row[sizeof(ib_uint32_t)]; + + /* Trigger EOF */ + DBUG_EXECUTE_IF("ib_import_io_read_error_9", + (void) fseek(file, 0L, SEEK_END);); + + if (fread(&row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + errno, strerror(errno), + "while reading meta-data version."); + + return(DB_IO_ERROR); + } + + cfg.m_version = mach_read_from_4(row); + + /* Check the version number. */ + switch (cfg.m_version) { + case IB_EXPORT_CFG_VERSION_V1: + + return(row_import_read_v1(file, thd, &cfg)); + default: + ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR, + "Unsupported meta-data version number (%lu), " + "file ignored", (ulong) cfg.m_version); + } + + return(DB_ERROR); +} + +/** +Read the contents of the <tablename>.cfg file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_import_read_cfg( +/*================*/ + dict_table_t* table, /*!< in: table */ + THD* thd, /*!< in: session */ + row_import& cfg) /*!< out: contents of the .cfg file */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + cfg.m_table = table; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + FILE* file = fopen(name, "rb"); + + if (file == NULL) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), + "Error opening '%s', will attempt to import " + "without schema verification", name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR, + errno, strerror(errno), msg); + + cfg.m_missing = true; + + err = DB_FAIL; + } else { + + cfg.m_missing = false; + + err = row_import_read_meta_data(table, file, thd, cfg); + fclose(file); + } + + return(err); +} + +/*****************************************************************//** +Update the <space, root page> of a table's indexes from the values +in the data dictionary. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_import_update_index_root( +/*=========================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + const dict_table_t* table, /*!< in: Table for which we want + to set the root page_no */ + bool reset, /*!< in: if true then set to + FIL_NUL */ + bool dict_locked) /*!< in: Set to true if the + caller already owns the + dict_sys_t:: mutex. 
*/ + +{ + const dict_index_t* index; + que_t* graph = 0; + dberr_t err = DB_SUCCESS; + + static const char sql[] = { + "PROCEDURE UPDATE_INDEX_ROOT() IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES\n" + "SET SPACE = :space,\n" + " PAGE_NO = :page,\n" + " TYPE = :type\n" + "WHERE TABLE_ID = :table_id AND ID = :index_id;\n" + "END;\n"}; + + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + + for (index = dict_table_get_first_index(table); + index != 0; + index = dict_table_get_next_index(index)) { + + pars_info_t* info; + ib_uint32_t page; + ib_uint32_t space; + ib_uint32_t type; + index_id_t index_id; + table_id_t table_id; + + info = (graph != 0) ? graph->info : pars_info_create(); + + mach_write_to_4( + reinterpret_cast<byte*>(&type), + index->type); + + mach_write_to_4( + reinterpret_cast<byte*>(&page), + reset ? FIL_NULL : index->page); + + mach_write_to_4( + reinterpret_cast<byte*>(&space), + reset ? FIL_NULL : index->space); + + mach_write_to_8( + reinterpret_cast<byte*>(&index_id), + index->id); + + mach_write_to_8( + reinterpret_cast<byte*>(&table_id), + table->id); + + /* If we set the corrupt bit during the IMPORT phase then + we need to update the system tables. */ + pars_info_bind_int4_literal(info, "type", &type); + pars_info_bind_int4_literal(info, "space", &space); + pars_info_bind_int4_literal(info, "page", &page); + pars_info_bind_ull_literal(info, "index_id", &index_id); + pars_info_bind_ull_literal(info, "table_id", &table_id); + + if (graph == 0) { + graph = pars_sql(info, sql); + ut_a(graph); + graph->trx = trx; + } + + que_thr_t* thr; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + DBUG_EXECUTE_IF("ib_import_internal_error", + trx->error_state = DB_ERROR;); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + index_name, sizeof(index_name), + index->name, TRUE); + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "While updating the <space, root page " + "number> of index %s - %s", + index_name, ut_strerr(err)); + + break; + } + } + + que_graph_free(graph); + + if (!dict_locked) { + mutex_exit(&dict_sys->mutex); + } + + return(err); +} + +/** Callback arg for row_import_set_discarded. */ +struct discard_t { + ib_uint32_t flags2; /*!< Value read from column */ + bool state; /*!< New state of the flag */ + ulint n_recs; /*!< Number of recs processed */ +}; + +/******************************************************************//** +Fetch callback that sets or unsets the DISCARDED tablespace flag in +SYS_TABLES. The flags is stored in MIX_LEN column. +@return FALSE if all OK */ +static +ibool +row_import_set_discarded( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: bool set/unset flag */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + discard_t* discard = static_cast<discard_t*>(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == sizeof(ib_uint32_t)); + + ulint flags2 = mach_read_from_4( + static_cast<byte*>(dfield_get_data(dfield))); + + if (discard->state) { + flags2 |= DICT_TF2_DISCARDED; + } else { + flags2 &= ~DICT_TF2_DISCARDED; + } + + mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2); + + ++discard->n_recs; + + /* There should be at most one matching record. 
*/ + ut_a(discard->n_recs == 1); + + return(FALSE); +} + +/*****************************************************************//** +Update the DICT_TF2_DISCARDED flag in SYS_TABLES. +@return DB_SUCCESS or error code. */ +UNIV_INTERN +dberr_t +row_import_update_discarded_flag( +/*=============================*/ + trx_t* trx, /*!< in/out: transaction that + covers the update */ + table_id_t table_id, /*!< in: Table for which we want + to set the root table->flags2 */ + bool discarded, /*!< in: set MIX_LEN column bit + to discarded, if true */ + bool dict_locked) /*!< in: set to true if the + caller already owns the + dict_sys_t:: mutex. */ + +{ + pars_info_t* info; + discard_t discard; + + static const char sql[] = + "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS\n" + " SELECT MIX_LEN " + " FROM SYS_TABLES " + " WHERE ID = :table_id FOR UPDATE;" + "\n" + "BEGIN\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "UPDATE SYS_TABLES" + " SET MIX_LEN = :flags2" + " WHERE ID = :table_id;\n" + "CLOSE c;\n" + "END;\n"; + + discard.n_recs = 0; + discard.state = discarded; + discard.flags2 = ULINT32_UNDEFINED; + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "table_id", table_id); + pars_info_bind_int4_literal(info, "flags2", &discard.flags2); + + pars_info_bind_function( + info, "my_func", row_import_set_discarded, &discard); + + dberr_t err = que_eval_sql(info, sql, !dict_locked, trx); + + ut_a(discard.n_recs == 1); + ut_a(discard.flags2 != ULINT32_UNDEFINED); + + return(err); +} + +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_import_for_mysql( +/*=================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ +{ + dberr_t err; + trx_t* trx; + ib_uint64_t autoinc = 0; + char table_name[MAX_FULL_NAME_LEN + 1]; + char* filepath = NULL; + + ut_ad(!srv_read_only_mode); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ut_a(table->space); + ut_ad(prebuilt->trx); + ut_a(table->ibd_file_missing); + + trx_start_if_not_started(prebuilt->trx); + + trx = trx_allocate_for_mysql(); + + /* So that the table is not DROPped during recovery. */ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); + + trx_start_if_not_started(trx); + + /* So that we can send error messages to the user. */ + trx->mysql_thd = prebuilt->trx->mysql_thd; + + /* Ensure that the table will be dropped by trx_rollback_active() + in case of a crash. */ + + trx->table_id = table->id; + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + mutex_enter(&trx->undo_mutex); + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + mutex_exit(&trx->undo_mutex); + + DBUG_EXECUTE_IF("ib_import_undo_assign_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err != DB_SUCCESS) { + + return(row_import_cleanup(prebuilt, trx, err)); + + } else if (trx->update_undo == 0) { + + err = DB_TOO_MANY_CONCURRENT_TRXS; + return(row_import_cleanup(prebuilt, trx, err)); + } + + prebuilt->trx->op_info = "read meta-data file"; + + /* Prevent DDL operations while we are checking. 
*/ + + rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + + row_import cfg; + + memset(&cfg, 0x0, sizeof(cfg)); + + err = row_import_read_cfg(table, trx->mysql_thd, cfg); + + /* Check if the table column definitions match the contents + of the config file. */ + + if (err == DB_SUCCESS) { + + /* We have a schema file, try and match it with the our + data dictionary. */ + + err = cfg.match_schema(trx->mysql_thd); + + /* Update index->page and SYS_INDEXES.PAGE_NO to match the + B-tree root page numbers in the tablespace. Use the index + name from the .cfg file to find match. */ + + if (err == DB_SUCCESS) { + cfg.set_root_by_name(); + autoinc = cfg.m_autoinc; + } + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + DBUG_EXECUTE_IF("ib_import_set_index_root_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + } else if (cfg.m_missing) { + + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + + /* We don't have a schema file, we will have to discover + the index root pages from the .ibd file and skip the schema + matching step. */ + + ut_a(err == DB_FAIL); + + cfg.m_page_size = UNIV_PAGE_SIZE; + + FetchIndexRootPages fetchIndexRootPages(table, trx); + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_page_size), + fetchIndexRootPages); + + if (err == DB_SUCCESS) { + + err = fetchIndexRootPages.build_row_import(&cfg); + + /* Update index->page and SYS_INDEXES.PAGE_NO + to match the B-tree root page numbers in the + tablespace. */ + + if (err == DB_SUCCESS) { + err = cfg.set_root_by_heuristic(); + } + } + + } else { + rw_lock_s_unlock_gen(&dict_operation_lock, 0); + } + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + prebuilt->trx->op_info = "importing tablespace"; + + ib_logf(IB_LOG_LEVEL_INFO, "Phase I - Update all pages"); + + /* Iterate over all the pages and do the sanity checking and + the conversion required to import the tablespace. */ + + PageConverter converter(&cfg, trx); + + /* Set the IO buffer size in pages. */ + + err = fil_tablespace_iterate( + table, IO_BUFFER_SIZE(cfg.m_page_size), converter); + + DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure", + err = DB_TOO_MANY_CONCURRENT_TRXS;); + + if (err != DB_SUCCESS) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_INTERNAL_ERROR, + "Cannot reset LSNs in table '%s' : %s", + table_name, ut_strerr(err)); + + return(row_import_cleanup(prebuilt, trx, err)); + } + + row_mysql_lock_data_dictionary(trx); + + /* If the table is stored in a remote tablespace, we need to + determine that filepath from the link file and system tables. + Find the space ID in SYS_TABLES since this is an ALTER TABLE. */ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + ut_a(filepath); + + /* Open the tablespace so that we can access via the buffer pool. + We set the 2nd param (fix_dict = true) here because we already + have an x-lock on dict_operation_lock and dict_sys->mutex. 
*/ + + err = fil_open_single_table_tablespace( + true, true, table->space, + dict_tf_to_fsp_flags(table->flags), + table->name, filepath); + + DBUG_EXECUTE_IF("ib_import_open_tablespace_failure", + err = DB_TABLESPACE_NOT_FOUND;); + + if (err != DB_SUCCESS) { + row_mysql_unlock_data_dictionary(trx); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_FILE_NOT_FOUND, + filepath, err, ut_strerr(err)); + + mem_free(filepath); + + return(row_import_cleanup(prebuilt, trx, err)); + } + + row_mysql_unlock_data_dictionary(trx); + + mem_free(filepath); + + err = ibuf_check_bitmap_on_import(trx, table->space); + + DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_cleanup(prebuilt, trx, err)); + } + + /* The first index must always be the clustered index. */ + + dict_index_t* index = dict_table_get_first_index(table); + + if (!dict_index_is_clust(index)) { + return(row_import_error(prebuilt, trx, DB_CORRUPTION)); + } + + /* Update the Btree segment headers for index node and + leaf nodes in the root page. Set the new space id. */ + + err = btr_root_adjust_on_import(index); + + DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } else if (cfg.requires_purge(index->name)) { + + /* Purge any delete-marked records that couldn't be + purged during the page conversion phase from the + cluster index. */ + + IndexPurge purge(trx, index); + + trx->op_info = "cluster: purging delete marked records"; + + err = purge.garbage_collect(); + + trx->op_info = ""; + } + + DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* For secondary indexes, purge any records that couldn't be purged + during the page conversion phase. */ + + err = row_import_adjust_root_pages_of_secondary_indexes( + prebuilt, trx, table, cfg); + + DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure", + err = DB_CORRUPTION;); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* Ensure that the next available DB_ROW_ID is not smaller than + any DB_ROW_ID stored in the table. */ + + if (prebuilt->clust_index_was_generated) { + + err = row_import_set_sys_max_row_id(prebuilt, table); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush changes to disk"); + + /* Ensure that all pages dirtied during the IMPORT make it to disk. + The only dirty pages generated should be from the pessimistic purge + of delete marked records that couldn't be purged in Phase I. */ + + buf_LRU_flush_or_remove_pages( + prebuilt->table->space, BUF_REMOVE_FLUSH_WRITE, trx); + + if (trx_is_interrupted(trx)) { + ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush interrupted"); + return(row_import_error(prebuilt, trx, DB_INTERRUPTED)); + } else { + ib_logf(IB_LOG_LEVEL_INFO, "Phase IV - Flush complete"); + } + + /* The dictionary latches will be released in in row_import_cleanup() + after the transaction commit, for both success and error. */ + + row_mysql_lock_data_dictionary(trx); + + /* Update the root pages of the table's indexes. 
*/ + err = row_import_update_index_root(trx, table, false, true); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + /* Update the table's discarded flag, unset it. */ + err = row_import_update_discarded_flag(trx, table->id, false, true); + + if (err != DB_SUCCESS) { + return(row_import_error(prebuilt, trx, err)); + } + + table->ibd_file_missing = false; + table->flags2 &= ~DICT_TF2_DISCARDED; + + if (autoinc != 0) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, "%s autoinc value set to " IB_ID_FMT, + table_name, autoinc); + + dict_table_autoinc_lock(table); + dict_table_autoinc_initialize(table, autoinc); + dict_table_autoinc_unlock(table); + } + + ut_a(err == DB_SUCCESS); + + return(row_import_cleanup(prebuilt, trx, err)); +} + diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index e8d15fb539c..c1c27152831 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -23,11 +23,8 @@ Insert into a table Created 4/20/1996 Heikki Tuuri *******************************************************/ -#include "m_string.h" /* for my_sys.h */ #include "row0ins.h" -#define DEBUG_SYNC_C_IF_THD(A,B) DEBUG_SYNC(A,B) - #ifdef UNIV_NONINL #include "row0ins.ic" #endif @@ -35,6 +32,7 @@ Created 4/20/1996 Heikki Tuuri #include "ha_prototypes.h" #include "dict0dict.h" #include "dict0boot.h" +#include "trx0rec.h" #include "trx0undo.h" #include "btr0btr.h" #include "btr0cur.h" @@ -43,6 +41,7 @@ Created 4/20/1996 Heikki Tuuri #include "row0upd.h" #include "row0sel.h" #include "row0row.h" +#include "row0log.h" #include "rem0cmp.h" #include "lock0lock.h" #include "log0log.h" @@ -52,6 +51,7 @@ Created 4/20/1996 Heikki Tuuri #include "buf0lru.h" #include "fts0fts.h" #include "fts0types.h" +#include "m_string.h" /************************************************************************* IMPORTANT NOTE: Any operation that generates redo MUST check that there @@ -101,7 +101,7 @@ ins_node_create( /***********************************************************//** Creates an entry template for each index of a table. */ -UNIV_INTERN +static void ins_node_create_entry_list( /*=======================*/ @@ -222,68 +222,92 @@ Does an insert operation by updating a delete-marked existing record in the index. This situation can occur if the delete-marked record is kept in the index for consistent reads. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_sec_index_entry_by_modify( /*==============================*/ + ulint flags, /*!< in: undo logging and locking flags */ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, depending on whether mtr holds just a leaf latch or also a tree latch */ btr_cur_t* cursor, /*!< in: B-tree cursor */ + ulint** offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ const dtuple_t* entry, /*!< in: index entry to insert */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { big_rec_t* dummy_big_rec; - mem_heap_t* heap; upd_t* update; rec_t* rec; - ulint err; + dberr_t err; rec = btr_cur_get_rec(cursor); ut_ad(!dict_index_is_clust(cursor->index)); - ut_ad(rec_get_deleted_flag(rec, - dict_table_is_comp(cursor->index->table))); + ut_ad(rec_offs_validate(rec, cursor->index, *offsets)); + ut_ad(!entry->info_bits); /* We know that in the alphabetical ordering, entry and rec are identified. But in their binary form there may be differences if there are char fields in them. Therefore we have to calculate the difference. */ - heap = mem_heap_create(1024); - update = row_upd_build_sec_rec_difference_binary( - cursor->index, entry, rec, thr_get_trx(thr), heap); + rec, cursor->index, *offsets, entry, heap); + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* We should never insert in place of a record that + has not been delete-marked. The only exception is when + online CREATE INDEX copied the changes that we already + made to the clustered index, and completed the + secondary index creation before we got here. In this + case, the change would already be there. The CREATE + INDEX should be waiting for a MySQL meta-data lock + upgrade at least until this INSERT or UPDATE + returns. After that point, the TEMP_INDEX_PREFIX + would be dropped from the index name in + commit_inplace_alter_table(). */ + ut_a(update->n_fields == 0); + ut_a(*cursor->index->name == TEMP_INDEX_PREFIX); + ut_ad(!dict_index_is_online_ddl(cursor->index)); + return(DB_SUCCESS); + } + if (mode == BTR_MODIFY_LEAF) { /* Try an optimistic updating of the record, keeping changes within the page */ - err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor, - update, 0, thr, mtr); + /* TODO: pass only *offsets */ + err = btr_cur_optimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); switch (err) { case DB_OVERFLOW: case DB_UNDERFLOW: case DB_ZIP_OVERFLOW: err = DB_FAIL; + default: + break; } } else { ut_a(mode == BTR_MODIFY_TREE); if (buf_LRU_buf_pool_running_out()) { - err = DB_LOCK_TABLE_FULL; - - goto func_exit; + return(DB_LOCK_TABLE_FULL); } - err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor, - &heap, &dummy_big_rec, update, - 0, thr, mtr); + err = btr_cur_pessimistic_update( + flags | BTR_KEEP_SYS_FLAG, cursor, + offsets, &offsets_heap, + heap, &dummy_big_rec, update, 0, + thr, thr_get_trx(thr)->id, mtr); ut_ad(!dummy_big_rec); } -func_exit: - mem_heap_free(heap); return(err); } @@ -293,15 +317,20 @@ Does an insert operation by delete unmarking and updating a delete marked existing record in the index. This situation can occur if the delete marked record is kept in the index for consistent reads. 
@return DB_SUCCESS, DB_FAIL, or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_clust_index_entry_by_modify( /*================================*/ + ulint flags, /*!< in: undo logging and locking flags */ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, depending on whether mtr holds just a leaf latch or also a tree latch */ btr_cur_t* cursor, /*!< in: B-tree cursor */ - mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap that can + be emptied, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ big_rec_t** big_rec,/*!< out: possible big rec vector of fields which have to be stored externally by the caller */ @@ -310,9 +339,9 @@ row_ins_clust_index_entry_by_modify( mtr_t* mtr) /*!< in: mtr; must be committed before latching any further pages */ { - rec_t* rec; - upd_t* update; - ulint err; + const rec_t* rec; + const upd_t* update; + dberr_t err; ut_ad(dict_index_is_clust(cursor->index)); @@ -323,38 +352,40 @@ row_ins_clust_index_entry_by_modify( ut_ad(rec_get_deleted_flag(rec, dict_table_is_comp(cursor->index->table))); - if (!*heap) { - *heap = mem_heap_create(1024); - } - /* Build an update vector containing all the fields to be modified; NOTE that this vector may NOT contain system columns trx_id or roll_ptr */ - update = row_upd_build_difference_binary(cursor->index, entry, rec, - thr_get_trx(thr), *heap); - if (mode == BTR_MODIFY_LEAF) { + update = row_upd_build_difference_binary( + cursor->index, entry, rec, NULL, true, + thr_get_trx(thr), heap); + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); + /* Try optimistic updating of the record, keeping changes within the page */ - err = btr_cur_optimistic_update(0, cursor, update, 0, thr, - mtr); + err = btr_cur_optimistic_update( + flags, cursor, offsets, offsets_heap, update, 0, thr, + thr_get_trx(thr)->id, mtr); switch (err) { case DB_OVERFLOW: case DB_UNDERFLOW: case DB_ZIP_OVERFLOW: err = DB_FAIL; + default: + break; } } else { - ut_a(mode == BTR_MODIFY_TREE); if (buf_LRU_buf_pool_running_out()) { return(DB_LOCK_TABLE_FULL); } err = btr_cur_pessimistic_update( - BTR_KEEP_POS_FLAG, cursor, heap, big_rec, update, - 0, thr, mtr); + flags | BTR_KEEP_POS_FLAG, + cursor, offsets, offsets_heap, heap, + big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr); } return(err); @@ -394,7 +425,7 @@ row_ins_cascade_ancestor_updates_table( Returns the number of ancestor UPDATE or DELETE nodes of a cascaded update/delete node. @return number of ancestors */ -static +static __attribute__((nonnull, warn_unused_result)) ulint row_ins_cascade_n_ancestors( /*========================*/ @@ -420,7 +451,7 @@ a cascaded update. 
can also be 0 if no foreign key fields changed; the returned value is ULINT_UNDEFINED if the column type in the child table is too short to fit the new value in the parent table: that means the update fails */ -static +static __attribute__((nonnull, warn_unused_result)) ulint row_ins_cascade_calc_update_vec( /*============================*/ @@ -691,6 +722,8 @@ row_ins_set_detailed( trx_t* trx, /*!< in: transaction */ dict_foreign_t* foreign) /*!< in: foreign key constraint */ { + ut_ad(!srv_read_only_mode); + mutex_enter(&srv_misc_tmpfile_mutex); rewind(srv_misc_tmpfile); @@ -717,13 +750,17 @@ row_ins_foreign_trx_print( /*======================*/ trx_t* trx) /*!< in: transaction */ { - ulint n_lock_rec; - ulint n_lock_struct; + ulint n_rec_locks; + ulint n_trx_locks; ulint heap_size; + if (srv_read_only_mode) { + return; + } + lock_mutex_enter(); - n_lock_rec = lock_number_of_rows_locked(&trx->lock); - n_lock_struct = UT_LIST_GET_LEN(trx->lock.trx_locks); + n_rec_locks = lock_number_of_rows_locked(&trx->lock); + n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks); heap_size = mem_heap_get_size(trx->lock.lock_heap); lock_mutex_exit(); @@ -735,7 +772,7 @@ row_ins_foreign_trx_print( fputs(" Transaction:\n", dict_foreign_err_file); trx_print_low(dict_foreign_err_file, trx, 600, - n_lock_rec, n_lock_struct, heap_size); + n_rec_locks, n_trx_locks, heap_size); mutex_exit(&trx_sys->mutex); @@ -759,6 +796,10 @@ row_ins_foreign_report_err( const dtuple_t* entry) /*!< in: index entry in the parent table */ { + if (srv_read_only_mode) { + return; + } + FILE* ef = dict_foreign_err_file; trx_t* trx = thr_get_trx(thr); @@ -810,6 +851,10 @@ row_ins_foreign_report_add_err( const dtuple_t* entry) /*!< in: index entry to insert in the child table */ { + if (srv_read_only_mode) { + return; + } + FILE* ef = dict_foreign_err_file; row_ins_set_detailed(trx, foreign); @@ -879,8 +924,8 @@ Perform referential actions or checks when a parent row is deleted or updated and the constraint had an ON DELETE or ON UPDATE condition which was not RESTRICT. @return DB_SUCCESS, DB_LOCK_WAIT, or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_foreign_check_on_constraint( /*================================*/ que_thr_t* thr, /*!< in: query thread whose run_node @@ -906,7 +951,7 @@ row_ins_foreign_check_on_constraint( const buf_block_t* clust_block; upd_t* update; ulint n_to_update; - ulint err; + dberr_t err; ulint i; trx_t* trx; mem_heap_t* tmp_heap = NULL; @@ -1242,6 +1287,9 @@ row_ins_foreign_check_on_constraint( release the latch. */ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr)); + + DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze"); + row_mysql_freeze_data_dictionary(thr_get_trx(thr)); mtr_start(mtr); @@ -1284,7 +1332,7 @@ Sets a shared lock on a record. Used in locking possible duplicate key records and also in checking foreign key constraints. @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ static -enum db_err +dberr_t row_ins_set_shared_rec_lock( /*========================*/ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or @@ -1295,7 +1343,7 @@ row_ins_set_shared_rec_lock( const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ que_thr_t* thr) /*!< in: query thread */ { - enum db_err err; + dberr_t err; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -1315,7 +1363,7 @@ Sets a exclusive lock on a record. 
Used in locking possible duplicate key records @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ static -enum db_err +dberr_t row_ins_set_exclusive_rec_lock( /*===========================*/ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or @@ -1326,7 +1374,7 @@ row_ins_set_exclusive_rec_lock( const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ que_thr_t* thr) /*!< in: query thread */ { - enum db_err err; + dberr_t err; ut_ad(rec_offs_validate(rec, index, offsets)); @@ -1347,7 +1395,7 @@ which lock either the success or the failure of the constraint. NOTE that the caller must have a shared latch on dict_operation_lock. @return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */ UNIV_INTERN -ulint +dberr_t row_ins_check_foreign_constraint( /*=============================*/ ibool check_ref,/*!< in: TRUE if we want to check that @@ -1361,7 +1409,7 @@ row_ins_check_foreign_constraint( dtuple_t* entry, /*!< in: index entry for index */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; upd_node_t* upd_node; dict_table_t* check_table; dict_index_t* check_index; @@ -1433,9 +1481,11 @@ run_again: check_index = foreign->foreign_index; } - if (check_table == NULL || check_table->ibd_file_missing + if (check_table == NULL + || check_table->ibd_file_missing || check_index == NULL) { - if (check_ref) { + + if (!srv_read_only_mode && check_ref) { FILE* ef = dict_foreign_err_file; row_ins_set_detailed(trx, foreign); @@ -1611,6 +1661,8 @@ run_again: } else { err = DB_SUCCESS; } + default: + break; } goto end_scan; @@ -1635,18 +1687,43 @@ end_scan: do_possible_lock_wait: if (err == DB_LOCK_WAIT) { - trx->error_state = static_cast<enum db_err>(err); + bool verified = false; + + trx->error_state = err; que_thr_stop_for_mysql(thr); lock_wait_suspend_thread(thr); - if (trx->error_state == DB_SUCCESS) { + if (check_table->to_be_dropped) { + /* The table is being dropped. We shall timeout + this operation */ + err = DB_LOCK_WAIT_TIMEOUT; + goto exit_func; + } - goto run_again; + /* We had temporarily released dict_operation_lock in + above lock sleep wait, now we have the lock again, and + we will need to re-check whether the foreign key has been + dropped */ + for (const dict_foreign_t* check_foreign = UT_LIST_GET_FIRST( + table->referenced_list); + check_foreign; + check_foreign = UT_LIST_GET_NEXT( + referenced_list, check_foreign)) { + if (check_foreign == foreign) { + verified = true; + break; + } } - err = trx->error_state; + if (!verified) { + err = DB_DICT_CHANGED; + } else if (trx->error_state == DB_SUCCESS) { + goto run_again; + } else { + err = trx->error_state; + } } exit_func: @@ -1663,8 +1740,8 @@ Otherwise does searches to the indexes of referenced tables and sets shared locks which lock either the success or the failure of a constraint. 
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_check_foreign_constraints( /*==============================*/ dict_table_t* table, /*!< in: table */ @@ -1673,7 +1750,7 @@ row_ins_check_foreign_constraints( que_thr_t* thr) /*!< in: query thread */ { dict_foreign_t* foreign; - ulint err; + dberr_t err; trx_t* trx; ibool got_s_lock = FALSE; @@ -1681,14 +1758,21 @@ row_ins_check_foreign_constraints( foreign = UT_LIST_GET_FIRST(table->foreign_list); + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "foreign_constraint_check_for_ins"); + while (foreign) { if (foreign->foreign_index == index) { dict_table_t* ref_table = NULL; + dict_table_t* foreign_table = foreign->foreign_table; + dict_table_t* referenced_table + = foreign->referenced_table; - if (foreign->referenced_table == NULL) { + if (referenced_table == NULL) { ref_table = dict_table_open_on_name( - foreign->referenced_table_name_lookup, FALSE); + foreign->referenced_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); } if (0 == trx->dict_operation_lock_mode) { @@ -1697,9 +1781,9 @@ row_ins_check_foreign_constraints( row_mysql_freeze_data_dictionary(trx); } - if (foreign->referenced_table) { + if (referenced_table) { os_inc_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } @@ -1711,9 +1795,12 @@ row_ins_check_foreign_constraints( err = row_ins_check_foreign_constraint( TRUE, foreign, table, entry, thr); - if (foreign->referenced_table) { + DBUG_EXECUTE_IF("row_ins_dict_change_err", + err = DB_DICT_CHANGED;); + + if (referenced_table) { os_dec_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } @@ -1722,7 +1809,7 @@ row_ins_check_foreign_constraints( } if (ref_table != NULL) { - dict_table_close(ref_table, FALSE); + dict_table_close(ref_table, FALSE, FALSE); } if (err != DB_SUCCESS) { @@ -1778,8 +1865,7 @@ row_ins_dupl_error_with_rec( if (!dict_index_is_clust(index)) { for (i = 0; i < n_unique; i++) { - if (UNIV_SQL_NULL == dfield_get_len( - dtuple_get_nth_field(entry, i))) { + if (dfield_is_null(dtuple_get_nth_field(entry, i))) { return(FALSE); } @@ -1794,26 +1880,30 @@ Scans a unique non-clustered index at a given index entry to determine whether a uniqueness violation has occurred for the key value of the entry. Set shared locks on possible duplicate records. 
@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_scan_sec_index_for_duplicate( /*=================================*/ + ulint flags, /*!< in: undo logging and locking flags */ dict_index_t* index, /*!< in: non-clustered unique index */ dtuple_t* entry, /*!< in: index entry */ - que_thr_t* thr) /*!< in: query thread */ + que_thr_t* thr, /*!< in: query thread */ + bool s_latch,/*!< in: whether index->lock is being held */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mem_heap_t* offsets_heap) + /*!< in/out: memory heap that can be emptied */ { ulint n_unique; - ulint i; int cmp; ulint n_fields_cmp; btr_pcur_t pcur; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ulint allow_duplicates; - mtr_t mtr; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - ulint* offsets = offsets_; - rec_offs_init(offsets_); + ulint* offsets = NULL; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(s_latch == rw_lock_own(&index->lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ n_unique = dict_index_get_n_unique(index); @@ -1821,7 +1911,7 @@ row_ins_scan_sec_index_for_duplicate( n_unique first fields is NULL, a unique key violation cannot occur, since we define NULL != NULL in this case */ - for (i = 0; i < n_unique; i++) { + for (ulint i = 0; i < n_unique; i++) { if (UNIV_SQL_NULL == dfield_get_len( dtuple_get_nth_field(entry, i))) { @@ -1829,15 +1919,17 @@ row_ins_scan_sec_index_for_duplicate( } } - mtr_start(&mtr); - /* Store old value on n_fields_cmp */ n_fields_cmp = dtuple_get_n_fields_cmp(entry); - dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index)); + dtuple_set_n_fields_cmp(entry, n_unique); - btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); + btr_pcur_open(index, entry, PAGE_CUR_GE, + s_latch + ? BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED + : BTR_SEARCH_LEAF, + &pcur, mtr); allow_duplicates = thr_get_trx(thr)->duplicates; @@ -1853,9 +1945,12 @@ row_ins_scan_sec_index_for_duplicate( } offsets = rec_get_offsets(rec, index, offsets, - ULINT_UNDEFINED, &heap); + ULINT_UNDEFINED, &offsets_heap); - if (allow_duplicates) { + if (flags & BTR_NO_LOCKING_FLAG) { + /* Set no locks when applying log + in online table rebuild. */ + } else if (allow_duplicates) { /* If the SQL-query will update or replace duplicate key we will take X-lock for @@ -1901,37 +1996,115 @@ row_ins_scan_sec_index_for_duplicate( ut_a(cmp < 0); goto end_scan; } - } while (btr_pcur_move_to_next(&pcur, &mtr)); + } while (btr_pcur_move_to_next(&pcur, mtr)); end_scan: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - mtr_commit(&mtr); - /* Restore old value */ dtuple_set_n_fields_cmp(entry, n_fields_cmp); return(err); } +/** Checks for a duplicate when the table is being rebuilt online. +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_online( +/*=====================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const rec_t* rec, /*!< in: clustered index record */ + ulint* offsets)/*!< in/out: rec_get_offsets(rec) */ +{ + ulint fields = 0; + ulint bytes = 0; + + /* During rebuild, there should not be any delete-marked rows + in the new table. 
*/ + ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets))); + ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq); + + /* Compare the PRIMARY KEY fields and the + DB_TRX_ID, DB_ROLL_PTR. */ + cmp_dtuple_rec_with_match_low( + entry, rec, offsets, n_uniq + 2, &fields, &bytes); + + if (fields < n_uniq) { + /* Not a duplicate. */ + return(DB_SUCCESS); + } + + if (fields == n_uniq + 2) { + /* rec is an exact match of entry. */ + ut_ad(bytes == 0); + return(DB_SUCCESS_LOCKED_REC); + } + + return(DB_DUPLICATE_KEY); +} + +/** Checks for a duplicate when the table is being rebuilt online. +@retval DB_SUCCESS when no duplicate is detected +@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or +a newer version of entry (the entry should not be inserted) +@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_ins_duplicate_error_in_clust_online( +/*====================================*/ + ulint n_uniq, /*!< in: offset of DB_TRX_ID */ + const dtuple_t* entry, /*!< in: entry that is being inserted */ + const btr_cur_t*cursor, /*!< in: cursor on insert position */ + ulint** offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + dberr_t err = DB_SUCCESS; + const rec_t* rec = btr_cur_get_rec(cursor); + + if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); + if (err != DB_SUCCESS) { + return(err); + } + } + + rec = page_rec_get_next_const(btr_cur_get_rec(cursor)); + + if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) { + *offsets = rec_get_offsets(rec, cursor->index, *offsets, + ULINT_UNDEFINED, heap); + err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets); + } + + return(err); +} + /***************************************************************//** Checks if a unique key violation error would occur at an index entry insert. Sets shared locks on possible duplicate records. Works only for a clustered index! 
-@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error, -DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate -record */ -static -ulint +@retval DB_SUCCESS if no error +@retval DB_DUPLICATE_KEY if error, +@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate +record +@retval DB_SUCCESS_LOCKED_REC if an exact match of the record was found +in online table rebuild (flags & (BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG)) */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_duplicate_error_in_clust( /*=============================*/ + ulint flags, /*!< in: undo logging and locking flags */ btr_cur_t* cursor, /*!< in: B-tree cursor */ const dtuple_t* entry, /*!< in: entry to insert */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; + dberr_t err; rec_t* rec; ulint n_unique; trx_t* trx = thr_get_trx(thr); @@ -1942,8 +2115,7 @@ row_ins_duplicate_error_in_clust( UT_NOT_USED(mtr); - ut_a(dict_index_is_clust(cursor->index)); - ut_ad(dict_index_is_unique(cursor->index)); + ut_ad(dict_index_is_clust(cursor->index)); /* NOTE: For unique non-clustered indexes there may be any number of delete marked records with the same value for the non-clustered @@ -2002,6 +2174,7 @@ row_ins_duplicate_error_in_clust( if (row_ins_dupl_error_with_rec( rec, entry, cursor->index, offsets)) { +duplicate: trx->error_info = cursor->index; err = DB_DUPLICATE_KEY; goto func_exit; @@ -2046,14 +2219,12 @@ row_ins_duplicate_error_in_clust( if (row_ins_dupl_error_with_rec( rec, entry, cursor->index, offsets)) { - trx->error_info = cursor->index; - err = DB_DUPLICATE_KEY; - goto func_exit; + goto duplicate; } } - ut_a(!dict_index_is_clust(cursor->index)); /* This should never happen */ + ut_error; } err = DB_SUCCESS; @@ -2081,12 +2252,12 @@ row_ins_must_modify_rec( /*====================*/ const btr_cur_t* cursor) /*!< in: B-tree cursor */ { - /* NOTE: (compare to the note in row_ins_duplicate_error) Because node - pointers on upper levels of the B-tree may match more to entry than - to actual user records on the leaf level, we have to check if the - candidate record is actually a user record. In a clustered index - node pointers contain index->n_unique first fields, and in the case - of a secondary index, all fields of the index. */ + /* NOTE: (compare to the note in row_ins_duplicate_error_in_clust) + Because node pointers on upper levels of the B-tree may match more + to entry than to actual user records on the leaf level, we + have to check if the candidate record is actually a user record. + A clustered index node pointer contains index->n_unique first fields, + and a secondary index node pointer contains all index fields. */ return(cursor->low_match >= dict_index_get_n_unique_in_tree(cursor->index) @@ -2094,56 +2265,359 @@ row_ins_must_modify_rec( } /***************************************************************//** -Tries to insert an index entry to an index. If the index is clustered -and a record with the same unique key is found, the other record is -necessarily marked deleted by a committed transaction, or a unique key -violation error occurs. The delete marked record is then updated to an -existing record, and we must write an undo log record on the delete -marked record. If the index is secondary, and a record with exactly the -same fields is found, the other record is necessarily marked deleted. -It is then unmarked. Otherwise, the entry is just inserted to the index. 
-@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed, -or error code */ -static -ulint -row_ins_index_entry_low( -/*====================*/ +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, depending on whether we wish optimistic or pessimistic descent down the index tree */ - dict_index_t* index, /*!< in: index */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ dtuple_t* entry, /*!< in/out: index entry to insert */ ulint n_ext, /*!< in: number of externally stored columns */ que_thr_t* thr) /*!< in: query thread */ { btr_cur_t cursor; - ulint search_mode; - ibool modify = FALSE; - rec_t* insert_rec; - rec_t* rec; - ulint* offsets; - ulint err; - ulint n_unique; - big_rec_t* big_rec = NULL; + ulint* offsets = NULL; + dberr_t err; + big_rec_t* big_rec = NULL; mtr_t mtr; - mem_heap_t* heap = NULL; + mem_heap_t* offsets_heap = NULL; - log_free_check(); + ut_ad(dict_index_is_clust(index)); + ut_ad(!dict_index_is_unique(index) + || n_uniq == dict_index_get_n_unique(index)); + ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index)); mtr_start(&mtr); + if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } + cursor.thr = thr; /* Note that we use PAGE_CUR_LE as the search mode, because then the function will return in both low_match and up_match of the cursor sensible values */ - if (dict_index_is_clust(index)) { - search_mode = mode; - } else if (!(thr_get_trx(thr)->check_unique_secondary)) { - search_mode = mode | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE; + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, mode, + &cursor, 0, __FILE__, __LINE__, &mtr); + +#ifdef UNIV_DEBUG + { + page_t* page = btr_cur_get_page(&cursor); + rec_t* first_rec = page_rec_get_next( + page_get_infimum_rec(page)); + + ut_ad(page_rec_is_supremum(first_rec) + || rec_get_n_fields(first_rec, index) + == dtuple_get_n_fields(entry)); + } +#endif + + if (n_uniq && (cursor.up_match >= n_uniq + || cursor.low_match >= n_uniq)) { + + if (flags + == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) { + /* Set no locks when applying log + in online table rebuild. Only check for duplicates. 
*/ + err = row_ins_duplicate_error_in_clust_online( + n_uniq, entry, &cursor, + &offsets, &offsets_heap); + + switch (err) { + case DB_SUCCESS: + break; + default: + ut_ad(0); + /* fall through */ + case DB_SUCCESS_LOCKED_REC: + case DB_DUPLICATE_KEY: + thr_get_trx(thr)->error_info = cursor.index; + } + } else { + /* Note that the following may return also + DB_LOCK_WAIT */ + + err = row_ins_duplicate_error_in_clust( + flags, &cursor, entry, thr, &mtr); + } + + if (err != DB_SUCCESS) { +err_exit: + mtr_commit(&mtr); + goto func_exit; + } + } + + if (row_ins_must_modify_rec(&cursor)) { + /* There is already an index entry with a long enough common + prefix, we must convert the insert into a modify of an + existing record */ + mem_heap_t* entry_heap = mem_heap_create(1024); + + err = row_ins_clust_index_entry_by_modify( + flags, mode, &cursor, &offsets, &offsets_heap, + entry_heap, &big_rec, entry, thr, &mtr); + + rec_t* rec = btr_cur_get_rec(&cursor); + + if (big_rec) { + ut_a(err == DB_SUCCESS); + /* Write out the externally stored + columns while still x-latching + index->lock and block->lock. Allocate + pages for big_rec in the mtr that + modified the B-tree, but be sure to skip + any pages that were freed in mtr. We will + write out the big_rec pages before + committing the B-tree mini-transaction. If + the system crashes so that crash recovery + will not replay the mtr_commit(&mtr), the + big_rec pages will be left orphaned until + the pages are allocated for something else. + + TODO: If the allocation extends the + tablespace, it will not be redo + logged, in either mini-transaction. + Tablespace extension should be + redo-logged in the big_rec + mini-transaction, so that recovery + will not fail when the big_rec was + written to the extended portion of the + file, in case the file was somehow + truncated in the crash. */ + + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "before_row_ins_upd_extern"); + err = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr, + BTR_STORE_INSERT_UPDATE); + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "after_row_ins_upd_extern"); + /* If writing big_rec fails (for + example, because of DB_OUT_OF_FILE_SPACE), + the record will be corrupted. Even if + we did not update any externally + stored columns, our update could cause + the record to grow so that a + non-updated column was selected for + external storage. This non-update + would not have been written to the + undo log, and thus the record cannot + be rolled back. + + However, because we have not executed + mtr_commit(mtr) yet, the update will + not be replayed in crash recovery, and + the following assertion failure will + effectively "roll back" the operation. 
*/ + ut_a(err == DB_SUCCESS); + dtuple_big_rec_free(big_rec); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_insert(rec, index, offsets); + } + + mtr_commit(&mtr); + mem_heap_free(entry_heap); } else { - search_mode = mode | BTR_INSERT; + rec_t* insert_rec; + + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) + == BTR_MODIFY_LEAF); + err = btr_cur_optimistic_insert( + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } else { + if (buf_LRU_buf_pool_running_out()) { + + err = DB_LOCK_TABLE_FULL; + goto err_exit; + } + + err = btr_cur_optimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + n_ext, thr, &mtr); + } + } + + if (UNIV_LIKELY_NULL(big_rec)) { + mtr_commit(&mtr); + + /* Online table rebuild could read (and + ignore) the incomplete record at this point. + If online rebuild is in progress, the + row_ins_index_entry_big_rec() will write log. */ + + DBUG_EXECUTE_IF( + "row_ins_extern_checkpoint", + log_make_checkpoint_at( + IB_ULONGLONG_MAX, TRUE);); + err = row_ins_index_entry_big_rec( + entry, big_rec, offsets, &offsets_heap, index, + thr_get_trx(thr)->mysql_thd, + __FILE__, __LINE__); + dtuple_convert_back_big_rec(index, entry, big_rec); + } else { + if (err == DB_SUCCESS + && dict_index_is_online_ddl(index)) { + row_log_table_insert( + insert_rec, index, offsets); + } + + mtr_commit(&mtr); + } + } + +func_exit: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + return(err); +} + +/***************************************************************//** +Starts a mini-transaction and checks if the index will be dropped. +@return true if the index is to be dropped */ +static __attribute__((nonnull, warn_unused_result)) +bool +row_ins_sec_mtr_start_and_check_if_aborted( +/*=======================================*/ + mtr_t* mtr, /*!< out: mini-transaction */ + dict_index_t* index, /*!< in/out: secondary index */ + bool check, /*!< in: whether to check */ + ulint search_mode) + /*!< in: flags */ +{ + ut_ad(!dict_index_is_clust(index)); + + mtr_start(mtr); + + if (!check) { + return(false); + } + + if (search_mode & BTR_ALREADY_S_LATCHED) { + mtr_s_lock(dict_index_get_lock(index), mtr); + } else { + mtr_x_lock(dict_index_get_lock(index), mtr); + } + + switch (index->online_status) { + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + ut_ad(*index->name == TEMP_INDEX_PREFIX); + return(true); + case ONLINE_INDEX_COMPLETE: + return(false); + case ONLINE_INDEX_CREATION: + break; + } + + ut_error; + return(true); +} + +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. 
+@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +UNIV_INTERN +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ +{ + btr_cur_t cursor; + ulint search_mode = mode | BTR_INSERT; + dberr_t err = DB_SUCCESS; + ulint n_unique; + mtr_t mtr; + ulint* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); + ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE); + + cursor.thr = thr; + ut_ad(thr_get_trx(thr)->id); + mtr_start(&mtr); + + /* Ensure that we acquire index->lock when inserting into an + index with index->online_status == ONLINE_INDEX_COMPLETE, but + could still be subject to rollback_inplace_alter_table(). + This prevents a concurrent change of index->online_status. + The memory object cannot be freed as long as we have an open + reference to the table, or index->table->n_ref_count > 0. */ + const bool check = *index->name == TEMP_INDEX_PREFIX; + if (check) { + DEBUG_SYNC_C("row_ins_sec_index_enter"); + if (mode == BTR_MODIFY_LEAF) { + search_mode |= BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try( + index, entry, thr_get_trx(thr)->id)) { + goto func_exit; + } + } + + /* Note that we use PAGE_CUR_LE as the search mode, because then + the function will return in both low_match and up_match of the + cursor sensible values */ + + if (!thr_get_trx(thr)->check_unique_secondary) { + search_mode |= BTR_IGNORE_SEC_UNIQUE; } btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, @@ -2151,13 +2625,8 @@ row_ins_index_entry_low( &cursor, 0, __FILE__, __LINE__, &mtr); if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) { - /* The insertion was made to the insert buffer already during - the search: we are done */ - - ut_ad(search_mode & BTR_INSERT); - err = DB_SUCCESS; - - goto function_exit; + /* The insert was buffered during the search: we are done */ + goto func_exit; } #ifdef UNIV_DEBUG @@ -2174,213 +2643,250 @@ row_ins_index_entry_low( n_unique = dict_index_get_n_unique(index); - if (dict_index_is_unique(index) && (cursor.up_match >= n_unique - || cursor.low_match >= n_unique)) { + if (dict_index_is_unique(index) + && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) { + mtr_commit(&mtr); + + DEBUG_SYNC_C("row_ins_sec_index_unique"); - if (dict_index_is_clust(index)) { - /* Note that the following may return also - DB_LOCK_WAIT */ + if (row_ins_sec_mtr_start_and_check_if_aborted( + &mtr, index, check, search_mode)) { + goto func_exit; + } - err = row_ins_duplicate_error_in_clust( - &cursor, entry, thr, &mtr); - if (err != DB_SUCCESS) { + err = row_ins_scan_sec_index_for_duplicate( + flags, index, entry, thr, check, &mtr, offsets_heap); - goto function_exit; - } - } else { - mtr_commit(&mtr); - err = row_ins_scan_sec_index_for_duplicate( - index, entry, thr); - 
mtr_start(&mtr); + mtr_commit(&mtr); - if (err != DB_SUCCESS) { - goto function_exit; + switch (err) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + if (*index->name == TEMP_INDEX_PREFIX) { + ut_ad(!thr_get_trx(thr) + ->dict_operation_lock_mode); + mutex_enter(&dict_sys->mutex); + dict_set_corrupted_index_cache_only( + index, index->table); + mutex_exit(&dict_sys->mutex); + /* Do not return any error to the + caller. The duplicate will be reported + by ALTER TABLE or CREATE UNIQUE INDEX. + Unfortunately we cannot report the + duplicate key value to the DDL thread, + because the altered_table object is + private to its call stack. */ + err = DB_SUCCESS; } + /* fall through */ + default: + return(err); + } - /* We did not find a duplicate and we have now - locked with s-locks the necessary records to - prevent any insertion of a duplicate by another - transaction. Let us now reposition the cursor and - continue the insertion. */ - - btr_cur_search_to_nth_level(index, 0, entry, - PAGE_CUR_LE, - mode | BTR_INSERT, - &cursor, 0, - __FILE__, __LINE__, &mtr); + if (row_ins_sec_mtr_start_and_check_if_aborted( + &mtr, index, check, search_mode)) { + goto func_exit; } - } - modify = row_ins_must_modify_rec(&cursor); + /* We did not find a duplicate and we have now + locked with s-locks the necessary records to + prevent any insertion of a duplicate by another + transaction. Let us now reposition the cursor and + continue the insertion. */ - if (modify) { + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE), + &cursor, 0, __FILE__, __LINE__, &mtr); + } + + if (row_ins_must_modify_rec(&cursor)) { /* There is already an index entry with a long enough common prefix, we must convert the insert into a modify of an existing record */ + offsets = rec_get_offsets( + btr_cur_get_rec(&cursor), index, offsets, + ULINT_UNDEFINED, &offsets_heap); - if (dict_index_is_clust(index)) { - err = row_ins_clust_index_entry_by_modify( - mode, &cursor, &heap, &big_rec, entry, - thr, &mtr); - - if (big_rec) { - ut_a(err == DB_SUCCESS); - /* Write out the externally stored - columns while still x-latching - index->lock and block->lock. Allocate - pages for big_rec in the mtr that - modified the B-tree, but be sure to skip - any pages that were freed in mtr. We will - write out the big_rec pages before - committing the B-tree mini-transaction. If - the system crashes so that crash recovery - will not replay the mtr_commit(&mtr), the - big_rec pages will be left orphaned until - the pages are allocated for something else. - - TODO: If the allocation extends the - tablespace, it will not be redo - logged, in either mini-transaction. - Tablespace extension should be - redo-logged in the big_rec - mini-transaction, so that recovery - will not fail when the big_rec was - written to the extended portion of the - file, in case the file was somehow - truncated in the crash. */ - - rec = btr_cur_get_rec(&cursor); - offsets = rec_get_offsets( - rec, index, NULL, - ULINT_UNDEFINED, &heap); - - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "before_row_ins_upd_extern"); - err = btr_store_big_rec_extern_fields( - index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr, - BTR_STORE_INSERT_UPDATE); - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "after_row_ins_upd_extern"); - /* If writing big_rec fails (for - example, because of DB_OUT_OF_FILE_SPACE), - the record will be corrupted. 
Even if - we did not update any externally - stored columns, our update could cause - the record to grow so that a - non-updated column was selected for - external storage. This non-update - would not have been written to the - undo log, and thus the record cannot - be rolled back. - - However, because we have not executed - mtr_commit(mtr) yet, the update will - not be replayed in crash recovery, and - the following assertion failure will - effectively "roll back" the operation. */ - ut_a(err == DB_SUCCESS); - goto stored_big_rec; - } - } else { - ut_ad(!n_ext); - err = row_ins_sec_index_entry_by_modify( - mode, &cursor, entry, thr, &mtr); - } + err = row_ins_sec_index_entry_by_modify( + flags, mode, &cursor, &offsets, + offsets_heap, heap, entry, thr, &mtr); } else { + rec_t* insert_rec; + big_rec_t* big_rec; + if (mode == BTR_MODIFY_LEAF) { err = btr_cur_optimistic_insert( - 0, &cursor, entry, &insert_rec, &big_rec, - n_ext, thr, &mtr); + flags, &cursor, &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); } else { - ut_a(mode == BTR_MODIFY_TREE); + ut_ad(mode == BTR_MODIFY_TREE); if (buf_LRU_buf_pool_running_out()) { err = DB_LOCK_TABLE_FULL; - - goto function_exit; + goto func_exit; } err = btr_cur_optimistic_insert( - 0, &cursor, entry, &insert_rec, &big_rec, - n_ext, thr, &mtr); - + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); if (err == DB_FAIL) { err = btr_cur_pessimistic_insert( - 0, &cursor, entry, &insert_rec, - &big_rec, n_ext, thr, &mtr); + flags, &cursor, + &offsets, &offsets_heap, + entry, &insert_rec, + &big_rec, 0, thr, &mtr); } } + + if (err == DB_SUCCESS && trx_id) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + + ut_ad(!big_rec); } -function_exit: +func_exit: mtr_commit(&mtr); + return(err); +} - if (UNIV_LIKELY_NULL(big_rec)) { - DBUG_EXECUTE_IF( - "row_ins_extern_checkpoint", - log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);); - - mtr_start(&mtr); - - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "before_row_ins_extern_latch"); - btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, - BTR_MODIFY_TREE, &cursor, 0, - __FILE__, __LINE__, &mtr); - rec = btr_cur_get_rec(&cursor); - offsets = rec_get_offsets(rec, index, NULL, - ULINT_UNDEFINED, &heap); - - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "before_row_ins_extern"); - err = btr_store_big_rec_extern_fields( - index, btr_cur_get_block(&cursor), - rec, offsets, big_rec, &mtr, BTR_STORE_INSERT); - DEBUG_SYNC_C_IF_THD((THD*) - thr_get_trx(thr)->mysql_thd, - "after_row_ins_extern"); - -stored_big_rec: - if (modify) { - dtuple_big_rec_free(big_rec); - } else { - dtuple_convert_back_big_rec(index, entry, big_rec); +/***************************************************************//** +Tries to insert the externally stored fields (off-page columns) +of a clustered index entry. 
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +UNIV_INTERN +dberr_t +row_ins_index_entry_big_rec_func( +/*=============================*/ + const dtuple_t* entry, /*!< in/out: index entry to insert */ + const big_rec_t* big_rec,/*!< in: externally stored fields */ + ulint* offsets,/*!< in/out: rec offsets */ + mem_heap_t** heap, /*!< in/out: memory heap */ + dict_index_t* index, /*!< in: index */ + const char* file, /*!< in: file name of caller */ +#ifndef DBUG_OFF + const void* thd, /*!< in: connection, or NULL */ +#endif /* DBUG_OFF */ + ulint line) /*!< in: line number of caller */ +{ + mtr_t mtr; + btr_cur_t cursor; + rec_t* rec; + dberr_t error; + + ut_ad(dict_index_is_clust(index)); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch"); + + mtr_start(&mtr); + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + file, line, &mtr); + rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, heap); + + DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern"); + error = btr_store_big_rec_extern_fields( + index, btr_cur_get_block(&cursor), + rec, offsets, big_rec, &mtr, BTR_STORE_INSERT); + DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern"); + + if (error == DB_SUCCESS + && dict_index_is_online_ddl(index)) { + row_log_table_insert(rec, index, offsets); + } + + mtr_commit(&mtr); + + return(error); +} + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +UNIV_INTERN +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + dberr_t err; + ulint n_uniq; + + if (UT_LIST_GET_FIRST(index->table->foreign_list)) { + err = row_ins_check_foreign_constraints( + index->table, index, entry, thr); + if (err != DB_SUCCESS) { + + return(err); } + } - mtr_commit(&mtr); + n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0; + + /* Try first optimistic descent to the B-tree */ + + log_free_check(); + + err = row_ins_clust_index_entry_low( + 0, BTR_MODIFY_LEAF, index, n_uniq, entry, n_ext, thr); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_clust_index_entry_leaf"); } +#endif /* UNIV_DEBUG */ - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + if (err != DB_FAIL) { + DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after"); + return(err); } - return(err); + + /* Try then pessimistic descent to the B-tree */ + + log_free_check(); + + return(row_ins_clust_index_entry_low( + 0, BTR_MODIFY_TREE, index, n_uniq, entry, n_ext, thr)); } /***************************************************************//** -Inserts an index entry to index. Tries first optimistic, then pessimistic -descent down the tree. If the entry matches enough to a delete marked record, -performs the insert by updating or delete unmarking the delete marked -record. +Inserts an entry into a secondary index. 
Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. @return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ UNIV_INTERN -ulint -row_ins_index_entry( -/*================*/ - dict_index_t* index, /*!< in: index */ +dberr_t +row_ins_sec_index_entry( +/*====================*/ + dict_index_t* index, /*!< in: secondary index */ dtuple_t* entry, /*!< in/out: index entry to insert */ - ulint n_ext, /*!< in: number of externally stored columns */ - ibool foreign,/*!< in: TRUE=check foreign key constraints - (foreign=FALSE only during CREATE INDEX) */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; + mem_heap_t* offsets_heap; + mem_heap_t* heap; - if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) { + if (UT_LIST_GET_FIRST(index->table->foreign_list)) { err = row_ins_check_foreign_constraints(index->table, index, entry, thr); if (err != DB_SUCCESS) { @@ -2389,29 +2895,59 @@ row_ins_index_entry( } } + ut_ad(thr_get_trx(thr)->id); + + offsets_heap = mem_heap_create(1024); + heap = mem_heap_create(1024); + /* Try first optimistic descent to the B-tree */ - err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry, - n_ext, thr); - if (err != DB_FAIL) { - if (index == dict_table_get_first_index(index->table) - && thr_get_trx(thr)->mysql_thd != 0) { - DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after"); - } - return(err); - } + log_free_check(); - /* Try then pessimistic descent to the B-tree */ + err = row_ins_sec_index_entry_low( + 0, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, 0, thr); + if (err == DB_FAIL) { + mem_heap_empty(heap); - err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry, - n_ext, thr); + /* Try then pessimistic descent to the B-tree */ + + log_free_check(); + + err = row_ins_sec_index_entry_low( + 0, BTR_MODIFY_TREE, index, + offsets_heap, heap, entry, 0, thr); + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); return(err); } +/***************************************************************//** +Inserts an index entry to index. Tries first optimistic, then pessimistic +descent down the tree. If the entry matches enough to a delete marked record, +performs the insert by updating or delete unmarking the delete marked +record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +static +dberr_t +row_ins_index_entry( +/*================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr) /*!< in: query thread */ +{ + if (dict_index_is_clust(index)) { + return(row_ins_clust_index_entry(index, entry, thr, 0)); + } else { + return(row_ins_sec_index_entry(index, entry, thr)); + } +} + /***********************************************************//** Sets the values of the dtuple fields in entry from the values of appropriate columns in row. */ -static +static __attribute__((nonnull)) void row_ins_index_entry_set_vals( /*=========================*/ @@ -2422,8 +2958,6 @@ row_ins_index_entry_set_vals( ulint n_fields; ulint i; - ut_ad(entry && row); - n_fields = dtuple_get_n_fields(entry); for (i = 0; i < n_fields; i++) { @@ -2466,14 +3000,14 @@ row_ins_index_entry_set_vals( Inserts a single index entry to the table. 
@return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins_index_entry_step( /*=====================*/ ins_node_t* node, /*!< in: row insert node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; ut_ad(dtuple_check_typed(node->row)); @@ -2481,7 +3015,16 @@ row_ins_index_entry_step( ut_ad(dtuple_check_typed(node->entry)); - err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr); + err = row_ins_index_entry(node->index, node->entry, thr); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_ins_index_entry_step"); + } +#endif /* UNIV_DEBUG */ return(err); } @@ -2580,16 +3123,14 @@ row_ins_get_row_from_select( Inserts a row to a table. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_ins( /*====*/ ins_node_t* node, /*!< in: row insert node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; - - ut_ad(node && thr); + dberr_t err; if (node->state == INS_NODE_ALLOC_ROW_ID) { @@ -2625,6 +3166,10 @@ row_ins( node->index = dict_table_get_next_index(node->index); node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry); + DBUG_EXECUTE_IF( + "row_ins_skip_sec", + node->index = NULL; node->entry = NULL; break;); + /* Skip corrupted secondary index and its entry */ while (node->index && dict_index_is_corrupted(node->index)) { @@ -2654,7 +3199,7 @@ row_ins_step( que_node_t* parent; sel_node_t* sel_node; trx_t* trx; - ulint err; + dberr_t err; ut_ad(thr); @@ -2687,6 +3232,8 @@ row_ins_step( if (node->state == INS_NODE_SET_IX_LOCK) { + node->state = INS_NODE_ALLOC_ROW_ID; + /* It may be that the current session has not yet started its transaction, or it has been committed: */ @@ -2698,6 +3245,9 @@ row_ins_step( err = lock_table(0, node->table, LOCK_IX, thr); + DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait", + err = DB_LOCK_WAIT;); + if (err != DB_SUCCESS) { goto error_handling; @@ -2705,8 +3255,6 @@ row_ins_step( node->trx_id = trx->id; same_trx: - node->state = INS_NODE_ALLOC_ROW_ID; - if (node->ins_type == INS_SEARCHED) { /* Reset the cursor */ sel_node->state = SEL_NODE_OPEN; @@ -2735,7 +3283,7 @@ same_trx: err = row_ins(node, thr); error_handling: - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { /* err == DB_LOCK_WAIT or SQL error detected */ diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc new file mode 100644 index 00000000000..b373b70ab7a --- /dev/null +++ b/storage/innobase/row/row0log.cc @@ -0,0 +1,3219 @@ +/***************************************************************************** + +Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0log.cc +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#include "row0log.h" + +#ifdef UNIV_NONINL +#include "row0log.ic" +#endif + +#include "row0row.h" +#include "row0ins.h" +#include "row0upd.h" +#include "row0merge.h" +#include "row0ext.h" +#include "data0data.h" +#include "que0que.h" +#include "handler0alter.h" + +#include<set> + +/** Table row modification operations during online table rebuild. +Delete-marked records are not copied to the rebuilt table. */ +enum row_tab_op { + /** Insert a record */ + ROW_T_INSERT = 0x41, + /** Update a record in place */ + ROW_T_UPDATE, + /** Delete (purge) a record */ + ROW_T_DELETE +}; + +/** Index record modification operations during online index creation */ +enum row_op { + /** Insert a record */ + ROW_OP_INSERT = 0x61, + /** Delete a record */ + ROW_OP_DELETE +}; + +#ifdef UNIV_DEBUG +/** Write information about the applied record to the error log */ +# define ROW_LOG_APPLY_PRINT +#endif /* UNIV_DEBUG */ + +#ifdef ROW_LOG_APPLY_PRINT +/** When set, write information about the applied record to the error log */ +static bool row_log_apply_print; +#endif /* ROW_LOG_APPLY_PRINT */ + +/** Size of the modification log entry header, in bytes */ +#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/ + +/** Log block for modifications during online index creation */ +struct row_log_buf_t { + byte* block; /*!< file block buffer */ + mrec_buf_t buf; /*!< buffer for accessing a record + that spans two blocks */ + ulint blocks; /*!< current position in blocks */ + ulint bytes; /*!< current position within buf */ +}; + +/** Set of transactions that rolled back inserts of BLOBs during +online table rebuild */ +typedef std::set<trx_id_t> trx_id_set; + +/** @brief Buffer for logging modifications during online index creation + +All modifications to an index that is being created will be logged by +row_log_online_op() to this buffer. + +All modifications to a table that is being rebuilt will be logged by +row_log_table_delete(), row_log_table_update(), row_log_table_insert() +to this buffer. + +When head.blocks == tail.blocks, the reader will access tail.block +directly. When also head.bytes == tail.bytes, both counts will be +reset to 0 and the file will be truncated. 
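A minimal, self-contained sketch (not part of this patch) of the head/tail bookkeeping described above. It assumes the fixed block size is srv_sort_buf_size, as in the writer code below; the struct and helper names here are illustrative only and do not exist in InnoDB:

    #include <cstdint>
    #include <cstddef>

    struct log_pos {
        uint64_t blocks;  /* full blocks already spilled (writer) or consumed (reader) */
        size_t   bytes;   /* position within the current in-memory block */
    };

    /* The reader serves records straight from the writer's in-memory
       tail block while both sides are on the same block count. */
    static bool read_from_tail_block(const log_pos& head, const log_pos& tail)
    {
        return head.blocks == tail.blocks;
    }

    /* Otherwise the reader fetches an already-spilled block from the log
       file; block i starts at offset i * block_size, matching
       byte_offset = log->tail.blocks * srv_sort_buf_size in the writer below. */
    static uint64_t spilled_block_offset(uint64_t block_no, uint64_t block_size)
    {
        return block_no * block_size;
    }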
*/ +struct row_log_t { + int fd; /*!< file descriptor */ + ib_mutex_t mutex; /*!< mutex protecting trx_log, error, + max_trx and tail */ + trx_id_set* trx_rb; /*!< set of transactions that rolled back + inserts of BLOBs during online table rebuild; + protected by mutex */ + dict_table_t* table; /*!< table that is being rebuilt, + or NULL when this is a secondary + index that is being created online */ + bool same_pk;/*!< whether the definition of the PRIMARY KEY + has remained the same */ + const dtuple_t* add_cols; + /*!< default values of added columns, or NULL */ + const ulint* col_map;/*!< mapping of old column numbers to + new ones, or NULL if !table */ + dberr_t error; /*!< error that occurred during online + table rebuild */ + trx_id_t max_trx;/*!< biggest observed trx_id in + row_log_online_op(); + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t tail; /*!< writer context; + protected by mutex and index->lock S-latch, + or by index->lock X-latch only */ + row_log_buf_t head; /*!< reader context; protected by MDL only; + modifiable by row_log_apply_ops() */ + ulint size; /*!< allocated size */ +}; + +/******************************************************//** +Logs an operation to a secondary index that is (or was) being created. */ +UNIV_INTERN +void +row_log_online_op( +/*==============*/ + dict_index_t* index, /*!< in/out: index, S or X latched */ + const dtuple_t* tuple, /*!< in: index tuple */ + trx_id_t trx_id) /*!< in: transaction ID for insert, + or 0 for delete */ +{ + byte* b; + ulint extra_size; + ulint size; + ulint mrec_size; + ulint avail_size; + row_log_t* log; + + ut_ad(dtuple_validate(tuple)); + ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index)) { + return; + } + + ut_ad(dict_index_is_online_ddl(index)); + + /* Compute the size of the record. This differs from + row_merge_buf_encode(), because here we do not encode + extra_size+1 (and reserve 0 as the end-of-chunk marker). */ + + size = rec_get_converted_size_temp( + index, tuple->fields, tuple->n_fields, &extra_size); + ut_ad(size >= extra_size); + ut_ad(size <= sizeof log->tail.buf); + + mrec_size = ROW_LOG_HEADER_SIZE + + (extra_size >= 0x80) + size + + (trx_id ? 
DATA_TRX_ID_LEN : 0); + + log = index->online_log; + mutex_enter(&log->mutex); + + if (trx_id > log->max_trx) { + log->max_trx = trx_id; + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + + ut_ad(log->tail.bytes < srv_sort_buf_size); + avail_size = srv_sort_buf_size - log->tail.bytes; + + if (mrec_size > avail_size) { + b = log->tail.buf; + } else { + b = log->tail.block + log->tail.bytes; + } + + if (trx_id != 0) { + *b++ = ROW_OP_INSERT; + trx_write_trx_id(b, trx_id); + b += DATA_TRX_ID_LEN; + } else { + *b++ = ROW_OP_DELETE; + } + + if (extra_size < 0x80) { + *b++ = (byte) extra_size; + } else { + ut_ad(extra_size < 0x8000); + *b++ = (byte) (0x80 | (extra_size >> 8)); + *b++ = (byte) extra_size; + } + + rec_convert_dtuple_to_temp( + b + extra_size, index, tuple->fields, tuple->n_fields); + b += size; + + if (mrec_size >= avail_size) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + ibool ret; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (mrec_size == avail_size) { + ut_ad(b == &log->tail.block[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + mrec_size); + memcpy(log->tail.block + log->tail.bytes, + log->tail.buf, avail_size); + } + UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size); + ret = os_file_write( + "(modification log)", + OS_FILE_FROM_FD(log->fd), + log->tail.block, byte_offset, srv_sort_buf_size); + log->tail.blocks++; + if (!ret) { +write_failed: + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. */ + index->type |= DICT_CORRUPT; + } + UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail_size, + mrec_size - avail_size); + log->tail.bytes = mrec_size - avail_size; + } else { + log->tail.bytes += mrec_size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + mutex_exit(&log->mutex); +} + +/******************************************************//** +Gets the error status of the online index rebuild log. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +row_log_table_get_error( +/*====================*/ + const dict_index_t* index) /*!< in: clustered index of a table + that is being rebuilt online */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + return(index->online_log->error); +} + +/******************************************************//** +Starts logging an operation to a table that is being rebuilt. +@return pointer to log, or NULL if no logging is necessary */ +static __attribute__((nonnull, warn_unused_result)) +byte* +row_log_table_open( +/*===============*/ + row_log_t* log, /*!< in/out: online rebuild log */ + ulint size, /*!< in: size of log record */ + ulint* avail) /*!< out: available size for log record */ +{ + mutex_enter(&log->mutex); + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + + if (log->error != DB_SUCCESS) { + mutex_exit(&log->mutex); + return(NULL); + } + + ut_ad(log->tail.bytes < srv_sort_buf_size); + *avail = srv_sort_buf_size - log->tail.bytes; + + if (size > *avail) { + return(log->tail.buf); + } else { + return(log->tail.block + log->tail.bytes); + } +} + +/******************************************************//** +Stops logging an operation to a table that is being rebuilt. 
*/ +static __attribute__((nonnull)) +void +row_log_table_close_func( +/*=====================*/ + row_log_t* log, /*!< in/out: online rebuild log */ +#ifdef UNIV_DEBUG + const byte* b, /*!< in: end of log record */ +#endif /* UNIV_DEBUG */ + ulint size, /*!< in: size of log record */ + ulint avail) /*!< in: available size for log record */ +{ + ut_ad(mutex_own(&log->mutex)); + + if (size >= avail) { + const os_offset_t byte_offset + = (os_offset_t) log->tail.blocks + * srv_sort_buf_size; + ibool ret; + + if (byte_offset + srv_sort_buf_size >= srv_online_max_size) { + goto write_failed; + } + + if (size == avail) { + ut_ad(b == &log->tail.block[srv_sort_buf_size]); + } else { + ut_ad(b == log->tail.buf + size); + memcpy(log->tail.block + log->tail.bytes, + log->tail.buf, avail); + } + UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size); + ret = os_file_write( + "(modification log)", + OS_FILE_FROM_FD(log->fd), + log->tail.block, byte_offset, srv_sort_buf_size); + log->tail.blocks++; + if (!ret) { +write_failed: + log->error = DB_ONLINE_LOG_TOO_BIG; + } + UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size); + memcpy(log->tail.block, log->tail.buf + avail, size - avail); + log->tail.bytes = size - avail; + } else { + log->tail.bytes += size; + ut_ad(b == log->tail.block + log->tail.bytes); + } + + UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); + mutex_exit(&log->mutex); +} + +#ifdef UNIV_DEBUG +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(log, b, size, avail) +#else /* UNIV_DEBUG */ +# define row_log_table_close(log, b, size, avail) \ + row_log_table_close_func(log, size, avail) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +Logs a delete operation to a table that is being rebuilt. +This will be merged in row_log_table_apply_delete(). */ +UNIV_INTERN +void +row_log_table_delete( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + trx_id_t trx_id) /*!< in: DB_TRX_ID of the record before + it was deleted */ +{ + ulint old_pk_extra_size; + ulint old_pk_size; + ulint ext_size = 0; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + const dtuple_t* old_pk; + row_ext_t* ext; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index) + || !dict_index_is_online_ddl(index) + || index->online_log->error != DB_SUCCESS) { + return; + } + + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index(new_table); + + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + + /* Create the tuple PRIMARY KEY, DB_TRX_ID in the new_table. */ + if (index->online_log->same_pk) { + byte* db_trx_id; + dtuple_t* tuple; + ut_ad(new_index->n_uniq == index->n_uniq); + + /* The PRIMARY KEY and DB_TRX_ID are in the first + fields of the record. 
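+ Copy the PRIMARY KEY fields directly from the record and append
+ the DB_TRX_ID value that was passed in by the caller.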
*/ + heap = mem_heap_create( + DATA_TRX_ID_LEN + + DTUPLE_EST_ALLOC(new_index->n_uniq + 1)); + old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 1); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_index->n_uniq); + + for (ulint i = 0; i < new_index->n_uniq; i++) { + ulint len; + const void* field = rec_get_nth_field( + rec, offsets, i, &len); + dfield_t* dfield = dtuple_get_nth_field( + tuple, i); + ut_ad(len != UNIV_SQL_NULL); + ut_ad(!rec_offs_nth_extern(offsets, i)); + dfield_set_data(dfield, field, len); + } + + db_trx_id = static_cast<byte*>( + mem_heap_alloc(heap, DATA_TRX_ID_LEN)); + trx_write_trx_id(db_trx_id, trx_id); + + dfield_set_data(dtuple_get_nth_field(tuple, new_index->n_uniq), + db_trx_id, DATA_TRX_ID_LEN); + } else { + /* The PRIMARY KEY has changed. Translate the tuple. */ + dfield_t* dfield; + + old_pk = row_log_table_get_pk(rec, index, offsets, &heap); + + if (!old_pk) { + ut_ad(index->online_log->error != DB_SUCCESS); + return; + } + + /* Remove DB_ROLL_PTR. */ + ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(new_index)); + ut_ad(dtuple_get_n_fields(old_pk) + == dict_index_get_n_unique(new_index) + 2); + const_cast<ulint&>(old_pk->n_fields)--; + + /* Overwrite DB_TRX_ID with the old trx_id. */ + dfield = dtuple_get_nth_field(old_pk, new_index->n_uniq); + ut_ad(dfield_get_type(dfield)->mtype == DATA_SYS); + ut_ad(dfield_get_type(dfield)->prtype + == (DATA_NOT_NULL | DATA_TRX_ID)); + ut_ad(dfield_get_len(dfield) == DATA_TRX_ID_LEN); + trx_write_trx_id(static_cast<byte*>(dfield->data), trx_id); + } + + ut_ad(dtuple_get_n_fields(old_pk) > 1); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + + mrec_size = 4 + old_pk_size; + + /* If the row is marked as rollback, we will need to + log the enough prefix of the BLOB unless both the + old and new table are in COMPACT or REDUNDANT format */ + if ((dict_table_get_format(index->table) >= UNIV_FORMAT_B + || dict_table_get_format(new_table) >= UNIV_FORMAT_B) + && row_log_table_is_rollback(index, trx_id)) { + if (rec_offs_any_extern(offsets)) { + /* Build a cache of those off-page column + prefixes that are referenced by secondary + indexes. It can be that none of the off-page + columns are needed. */ + row_build(ROW_COPY_DATA, index, rec, + offsets, NULL, NULL, NULL, &ext, heap); + if (ext) { + /* Log the row_ext_t, ext->ext and ext->buf */ + ext_size = ext->n_ext * ext->max_len + + sizeof(*ext) + + ext->n_ext * sizeof(ulint) + + (ext->n_ext - 1) * sizeof ext->len; + mrec_size += ext_size; + } + } + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = ROW_T_DELETE; + *b++ = static_cast<byte>(old_pk_extra_size); + + /* Log the size of external prefix we saved */ + mach_write_to_2(b, ext_size); + b += 2; + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + + b += old_pk_size; + + if (ext_size) { + ulint cur_ext_size = sizeof(*ext) + + (ext->n_ext - 1) * sizeof ext->len; + + memcpy(b, ext, cur_ext_size); + b += cur_ext_size; + + /* Check if we need to col_map to adjust the column + number. If columns were added/removed/reordered, + adjust the column number. 
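+ The cached ext->ext entries still refer to column numbers of the
+ old table; translate them through col_map to the corresponding
+ columns of the rebuilt table before logging them.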
*/ + if (const ulint* col_map = + index->online_log->col_map) { + for (ulint i = 0; i < ext->n_ext; i++) { + const_cast<ulint&>(ext->ext[i]) = + col_map[ext->ext[i]]; + } + } + + memcpy(b, ext->ext, ext->n_ext * sizeof(*ext->ext)); + b += ext->n_ext * sizeof(*ext->ext); + + ext_size -= cur_ext_size + + ext->n_ext * sizeof(*ext->ext); + memcpy(b, ext->buf, ext_size); + b += ext_size; + } + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. */ +static __attribute__((nonnull(1,2,3))) +void +row_log_table_low_redundant( +/*========================*/ + const rec_t* rec, /*!< in: clustered index leaf + page record in ROW_FORMAT=REDUNDANT, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, + false if update */ + const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value + (if !insert and a PRIMARY KEY + is being created) */ + const dict_index_t* new_index) + /*!< in: clustered index of the + new table, not latched */ +{ + ulint old_pk_size; + ulint old_pk_extra_size; + ulint size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + mem_heap_t* heap = NULL; + dtuple_t* tuple; + + ut_ad(!page_is_comp(page_align(rec))); + ut_ad(dict_index_get_n_fields(index) == rec_get_n_fields_old(rec)); + + heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields)); + tuple = dtuple_create(heap, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index)); + + if (rec_get_1byte_offs_flag(rec)) { + for (ulint i = 0; i < index->n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + } + } else { + for (ulint i = 0; i < index->n_fields; i++) { + dfield_t* dfield; + ulint len; + const void* field; + + dfield = dtuple_get_nth_field(tuple, i); + field = rec_get_nth_field_old(rec, i, &len); + + dfield_set_data(dfield, field, len); + + if (rec_2_is_field_extern(rec, i)) { + dfield_set_ext(dfield); + } + } + } + + size = rec_get_converted_size_temp( + index, tuple->fields, tuple->n_fields, &extra_size); + + mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = insert ? 
ROW_T_INSERT : ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + rec_convert_dtuple_to_temp( + b + extra_size, index, tuple->fields, tuple->n_fields); + b += size; + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } + + mem_heap_free(heap); +} + +/******************************************************//** +Logs an insert or update to a table that is being rebuilt. */ +static __attribute__((nonnull(1,2,3))) +void +row_log_table_low( +/*==============*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + bool insert, /*!< in: true if insert, false if update */ + const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert + and a PRIMARY KEY is being created) */ +{ + ulint omit_size; + ulint old_pk_size; + ulint old_pk_extra_size; + ulint extra_size; + ulint mrec_size; + ulint avail_size; + const dict_index_t* new_index = dict_table_get_first_index( + index->online_log->table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_clust(new_index)); + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index)); + ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX); + ut_ad(page_is_leaf(page_align(rec))); + ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets)); + + if (dict_index_is_corrupted(index) + || !dict_index_is_online_ddl(index) + || index->online_log->error != DB_SUCCESS) { + return; + } + + if (!rec_offs_comp(offsets)) { + row_log_table_low_redundant( + rec, index, offsets, insert, old_pk, new_index); + return; + } + + ut_ad(page_is_comp(page_align(rec))); + ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY); + + omit_size = REC_N_NEW_EXTRA_BYTES; + + extra_size = rec_offs_extra_size(offsets) - omit_size; + + mrec_size = rec_offs_size(offsets) - omit_size + + ROW_LOG_HEADER_SIZE + (extra_size >= 0x80); + + if (insert || index->online_log->same_pk) { + ut_ad(!old_pk); + old_pk_extra_size = old_pk_size = 0; + } else { + ut_ad(old_pk); + ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp); + ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 2)->len); + ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field( + old_pk, old_pk->n_fields - 1)->len); + + old_pk_size = rec_get_converted_size_temp( + new_index, old_pk->fields, old_pk->n_fields, + &old_pk_extra_size); + ut_ad(old_pk_extra_size < 0x100); + mrec_size += 1/*old_pk_extra_size*/ + old_pk_size; + } + + if (byte* b = row_log_table_open(index->online_log, + mrec_size, &avail_size)) { + *b++ = insert ? 
ROW_T_INSERT : ROW_T_UPDATE; + + if (old_pk_size) { + *b++ = static_cast<byte>(old_pk_extra_size); + + rec_convert_dtuple_to_temp( + b + old_pk_extra_size, new_index, + old_pk->fields, old_pk->n_fields); + b += old_pk_size; + } + + if (extra_size < 0x80) { + *b++ = static_cast<byte>(extra_size); + } else { + ut_ad(extra_size < 0x8000); + *b++ = static_cast<byte>(0x80 | (extra_size >> 8)); + *b++ = static_cast<byte>(extra_size); + } + + memcpy(b, rec - rec_offs_extra_size(offsets), extra_size); + b += extra_size; + memcpy(b, rec, rec_offs_data_size(offsets)); + b += rec_offs_data_size(offsets); + + row_log_table_close( + index->online_log, b, mrec_size, avail_size); + } +} + +/******************************************************//** +Logs an update to a table that is being rebuilt. +This will be merged in row_log_table_apply_update(). */ +UNIV_INTERN +void +row_log_table_update( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + const dtuple_t* old_pk) /*!< in: row_log_table_get_pk() + before the update */ +{ + row_log_table_low(rec, index, offsets, false, old_pk); +} + +/******************************************************//** +Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR +of a table that is being rebuilt. +@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table, +or NULL if the PRIMARY KEY definition does not change */ +UNIV_INTERN +const dtuple_t* +row_log_table_get_pk( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */ + mem_heap_t** heap) /*!< in/out: memory heap where allocated */ +{ + dtuple_t* tuple = NULL; + row_log_t* log = index->online_log; + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!offsets || rec_offs_validate(rec, index, offsets)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + ut_ad(log); + ut_ad(log->table); + + if (log->same_pk) { + /* The PRIMARY KEY columns are unchanged. */ + return(NULL); + } + + mutex_enter(&log->mutex); + + /* log->error is protected by log->mutex. 
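+ It may also be set further down in this function, for example to
+ DB_CORRUPTION or DB_INVALID_NULL, so it is checked and updated
+ while the mutex is held.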
*/ + if (log->error == DB_SUCCESS) { + dict_table_t* new_table = log->table; + dict_index_t* new_index + = dict_table_get_first_index(new_table); + const ulint new_n_uniq + = dict_index_get_n_unique(new_index); + + if (!*heap) { + ulint size = 0; + + if (!offsets) { + size += (1 + REC_OFFS_HEADER_SIZE + + index->n_fields) + * sizeof *offsets; + } + + for (ulint i = 0; i < new_n_uniq; i++) { + size += dict_col_get_min_size( + dict_index_get_nth_col(new_index, i)); + } + + *heap = mem_heap_create( + DTUPLE_EST_ALLOC(new_n_uniq + 2) + size); + } + + if (!offsets) { + offsets = rec_get_offsets(rec, index, NULL, + ULINT_UNDEFINED, heap); + } + + tuple = dtuple_create(*heap, new_n_uniq + 2); + dict_index_copy_types(tuple, new_index, tuple->n_fields); + dtuple_set_n_fields_cmp(tuple, new_n_uniq); + + for (ulint new_i = 0; new_i < new_n_uniq; new_i++) { + dict_field_t* ifield; + dfield_t* dfield; + const dict_col_t* new_col; + const dict_col_t* col; + ulint col_no; + ulint i; + ulint len; + const byte* field; + + ifield = dict_index_get_nth_field(new_index, new_i); + dfield = dtuple_get_nth_field(tuple, new_i); + new_col = dict_field_get_col(ifield); + col_no = new_col->ind; + + for (ulint old_i = 0; old_i < index->table->n_cols; + old_i++) { + if (col_no == log->col_map[old_i]) { + col_no = old_i; + goto copy_col; + } + } + + /* No matching column was found in the old + table, so this must be an added column. + Copy the default value. */ + ut_ad(log->add_cols); + dfield_copy(dfield, + dtuple_get_nth_field( + log->add_cols, col_no)); + continue; + +copy_col: + col = dict_table_get_nth_col(index->table, col_no); + + i = dict_col_get_clust_pos(col, index); + + if (i == ULINT_UNDEFINED) { + ut_ad(0); + log->error = DB_CORRUPTION; + tuple = NULL; + goto func_exit; + } + + field = rec_get_nth_field(rec, offsets, i, &len); + + if (len == UNIV_SQL_NULL) { + log->error = DB_INVALID_NULL; + tuple = NULL; + goto func_exit; + } + + if (rec_offs_nth_extern(offsets, i)) { + ulint field_len = ifield->prefix_len; + byte* blob_field; + const ulint max_len = + DICT_MAX_FIELD_LEN_BY_FORMAT( + new_table); + + if (!field_len) { + field_len = ifield->fixed_len; + if (!field_len) { + field_len = max_len + 1; + } + } + + blob_field = static_cast<byte*>( + mem_heap_alloc(*heap, field_len)); + + len = btr_copy_externally_stored_field_prefix( + blob_field, field_len, + dict_table_zip_size(index->table), + field, len); + if (len == max_len + 1) { + log->error = DB_TOO_BIG_INDEX_COL; + tuple = NULL; + goto func_exit; + } + + dfield_set_data(dfield, blob_field, len); + } else { + if (ifield->prefix_len + && ifield->prefix_len < len) { + len = ifield->prefix_len; + } + + dfield_set_data( + dfield, + mem_heap_dup(*heap, field, len), len); + } + } + + const byte* trx_roll = rec + + row_get_trx_id_offset(index, offsets); + + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq), + trx_roll, DATA_TRX_ID_LEN); + dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1), + trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN); + } + +func_exit: + mutex_exit(&log->mutex); + return(tuple); +} + +/******************************************************//** +Logs an insert to a table that is being rebuilt. +This will be merged in row_log_table_apply_insert(). 
*/ +UNIV_INTERN +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */ +{ + row_log_table_low(rec, index, offsets, true, NULL); +} + +/******************************************************//** +Notes that a transaction is being rolled back. */ +UNIV_INTERN +void +row_log_table_rollback( +/*===================*/ + dict_index_t* index, /*!< in/out: clustered index */ + trx_id_t trx_id) /*!< in: transaction being rolled back */ +{ + ut_ad(dict_index_is_clust(index)); +#ifdef UNIV_DEBUG + ibool corrupt = FALSE; + ut_ad(trx_rw_is_active(trx_id, &corrupt)); + ut_ad(!corrupt); +#endif /* UNIV_DEBUG */ + + /* Protect transitions of index->online_status and access to + index->online_log. */ + rw_lock_s_lock(&index->lock); + + if (dict_index_is_online_ddl(index)) { + ut_ad(index->online_log); + ut_ad(index->online_log->table); + mutex_enter(&index->online_log->mutex); + trx_id_set* trxs = index->online_log->trx_rb; + + if (!trxs) { + index->online_log->trx_rb = trxs = new trx_id_set(); + } + + trxs->insert(trx_id); + + mutex_exit(&index->online_log->mutex); + } + + rw_lock_s_unlock(&index->lock); +} + +/******************************************************//** +Check if a transaction rollback has been initiated. +@return true if inserts of this transaction were rolled back */ +UNIV_INTERN +bool +row_log_table_is_rollback( +/*======================*/ + const dict_index_t* index, /*!< in: clustered index */ + trx_id_t trx_id) /*!< in: transaction id */ +{ + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(index->online_log); + + if (const trx_id_set* trxs = index->online_log->trx_rb) { + mutex_enter(&index->online_log->mutex); + bool is_rollback = trxs->find(trx_id) != trxs->end(); + mutex_exit(&index->online_log->mutex); + + return(is_rollback); + } + + return(false); +} + +/******************************************************//** +Converts a log record to a table row. +@return converted row, or NULL if the conversion fails +or the transaction has been rolled back */ +static __attribute__((nonnull, warn_unused_result)) +const dtuple_t* +row_log_table_apply_convert_mrec( +/*=============================*/ + const mrec_t* mrec, /*!< in: merge record */ + dict_index_t* index, /*!< in: index of mrec */ + const ulint* offsets, /*!< in: offsets of mrec */ + const row_log_t* log, /*!< in: rebuild context */ + mem_heap_t* heap, /*!< in/out: memory heap */ + trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ + dberr_t* error) /*!< out: DB_SUCCESS or + reason of failure */ +{ + dtuple_t* row; + +#ifdef UNIV_SYNC_DEBUG + /* This prevents BLOBs from being freed, in case an insert + transaction rollback starts after row_log_table_is_rollback(). */ + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + if (row_log_table_is_rollback(index, trx_id)) { + row = NULL; + goto func_exit; + } + + /* This is based on row_build(). 
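+ The logged record uses the old table definition; it is converted
+ to a row of the new table by applying log->col_map, and any added
+ columns get their default values from log->add_cols.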
*/ + if (log->add_cols) { + row = dtuple_copy(log->add_cols, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(log->table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(log->table)); + dict_table_copy_types(row, log->table); + } + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field + = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + + const dict_col_t* col + = dict_field_get_col(ind_field); + ulint col_no + = log->col_map[dict_col_get_no(col)]; + + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } + + dfield_t* dfield + = dtuple_get_nth_field(row, col_no); + ulint len; + const void* data; + + if (rec_offs_nth_extern(offsets, i)) { + ut_ad(rec_offs_any_extern(offsets)); + data = btr_rec_copy_externally_stored_field( + mrec, offsets, + dict_table_zip_size(index->table), + i, &len, heap); + ut_a(data); + } else { + data = rec_get_nth_field(mrec, offsets, i, &len); + } + + dfield_set_data(dfield, data, len); + + /* See if any columns were changed to NULL or NOT NULL. */ + const dict_col_t* new_col + = dict_table_get_nth_col(log->table, col_no); + ut_ad(new_col->mtype == col->mtype); + + /* Assert that prtype matches except for nullability. */ + ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL)); + ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype) + & ~DATA_NOT_NULL)); + + if (new_col->prtype == col->prtype) { + continue; + } + + if ((new_col->prtype & DATA_NOT_NULL) + && dfield_is_null(dfield)) { + /* We got a NULL value for a NOT NULL column. */ + *error = DB_INVALID_NULL; + return(NULL); + } + + /* Adjust the DATA_NOT_NULL flag in the parsed row. */ + dfield_get_type(dfield)->prtype = new_col->prtype; + + ut_ad(dict_col_type_assert_equal(new_col, + dfield_get_type(dfield))); + } + +func_exit: + *error = DB_SUCCESS; + return(row); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. 
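+The row is inserted into the clustered index first and then into
+each secondary index, skipping any fulltext indexes.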
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert_low( +/*===========================*/ + que_thr_t* thr, /*!< in: query graph */ + const dtuple_t* row, /*!< in: table row + in the old table definition */ + trx_id_t trx_id, /*!< in: trx_id of the row */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup) /*!< in/out: for reporting + duplicate key errors */ +{ + dberr_t error; + dtuple_t* entry; + const row_log_t*log = dup->index->online_log; + dict_index_t* index = dict_table_get_first_index(log->table); + + ut_ad(dtuple_validate(row)); + ut_ad(trx_id); + +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply insert " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + dtuple_print(stderr, row); + } +#endif /* ROW_LOG_APPLY_PRINT */ + + static const ulint flags + = (BTR_CREATE_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG); + + entry = row_build_index_entry(row, NULL, index, heap); + + error = row_ins_clust_index_entry_low( + flags, BTR_MODIFY_TREE, index, index->n_uniq, entry, 0, thr); + + switch (error) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + /* The row had already been copied to the table. */ + return(DB_SUCCESS); + default: + return(error); + } + + do { + if (!(index = dict_table_get_next_index(index))) { + break; + } + + if (index->type & DICT_FTS) { + continue; + } + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + flags, BTR_MODIFY_TREE, + index, offsets_heap, heap, entry, trx_id, thr); + } while (error == DB_SUCCESS); + + return(error); +} + +/******************************************************//** +Replays an insert operation on a table that was rebuilt. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_insert( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + const mrec_t* mrec, /*!< in: record to insert */ + const ulint* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + trx_id_t trx_id) /*!< in: DB_TRX_ID of mrec */ +{ + const row_log_t*log = dup->index->online_log; + dberr_t error; + const dtuple_t* row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, trx_id, &error); + + ut_ad(error == DB_SUCCESS || !row); + /* Handling of duplicate key error requires storing + of offending key in a record buffer. */ + ut_ad(error != DB_DUPLICATE_KEY); + + if (error != DB_SUCCESS) + return(error); + + if (row) { + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + } + return(error); +} + +/******************************************************//** +Deletes a record from a table that is being rebuilt. 
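+The clustered index record positioned by pcur is removed together with
+all of its secondary index entries.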
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1, 2, 4, 5), warn_unused_result)) +dberr_t +row_log_table_apply_delete_low( +/*===========================*/ + btr_pcur_t* pcur, /*!< in/out: B-tree cursor, + will be trashed */ + const ulint* offsets, /*!< in: offsets on pcur */ + const row_ext_t* save_ext, /*!< in: saved external field + info, or NULL */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction, + will be committed */ +{ + dberr_t error; + row_ext_t* ext; + dtuple_t* row; + dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index; + + ut_ad(dict_index_is_clust(index)); + +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply delete " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + rec_print_new(stderr, btr_pcur_get_rec(pcur), offsets); + } +#endif /* ROW_LOG_APPLY_PRINT */ + if (dict_table_get_next_index(index)) { + /* Build a row template for purging secondary index entries. */ + row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(pcur), + offsets, NULL, NULL, NULL, + save_ext ? NULL : &ext, heap); + if (!save_ext) { + save_ext = ext; + } + } else { + row = NULL; + } + + btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, RB_NONE, mtr); + mtr_commit(mtr); + + if (error != DB_SUCCESS) { + return(error); + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (index->type & DICT_FTS) { + continue; + } + + const dtuple_t* entry = row_build_index_entry( + row, save_ext, index, heap); + mtr_start(mtr); + btr_pcur_open(index, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, pcur, mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(pcur)) + || btr_pcur_get_low_match(pcur) < index->n_uniq) { + /* All secondary index entries should be + found, because new_table is being modified by + this thread only, and all indexes should be + updated in sync. */ + mtr_commit(mtr); + return(DB_INDEX_CORRUPT); + } + + btr_cur_pessimistic_delete(&error, FALSE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, RB_NONE, mtr); + mtr_commit(mtr); + } + + return(error); +} + +/******************************************************//** +Replays a delete operation on a table that was rebuilt. 
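+The record is located by the logged PRIMARY KEY value and is only
+removed if its DB_TRX_ID still matches the buffered one.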
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull(1, 3, 4, 5, 6, 7), warn_unused_result)) +dberr_t +row_log_table_apply_delete( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: merge record */ + const ulint* moffsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dict_table_t* new_table, /*!< in: rebuilt table */ + const row_ext_t* save_ext) /*!< in: saved external field + info, or NULL */ +{ + dict_index_t* index = dict_table_get_first_index(new_table); + dtuple_t* old_pk; + mtr_t mtr; + btr_pcur_t pcur; + ulint* offsets; + + ut_ad(rec_offs_n_fields(moffsets) + == dict_index_get_n_unique(index) + 1); + ut_ad(!rec_offs_any_extern(moffsets)); + + /* Convert the row to a search tuple. */ + old_pk = dtuple_create(heap, index->n_uniq + 1); + dict_index_copy_types(old_pk, index, old_pk->n_fields); + dtuple_set_n_fields_cmp(old_pk, index->n_uniq); + + for (ulint i = 0; i <= index->n_uniq; i++) { + ulint len; + const void* field; + field = rec_get_nth_field(mrec, moffsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + dfield_set_data(dtuple_get_nth_field(old_pk, i), + field, len); + } + + mtr_start(&mtr); + btr_pcur_open(index, old_pk, PAGE_CUR_LE, + BTR_MODIFY_TREE, &pcur, &mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + /* We did not request buffering. */ + break; + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + goto flag_ok; + } + ut_ad(0); +flag_ok: +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { +all_done: + mtr_commit(&mtr); + /* The record was not found. All done. */ + return(DB_SUCCESS); + } + + offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL, + ULINT_UNDEFINED, &offsets_heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + /* Only remove the record if DB_TRX_ID matches what was + buffered. */ + + { + ulint len; + const void* mrec_trx_id + = rec_get_nth_field(mrec, moffsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const void* rec_trx_id + = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets, + trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + if (memcmp(mrec_trx_id, rec_trx_id, DATA_TRX_ID_LEN)) { + goto all_done; + } + } + + return(row_log_table_apply_delete_low(&pcur, offsets, save_ext, + heap, &mtr)); +} + +/******************************************************//** +Replays an update operation on a table that was rebuilt. 
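+If no matching record is found, the row is inserted instead. If the
+PRIMARY KEY was changed or the old record contains externally stored
+columns, the update is performed as a delete followed by an insert.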
+@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_update( +/*=======================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in the + old clustered index */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in the new + clustered index */ + const mrec_t* mrec, /*!< in: new value */ + const ulint* offsets, /*!< in: offsets of mrec */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */ + const dtuple_t* old_pk) /*!< in: PRIMARY KEY and + DB_TRX_ID,DB_ROLL_PTR + of the old value, + or PRIMARY KEY if same_pk */ +{ + const row_log_t*log = dup->index->online_log; + const dtuple_t* row; + dict_index_t* index = dict_table_get_first_index(log->table); + mtr_t mtr; + btr_pcur_t pcur; + dberr_t error; + + ut_ad(dtuple_get_n_fields_cmp(old_pk) + == dict_index_get_n_unique(index)); + ut_ad(dtuple_get_n_fields(old_pk) + == dict_index_get_n_unique(index) + + (dup->index->online_log->same_pk ? 0 : 2)); + + row = row_log_table_apply_convert_mrec( + mrec, dup->index, offsets, log, heap, trx_id, &error); + + ut_ad(error == DB_SUCCESS || !row); + /* Handling of duplicate key error requires storing + of offending key in a record buffer. */ + ut_ad(error != DB_DUPLICATE_KEY); + + if (!row) { + return(error); + } + + mtr_start(&mtr); + btr_pcur_open(index, old_pk, PAGE_CUR_LE, + BTR_MODIFY_TREE, &pcur, &mtr); +#ifdef UNIV_DEBUG + switch (btr_pcur_get_btr_cur(&pcur)->flag) { + case BTR_CUR_DELETE_REF: + case BTR_CUR_DEL_MARK_IBUF: + case BTR_CUR_DELETE_IBUF: + case BTR_CUR_INSERT_TO_IBUF: + ut_ad(0);/* We did not request buffering. */ + case BTR_CUR_HASH: + case BTR_CUR_HASH_FAIL: + case BTR_CUR_BINARY: + break; + } +#endif /* UNIV_DEBUG */ + + if (page_rec_is_infimum(btr_pcur_get_rec(&pcur)) + || btr_pcur_get_low_match(&pcur) < index->n_uniq) { + mtr_commit(&mtr); +insert: + ut_ad(mtr.state == MTR_COMMITTED); + /* The row was not found. Insert it. */ + error = row_log_table_apply_insert_low( + thr, row, trx_id, offsets_heap, heap, dup); + if (error != DB_SUCCESS) { +err_exit: + /* Report the erroneous row using the new + version of the table. */ + innobase_row_to_mysql(dup->table, log->table, row); + } + + return(error); + } + + /* Update the record. */ + ulint* cur_offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), + index, NULL, ULINT_UNDEFINED, &offsets_heap); + + dtuple_t* entry = row_build_index_entry( + row, NULL, index, heap); + const upd_t* update = row_upd_build_difference_binary( + index, entry, btr_pcur_get_rec(&pcur), cur_offsets, + false, NULL, heap); + + error = DB_SUCCESS; + + if (!update->n_fields) { + /* Nothing to do. */ + goto func_exit; + } + + if (rec_offs_any_extern(cur_offsets)) { + /* If the record contains any externally stored + columns, perform the update by delete and insert, + because we will not write any undo log that would + allow purge to free any orphaned externally stored + columns. 
*/ +delete_insert: + error = row_log_table_apply_delete_low( + &pcur, cur_offsets, NULL, heap, &mtr); + ut_ad(mtr.state == MTR_COMMITTED); + + if (error != DB_SUCCESS) { + goto err_exit; + } + + goto insert; + } + + if (upd_get_nth_field(update, 0)->field_no < new_trx_id_col) { + if (dup->index->online_log->same_pk) { + /* The ROW_T_UPDATE log record should only be + written when the PRIMARY KEY fields of the + record did not change in the old table. We + can only get a change of PRIMARY KEY columns + in the rebuilt table if the PRIMARY KEY was + redefined (!same_pk). */ + ut_ad(0); + error = DB_CORRUPTION; + goto func_exit; + } + + /* The PRIMARY KEY columns have changed. + Delete the record with the old PRIMARY KEY value, + provided that it carries the same + DB_TRX_ID,DB_ROLL_PTR. Then, insert the new row. */ + ulint len; + const byte* cur_trx_roll = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + const dfield_t* new_trx_roll = dtuple_get_nth_field( + old_pk, new_trx_id_col); + /* We assume that DB_TRX_ID,DB_ROLL_PTR are stored + in one contiguous block. */ + ut_ad(rec_get_nth_field(mrec, offsets, trx_id_col + 1, &len) + == cur_trx_roll + DATA_TRX_ID_LEN); + ut_ad(len == DATA_ROLL_PTR_LEN); + ut_ad(new_trx_roll->len == DATA_TRX_ID_LEN); + ut_ad(dtuple_get_nth_field(old_pk, new_trx_id_col + 1) + -> len == DATA_ROLL_PTR_LEN); + ut_ad(static_cast<const byte*>( + dtuple_get_nth_field(old_pk, new_trx_id_col + 1) + ->data) + == static_cast<const byte*>(new_trx_roll->data) + + DATA_TRX_ID_LEN); + + if (!memcmp(cur_trx_roll, new_trx_roll->data, + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { + /* The old row exists. Remove it. */ + goto delete_insert; + } + + /* Unless we called row_log_table_apply_delete_low(), + this will likely cause a duplicate key error. */ + mtr_commit(&mtr); + goto insert; + } + + dtuple_t* old_row; + row_ext_t* old_ext; + + if (dict_table_get_next_index(index)) { + /* Construct the row corresponding to the old value of + the record. 
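+ The old row is only needed for adjusting secondary index entries
+ whose ordering fields are affected by this update.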
*/ + old_row = row_build( + ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur), + cur_offsets, NULL, NULL, NULL, &old_ext, heap); + ut_ad(old_row); +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "table apply update " + IB_ID_FMT " " IB_ID_FMT "\n", + index->table->id, index->id); + dtuple_print(stderr, old_row); + dtuple_print(stderr, row); + } +#endif /* ROW_LOG_APPLY_PRINT */ + } else { + old_row = NULL; + old_ext = NULL; + } + + big_rec_t* big_rec; + + error = btr_cur_pessimistic_update( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG + | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &cur_offsets, &offsets_heap, heap, &big_rec, + update, 0, NULL, 0, &mtr); + + if (big_rec) { + if (error == DB_SUCCESS) { + error = btr_store_big_rec_extern_fields( + index, btr_pcur_get_block(&pcur), + btr_pcur_get_rec(&pcur), cur_offsets, + big_rec, &mtr, BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + + while ((index = dict_table_get_next_index(index)) != NULL) { + if (error != DB_SUCCESS) { + break; + } + + if (index->type & DICT_FTS) { + continue; + } + + if (!row_upd_changes_ord_field_binary( + index, update, thr, old_row, NULL)) { + continue; + } + + mtr_commit(&mtr); + + entry = row_build_index_entry(old_row, old_ext, index, heap); + if (!entry) { + ut_ad(0); + return(DB_CORRUPTION); + } + + mtr_start(&mtr); + + if (ROW_FOUND != row_search_index_entry( + index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) { + ut_ad(0); + error = DB_CORRUPTION; + break; + } + + btr_cur_pessimistic_delete( + &error, FALSE, btr_pcur_get_btr_cur(&pcur), + BTR_CREATE_FLAG, RB_NONE, &mtr); + + if (error != DB_SUCCESS) { + break; + } + + mtr_commit(&mtr); + + entry = row_build_index_entry(row, NULL, index, heap); + error = row_ins_sec_index_entry_low( + BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, + BTR_MODIFY_TREE, index, offsets_heap, heap, + entry, trx_id, thr); + + mtr_start(&mtr); + } + +func_exit: + mtr_commit(&mtr); + if (error != DB_SUCCESS) { + goto err_exit; + } + + return(error); +} + +/******************************************************//** +Applies an operation to a table that was rebuilt. 
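+One ROW_T_INSERT, ROW_T_DELETE or ROW_T_UPDATE record is parsed
+from the buffer and replayed against the rebuilt table.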
+@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static __attribute__((nonnull, warn_unused_result)) +const mrec_t* +row_log_table_apply_op( +/*===================*/ + que_thr_t* thr, /*!< in: query graph */ + ulint trx_id_col, /*!< in: position of + DB_TRX_ID in old index */ + ulint new_trx_id_col, /*!< in: position of + DB_TRX_ID in new index */ + row_merge_dup_t* dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS + or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap + that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + ulint* offsets) /*!< in/out: work area + for parsing mrec */ +{ + const row_log_t*log = dup->index->online_log; + dict_index_t* new_index = dict_table_get_first_index(log->table); + ulint extra_size; + const mrec_t* next_mrec; + dtuple_t* old_pk; + row_ext_t* ext; + ulint ext_size; + + ut_ad(dict_index_is_clust(dup->index)); + ut_ad(dup->index->table != log->table); + + *error = DB_SUCCESS; + + /* 3 = 1 (op type) + 1 (ext_size) + at least 1 byte payload */ + if (mrec + 3 >= mrec_end) { + return(NULL); + } + + switch (*mrec++) { + default: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + case ROW_T_INSERT: + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } else { + ulint len; + const byte* db_trx_id + = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + *error = row_log_table_apply_insert( + thr, mrec, offsets, offsets_heap, + heap, dup, trx_read_trx_id(db_trx_id)); + } + break; + + case ROW_T_DELETE: + /* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */ + if (mrec + 4 >= mrec_end) { + return(NULL); + } + + extra_size = *mrec++; + ext_size = mach_read_from_2(mrec); + mrec += 2; + ut_ad(mrec < mrec_end); + + /* We assume extra_size < 0x100 for the PRIMARY KEY prefix. + For fixed-length PRIMARY key columns, it is 0. 
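+ This is the single old_pk_extra_size byte that was written by
+ row_log_table_delete().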
*/ + mrec += extra_size; + + rec_offs_set_n_fields(offsets, new_index->n_uniq + 1); + rec_init_offsets_temp(mrec, new_index, offsets); + next_mrec = mrec + rec_offs_data_size(offsets) + ext_size; + if (next_mrec > mrec_end) { + return(NULL); + } + + /* If there are external fields, retrieve those logged + prefix info and reconstruct the row_ext_t */ + if (ext_size) { + /* We use memcpy to avoid unaligned + access on some non-x86 platforms.*/ + ext = static_cast<row_ext_t*>( + mem_heap_dup(heap, + mrec + rec_offs_data_size(offsets), + ext_size)); + + byte* ext_start = reinterpret_cast<byte*>(ext); + + ulint ext_len = sizeof(*ext) + + (ext->n_ext - 1) * sizeof ext->len; + + ext->ext = reinterpret_cast<ulint*>(ext_start + ext_len); + ext_len += ext->n_ext * sizeof(*ext->ext); + + ext->buf = static_cast<byte*>(ext_start + ext_len); + } else { + ext = NULL; + } + + *error = row_log_table_apply_delete( + thr, new_trx_id_col, + mrec, offsets, offsets_heap, heap, + log->table, ext); + break; + + case ROW_T_UPDATE: + /* Logically, the log entry consists of the + (PRIMARY KEY,DB_TRX_ID) of the old value (converted + to the new primary key definition) followed by + the new value in the old table definition. If the + definition of the columns belonging to PRIMARY KEY + is not changed, the log will only contain + DB_TRX_ID,new_row. */ + + if (dup->index->online_log->same_pk) { + ut_ad(new_index->n_uniq == dup->index->n_uniq); + + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + + old_pk = dtuple_create(heap, new_index->n_uniq); + dict_index_copy_types( + old_pk, new_index, old_pk->n_fields); + + /* Copy the PRIMARY KEY fields from mrec to old_pk. */ + for (ulint i = 0; i < new_index->n_uniq; i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + } else { + /* We assume extra_size < 0x100 + for the PRIMARY KEY prefix. */ + mrec += *mrec + 1; + + if (mrec > mrec_end) { + return(NULL); + } + + /* Get offsets for PRIMARY KEY, + DB_TRX_ID, DB_ROLL_PTR. */ + rec_offs_set_n_fields(offsets, new_index->n_uniq + 2); + rec_init_offsets_temp(mrec, new_index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + if (next_mrec + 2 > mrec_end) { + return(NULL); + } + + /* Copy the PRIMARY KEY fields and + DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */ + old_pk = dtuple_create(heap, new_index->n_uniq + 2); + dict_index_copy_types(old_pk, new_index, + old_pk->n_fields); + + for (ulint i = 0; + i < dict_index_get_n_unique(new_index) + 2; + i++) { + const void* field; + ulint len; + dfield_t* dfield; + + ut_ad(!rec_offs_nth_extern(offsets, i)); + + field = rec_get_nth_field( + mrec, offsets, i, &len); + ut_ad(len != UNIV_SQL_NULL); + + dfield = dtuple_get_nth_field(old_pk, i); + dfield_set_data(dfield, field, len); + } + + mrec = next_mrec; + + /* Fetch the new value of the row as it was + in the old table definition. 
*/ + extra_size = *mrec++; + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_offs_set_n_fields(offsets, dup->index->n_fields); + rec_init_offsets_temp(mrec, dup->index, offsets); + + next_mrec = mrec + rec_offs_data_size(offsets); + + if (next_mrec > mrec_end) { + return(NULL); + } + } + + ut_ad(next_mrec <= mrec_end); + dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq); + + { + ulint len; + const byte* db_trx_id + = rec_get_nth_field( + mrec, offsets, trx_id_col, &len); + ut_ad(len == DATA_TRX_ID_LEN); + *error = row_log_table_apply_update( + thr, trx_id_col, new_trx_id_col, + mrec, offsets, offsets_heap, + heap, dup, trx_read_trx_id(db_trx_id), old_pk); + } + + break; + } + + mem_heap_empty(offsets_heap); + mem_heap_empty(heap); + return(next_mrec); +} + +/******************************************************//** +Applies operations to a table was rebuilt. +@return DB_SUCCESS, or error code on failure */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_log_table_apply_ops( +/*====================*/ + que_thr_t* thr, /*!< in: query graph */ + row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key + errors */ +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end = NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* heap; + mem_heap_t* offsets_heap; + ulint* offsets; + bool has_index_lock; + dict_index_t* index = const_cast<dict_index_t*>( + dup->index); + dict_table_t* new_table = index->online_log->table; + dict_index_t* new_index = dict_table_get_first_index( + new_table); + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + ut_max(dict_index_get_n_fields(index), + dict_index_get_n_unique(new_index) + 2); + const ulint trx_id_col = dict_col_get_clust_pos( + dict_table_get_sys_col(index->table, DATA_TRX_ID), index); + const ulint new_trx_id_col = dict_col_get_clust_pos( + dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index); + trx_t* trx = thr_get_trx(thr); + + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(trx->mysql_thd); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!dict_index_is_online_ddl(new_index)); + ut_ad(trx_id_col > 0); + ut_ad(trx_id_col != ULINT_UNDEFINED); + ut_ad(new_trx_id_col > 0); + ut_ad(new_trx_id_col != ULINT_UNDEFINED); + + UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end); + + offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets)); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + + heap = mem_heap_create(UNIV_PAGE_SIZE); + offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log->head.bytes == 0); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + if (dict_index_is_corrupted(index)) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + ut_ad(dict_index_is_online_ddl(index)); + + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + fprintf(stderr, "InnoDB: unexpected end of temporary file" + " for table %s\n", 
index->table_name); +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + ftruncate(index->online_log->fd, 0); +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + index->online_log->head.bytes = 0; + index->online_log->tail.bytes = 0; + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + ibool success; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + rw_lock_x_unlock(dict_index_get_lock(index)); + + log_free_check(); + + ut_ad(dict_index_is_online_ddl(index)); + + success = os_file_read_no_error_handling( + OS_FILE_FROM_FD(index->online_log->fd), + index->online_log->head.block, ofs, + srv_sort_buf_size); + + if (!success) { + fprintf(stderr, "InnoDB: unable to read temporary file" + " for table %s\n", index->table_name); + goto corruption; + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ +#ifdef FALLOC_FL_PUNCH_HOLE + /* Try to deallocate the space for the file on disk. + This should work on ext4 on Linux 2.6.39 and later, + and be ignored when the operation is unsupported. */ + fallocate(index->online_log->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + ofs, srv_buf_size); +#endif /* FALLOC_FL_PUNCH_HOLE */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + /* This read is not protected by index->online_log->mutex for + performance reasons. We will eventually notice any error that + was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. */ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + (&index->online_log->head.buf)[1] - mrec_end); + mrec = row_log_table_apply_op( + thr, trx_id_col, new_trx_id_col, + dup, &error, offsets_heap, heap, + index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. 
*/ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = mrec - mrec_end; + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. */ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. */ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + /* This read is not protected by index->online_log->mutex + for performance reasons. We will eventually notice any + error that was flagged by a DML thread. */ + error = index->online_log->error; + + if (error != DB_SUCCESS) { + goto func_exit; + } + + next_mrec = row_log_table_apply_op( + thr, trx_id_col, new_trx_id_col, + dup, &error, offsets_heap, heap, + mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. 
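+ No further operations can be buffered by other threads, so the
+ log has now been applied in full.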
*/ + goto all_done; + } + + mrec = NULL; +process_next_block: + rw_lock_x_lock(dict_index_get_lock(index)); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes += next_mrec - mrec; + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + mrec_end - mrec); + mrec_end += index->online_log->head.buf - mrec; + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + rw_lock_x_lock(dict_index_get_lock(index)); + } + + mem_heap_free(offsets_heap); + mem_heap_free(heap); + ut_free(offsets); + return(error); +} + +/******************************************************//** +Apply the row_log_table log to a table upon completing rebuild. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_table_apply( +/*================*/ + que_thr_t* thr, /*!< in: query graph */ + dict_table_t* old_table, + /*!< in: old table */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ +{ + dberr_t error; + dict_index_t* clust_index; + + thr_get_trx(thr)->error_key_num = 0; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + clust_index = dict_table_get_first_index(old_table); + + rw_lock_x_lock(dict_index_get_lock(clust_index)); + + if (!clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + /* This function should not be called unless + rebuilding a table online. Build in some fault + tolerance. */ + ut_ad(0); + error = DB_ERROR; + } else { + row_merge_dup_t dup = { + clust_index, table, + clust_index->online_log->col_map, 0 + }; + + error = row_log_table_apply_ops(thr, &dup); + } + + rw_lock_x_unlock(dict_index_get_lock(clust_index)); + return(error); +} + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. 
+@retval true if success, false if not */ +UNIV_INTERN +bool +row_log_allocate( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map)/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ +{ + byte* buf; + row_log_t* log; + ulint size; + + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(dict_index_is_clust(index) == !!table); + ut_ad(!table || index->table != table); + ut_ad(same_pk || table); + ut_ad(!table || col_map); + ut_ad(!add_cols || col_map); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + size = 2 * srv_sort_buf_size + sizeof *log; + buf = (byte*) os_mem_alloc_large(&size); + if (!buf) { + return(false); + } + + log = (row_log_t*) &buf[2 * srv_sort_buf_size]; + log->size = size; + log->fd = row_merge_file_create_low(); + if (log->fd < 0) { + os_mem_free_large(buf, size); + return(false); + } + mutex_create(index_online_log_key, &log->mutex, + SYNC_INDEX_ONLINE_LOG); + log->trx_rb = NULL; + log->table = table; + log->same_pk = same_pk; + log->add_cols = add_cols; + log->col_map = col_map; + log->error = DB_SUCCESS; + log->max_trx = 0; + log->head.block = buf; + log->tail.block = buf + srv_sort_buf_size; + log->tail.blocks = log->tail.bytes = 0; + log->head.blocks = log->head.bytes = 0; + dict_index_set_online_status(index, ONLINE_INDEX_CREATION); + index->online_log = log; + + /* While we might be holding an exclusive data dictionary lock + here, in row_log_abort_sec() we will not always be holding it. Use + atomic operations in both cases. */ + MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX); + + return(true); +} + +/******************************************************//** +Free the row log for an index that was being created online. */ +UNIV_INTERN +void +row_log_free( +/*=========*/ + row_log_t*& log) /*!< in,own: row log */ +{ + MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX); + + delete log->trx_rb; + row_merge_file_destroy_low(log->fd); + mutex_free(&log->mutex); + os_mem_free_large(log->head.block, log->size); + log = 0; +} + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +UNIV_INTERN +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ +{ + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION); +#ifdef UNIV_SYNC_DEBUG + ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED) + && mutex_own(&index->online_log->mutex)) + || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + return(index->online_log->max_trx); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. 
*/ +static __attribute__((nonnull)) +void +row_log_apply_op_low( +/*=================*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + enum row_op op, /*!< in: operation being applied */ + trx_id_t trx_id, /*!< in: transaction identifier */ + const dtuple_t* entry) /*!< in: row */ +{ + mtr_t mtr; + btr_cur_t cursor; + ulint* offsets = NULL; + + ut_ad(!dict_index_is_clust(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX) + == has_index_lock); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(!dict_index_is_corrupted(index)); + ut_ad(trx_id != 0 || op == ROW_OP_DELETE); + + mtr_start(&mtr); + + /* We perform the pessimistic variant of the operations if we + already hold index->lock exclusively. First, search the + record. The operation may already have been performed, + depending on when the row in the clustered index was + scanned. */ + btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, + has_index_lock + ? BTR_MODIFY_TREE + : BTR_MODIFY_LEAF, + &cursor, 0, __FILE__, __LINE__, + &mtr); + + ut_ad(dict_index_get_n_unique(index) > 0); + /* This test is somewhat similar to row_ins_must_modify_rec(), + but not identical for unique secondary indexes. */ + if (cursor.low_match >= dict_index_get_n_unique(index) + && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) { + /* We have a matching record. */ + bool exists = (cursor.low_match + == dict_index_get_n_fields(index)); +#ifdef UNIV_DEBUG + rec_t* rec = btr_cur_get_rec(&cursor); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec))); +#endif /* UNIV_DEBUG */ + + ut_ad(exists || dict_index_is_unique(index)); + + switch (op) { + case ROW_OP_DELETE: + if (!exists) { + /* The record was already deleted. */ + goto func_exit; + } + + if (btr_cur_optimistic_delete( + &cursor, BTR_CREATE_FLAG, &mtr)) { + *error = DB_SUCCESS; + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + __FILE__, __LINE__, &mtr); + + /* No other thread than the current one + is allowed to modify the index tree. + Thus, the record should still exist. */ + ut_ad(cursor.low_match + >= dict_index_get_n_fields(index)); + ut_ad(page_rec_is_user_rec( + btr_cur_get_rec(&cursor))); + } + + /* As there are no externally stored fields in + a secondary index record, the parameter + rb_ctx = RB_NONE will be ignored. */ + + btr_cur_pessimistic_delete( + error, FALSE, &cursor, + BTR_CREATE_FLAG, RB_NONE, &mtr); + break; + case ROW_OP_INSERT: + if (exists) { + /* The record already exists. There + is nothing to be inserted. */ + goto func_exit; + } + + if (dtuple_contains_null(entry)) { + /* The UNIQUE KEY columns match, but + there is a NULL value in the key, and + NULL!=NULL. */ + goto insert_the_rec; + } + + /* Duplicate key error */ + ut_ad(dict_index_is_unique(index)); + row_merge_dup_report(dup, entry->fields); + goto func_exit; + } + } else { + switch (op) { + rec_t* rec; + big_rec_t* big_rec; + case ROW_OP_DELETE: + /* The record does not exist. 
*/ + goto func_exit; + case ROW_OP_INSERT: + if (dict_index_is_unique(index) + && (cursor.up_match + >= dict_index_get_n_unique(index) + || cursor.low_match + >= dict_index_get_n_unique(index)) + && (!index->n_nullable + || !dtuple_contains_null(entry))) { + /* Duplicate key */ + row_merge_dup_report(dup, entry->fields); + goto func_exit; + } +insert_the_rec: + /* Insert the record. As we are inserting into + a secondary index, there cannot be externally + stored columns (!big_rec). */ + *error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, 0, NULL, &mtr); + ut_ad(!big_rec); + if (*error != DB_FAIL) { + break; + } + + if (!has_index_lock) { + /* This needs a pessimistic operation. + Lock the index tree exclusively. */ + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_search_to_nth_level( + index, 0, entry, PAGE_CUR_LE, + BTR_MODIFY_TREE, &cursor, 0, + __FILE__, __LINE__, &mtr); + } + + /* We already determined that the + record did not exist. No other thread + than the current one is allowed to + modify the index tree. Thus, the + record should still not exist. */ + + *error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG, + &cursor, &offsets, &offsets_heap, + const_cast<dtuple_t*>(entry), + &rec, &big_rec, + 0, NULL, &mtr); + ut_ad(!big_rec); + break; + } + mem_heap_empty(offsets_heap); + } + + if (*error == DB_SUCCESS && trx_id) { + page_update_max_trx_id(btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } + +func_exit: + mtr_commit(&mtr); +} + +/******************************************************//** +Applies an operation to a secondary index that was being created. +@return NULL on failure (mrec corruption) or when out of data; +pointer to next record on success */ +static __attribute__((nonnull, warn_unused_result)) +const mrec_t* +row_log_apply_op( +/*=============*/ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup, /*!< in/out: for reporting + duplicate key errors */ + dberr_t* error, /*!< out: DB_SUCCESS or error code */ + mem_heap_t* offsets_heap, /*!< in/out: memory heap for + allocating offsets; can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap for + allocating data tuples */ + bool has_index_lock, /*!< in: true if holding index->lock + in exclusive mode */ + const mrec_t* mrec, /*!< in: merge record */ + const mrec_t* mrec_end, /*!< in: end of buffer */ + ulint* offsets) /*!< in/out: work area for + rec_init_offsets_temp() */ + +{ + enum row_op op; + ulint extra_size; + ulint data_size; + ulint n_ext; + dtuple_t* entry; + trx_id_t trx_id; + + /* Online index creation is only used for secondary indexes. 
*/ + ut_ad(!dict_index_is_clust(index)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX) + == has_index_lock); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_is_corrupted(index)) { + *error = DB_INDEX_CORRUPT; + return(NULL); + } + + *error = DB_SUCCESS; + + if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) { + return(NULL); + } + + switch (*mrec) { + case ROW_OP_INSERT: + if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) { + return(NULL); + } + + op = static_cast<enum row_op>(*mrec++); + trx_id = trx_read_trx_id(mrec); + mrec += DATA_TRX_ID_LEN; + break; + case ROW_OP_DELETE: + op = static_cast<enum row_op>(*mrec++); + trx_id = 0; + break; + default: +corrupted: + ut_ad(0); + *error = DB_CORRUPTION; + return(NULL); + } + + extra_size = *mrec++; + + ut_ad(mrec < mrec_end); + + if (extra_size >= 0x80) { + /* Read another byte of extra_size. */ + + extra_size = (extra_size & 0x7f) << 8; + extra_size |= *mrec++; + } + + mrec += extra_size; + + if (mrec > mrec_end) { + return(NULL); + } + + rec_init_offsets_temp(mrec, index, offsets); + + if (rec_offs_any_extern(offsets)) { + /* There should never be any externally stored fields + in a secondary index, which is what online index + creation is used for. Therefore, the log file must be + corrupted. */ + goto corrupted; + } + + data_size = rec_offs_data_size(offsets); + + mrec += data_size; + + if (mrec > mrec_end) { + return(NULL); + } + + entry = row_rec_to_index_entry_low( + mrec - data_size, index, offsets, &n_ext, heap); + /* Online index creation is only implemented for secondary + indexes, which never contain off-page columns. */ + ut_ad(n_ext == 0); +#ifdef ROW_LOG_APPLY_PRINT + if (row_log_apply_print) { + fprintf(stderr, "apply " IB_ID_FMT " " TRX_ID_FMT " %u %u ", + index->id, trx_id, + unsigned (op), unsigned (has_index_lock)); + for (const byte* m = mrec - data_size; m < mrec; m++) { + fprintf(stderr, "%02x", *m); + } + putc('\n', stderr); + } +#endif /* ROW_LOG_APPLY_PRINT */ + row_log_apply_op_low(index, dup, error, offsets_heap, + has_index_lock, op, trx_id, entry); + return(mrec); +} + +/******************************************************//** +Applies operations to a secondary index that was being created. 
+@return DB_SUCCESS, or error code on failure */ +static __attribute__((nonnull)) +dberr_t +row_log_apply_ops( +/*==============*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: index */ + row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key + errors */ +{ + dberr_t error; + const mrec_t* mrec = NULL; + const mrec_t* next_mrec; + const mrec_t* mrec_end= NULL; /* silence bogus warning */ + const mrec_t* next_mrec_end; + mem_heap_t* offsets_heap; + mem_heap_t* heap; + ulint* offsets; + bool has_index_lock; + const ulint i = 1 + REC_OFFS_HEADER_SIZE + + dict_index_get_n_fields(index); + + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(*index->name == TEMP_INDEX_PREFIX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log); + UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end); + + offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets)); + offsets[0] = i; + offsets[1] = dict_index_get_n_fields(index); + + offsets_heap = mem_heap_create(UNIV_PAGE_SIZE); + heap = mem_heap_create(UNIV_PAGE_SIZE); + has_index_lock = true; + +next_block: + ut_ad(has_index_lock); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + ut_ad(index->online_log->head.bytes == 0); + + if (trx_is_interrupted(trx)) { + goto interrupted; + } + + if (dict_index_is_corrupted(index)) { + error = DB_INDEX_CORRUPT; + goto func_exit; + } + + if (UNIV_UNLIKELY(index->online_log->head.blocks + > index->online_log->tail.blocks)) { +unexpected_eof: + fprintf(stderr, "InnoDB: unexpected end of temporary file" + " for index %s\n", index->name + 1); +corruption: + error = DB_CORRUPTION; + goto func_exit; + } + + if (index->online_log->head.blocks + == index->online_log->tail.blocks) { + if (index->online_log->head.blocks) { +#ifdef HAVE_FTRUNCATE + /* Truncate the file in order to save space. */ + ftruncate(index->online_log->fd, 0); +#endif /* HAVE_FTRUNCATE */ + index->online_log->head.blocks + = index->online_log->tail.blocks = 0; + } + + next_mrec = index->online_log->tail.block; + next_mrec_end = next_mrec + index->online_log->tail.bytes; + + if (next_mrec_end == next_mrec) { + /* End of log reached. */ +all_done: + ut_ad(has_index_lock); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + error = DB_SUCCESS; + goto func_exit; + } + } else { + os_offset_t ofs; + ibool success; + + ofs = (os_offset_t) index->online_log->head.blocks + * srv_sort_buf_size; + + ut_ad(has_index_lock); + has_index_lock = false; + rw_lock_x_unlock(dict_index_get_lock(index)); + + log_free_check(); + + success = os_file_read_no_error_handling( + OS_FILE_FROM_FD(index->online_log->fd), + index->online_log->head.block, ofs, + srv_sort_buf_size); + + if (!success) { + fprintf(stderr, "InnoDB: unable to read temporary file" + " for index %s\n", index->name + 1); + goto corruption; + } + +#ifdef POSIX_FADV_DONTNEED + /* Each block is read exactly once. Free up the file cache. */ + posix_fadvise(index->online_log->fd, + ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ +#ifdef FALLOC_FL_PUNCH_HOLE + /* Try to deallocate the space for the file on disk. + This should work on ext4 on Linux 2.6.39 and later, + and be ignored when the operation is unsupported. 
*/ + fallocate(index->online_log->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + ofs, srv_buf_size); +#endif /* FALLOC_FL_PUNCH_HOLE */ + + next_mrec = index->online_log->head.block; + next_mrec_end = next_mrec + srv_sort_buf_size; + } + + if (mrec) { + /* A partial record was read from the previous block. + Copy the temporary buffer full, as we do not know the + length of the record. Parse subsequent records from + the bigger buffer index->online_log->head.block + or index->online_log->tail.block. */ + + ut_ad(mrec == index->online_log->head.buf); + ut_ad(mrec_end > mrec); + ut_ad(mrec_end < (&index->online_log->head.buf)[1]); + + memcpy((mrec_t*) mrec_end, next_mrec, + (&index->online_log->head.buf)[1] - mrec_end); + mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, index->online_log->head.buf, + (&index->online_log->head.buf)[1], offsets); + if (error != DB_SUCCESS) { + goto func_exit; + } else if (UNIV_UNLIKELY(mrec == NULL)) { + /* The record was not reassembled properly. */ + goto corruption; + } + /* The record was previously found out to be + truncated. Now that the parse buffer was extended, + it should proceed beyond the old end of the buffer. */ + ut_a(mrec > mrec_end); + + index->online_log->head.bytes = mrec - mrec_end; + next_mrec += index->online_log->head.bytes; + } + + ut_ad(next_mrec <= next_mrec_end); + /* The following loop must not be parsing the temporary + buffer, but head.block or tail.block. */ + + /* mrec!=NULL means that the next record starts from the + middle of the block */ + ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0)); + +#ifdef UNIV_DEBUG + if (next_mrec_end == index->online_log->head.block + + srv_sort_buf_size) { + /* If tail.bytes == 0, next_mrec_end can also be at + the end of tail.block. */ + if (index->online_log->tail.bytes == 0) { + ut_ad(next_mrec == next_mrec_end); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes == 0); + } else { + ut_ad(next_mrec == index->online_log->head.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks + > index->online_log->head.blocks); + } + } else if (next_mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes) { + ut_ad(next_mrec == index->online_log->tail.block + + index->online_log->head.bytes); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->head.bytes + <= index->online_log->tail.bytes); + } else { + ut_error; + } +#endif /* UNIV_DEBUG */ + + mrec_end = next_mrec_end; + + while (!trx_is_interrupted(trx)) { + mrec = next_mrec; + ut_ad(mrec < mrec_end); + + if (!has_index_lock) { + /* We are applying operations from a different + block than the one that is being written to. + We do not hold index->lock in order to + allow other threads to concurrently buffer + modifications. */ + ut_ad(mrec >= index->online_log->head.block); + ut_ad(mrec_end == index->online_log->head.block + + srv_sort_buf_size); + ut_ad(index->online_log->head.bytes + < srv_sort_buf_size); + + /* Take the opportunity to do a redo log + checkpoint if needed. */ + log_free_check(); + } else { + /* We are applying operations from the last block. + Do not allow other threads to buffer anything, + so that we can finally catch up and synchronize. 
*/ + ut_ad(index->online_log->head.blocks == 0); + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(mrec >= index->online_log->tail.block); + } + + next_mrec = row_log_apply_op( + index, dup, &error, offsets_heap, heap, + has_index_lock, mrec, mrec_end, offsets); + + if (error != DB_SUCCESS) { + goto func_exit; + } else if (next_mrec == next_mrec_end) { + /* The record happened to end on a block boundary. + Do we have more blocks left? */ + if (has_index_lock) { + /* The index will be locked while + applying the last block. */ + goto all_done; + } + + mrec = NULL; +process_next_block: + rw_lock_x_lock(dict_index_get_lock(index)); + has_index_lock = true; + + index->online_log->head.bytes = 0; + index->online_log->head.blocks++; + goto next_block; + } else if (next_mrec != NULL) { + ut_ad(next_mrec < next_mrec_end); + index->online_log->head.bytes += next_mrec - mrec; + } else if (has_index_lock) { + /* When mrec is within tail.block, it should + be a complete record, because we are holding + index->lock and thus excluding the writer. */ + ut_ad(index->online_log->tail.blocks == 0); + ut_ad(mrec_end == index->online_log->tail.block + + index->online_log->tail.bytes); + ut_ad(0); + goto unexpected_eof; + } else { + memcpy(index->online_log->head.buf, mrec, + mrec_end - mrec); + mrec_end += index->online_log->head.buf - mrec; + mrec = index->online_log->head.buf; + goto process_next_block; + } + } + +interrupted: + error = DB_INTERRUPTED; +func_exit: + if (!has_index_lock) { + rw_lock_x_lock(dict_index_get_lock(index)); + } + + switch (error) { + case DB_SUCCESS: + break; + case DB_INDEX_CORRUPT: + if (((os_offset_t) index->online_log->tail.blocks + 1) + * srv_sort_buf_size >= srv_online_max_size) { + /* The log file grew too big. */ + error = DB_ONLINE_LOG_TOO_BIG; + } + /* fall through */ + default: + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. */ + index->type |= DICT_CORRUPT; + } + + mem_heap_free(heap); + mem_heap_free(offsets_heap); + ut_free(offsets); + return(error); +} + +/******************************************************//** +Apply the row log to the index upon completing index creation. +@return DB_SUCCESS, or error code on failure */ +UNIV_INTERN +dberr_t +row_log_apply( +/*==========*/ + trx_t* trx, /*!< in: transaction (for checking if + the operation was interrupted) */ + dict_index_t* index, /*!< in/out: secondary index */ + struct TABLE* table) /*!< in/out: MySQL table + (for reporting duplicates) */ +{ + dberr_t error; + row_log_t* log; + row_merge_dup_t dup = { index, table, NULL, 0 }; + + ut_ad(dict_index_is_online_ddl(index)); + ut_ad(!dict_index_is_clust(index)); + + log_free_check(); + + rw_lock_x_lock(dict_index_get_lock(index)); + + if (!dict_table_is_corrupted(index->table)) { + error = row_log_apply_ops(trx, index, &dup); + } else { + error = DB_SUCCESS; + } + + if (error != DB_SUCCESS || dup.n_dup) { + ut_a(!dict_table_is_discarded(index->table)); + /* We set the flag directly instead of invoking + dict_set_corrupted_index_cache_only(index) here, + because the index is not "public" yet. 
*/ + index->type |= DICT_CORRUPT; + index->table->drop_aborted = TRUE; + + if (error == DB_SUCCESS) { + error = DB_DUPLICATE_KEY; + } + + dict_index_set_online_status(index, ONLINE_INDEX_ABORTED); + } else { + dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE); + } + + log = index->online_log; + index->online_log = NULL; + /* We could remove the TEMP_INDEX_PREFIX and update the data + dictionary to say that this index is complete, if we had + access to the .frm file here. If the server crashes before + all requested indexes have been created, this completed index + will be dropped. */ + rw_lock_x_unlock(dict_index_get_lock(index)); + + row_log_free(log); + + return(error); +} diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 244aa0a69f1..a509e2c5ca8 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2005, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -26,40 +26,18 @@ Completed by Sunny Bains and Marko Makela #include "row0merge.h" #include "row0ext.h" -#include "row0row.h" -#include "row0upd.h" +#include "row0log.h" #include "row0ins.h" #include "row0sel.h" -#include "dict0dict.h" -#include "dict0mem.h" -#include "dict0boot.h" #include "dict0crea.h" -#include "dict0load.h" -#include "btr0btr.h" -#include "mach0data.h" -#include "trx0rseg.h" -#include "trx0trx.h" -#include "trx0roll.h" -#include "trx0undo.h" #include "trx0purge.h" -#include "trx0rec.h" -#include "que0que.h" -#include "rem0cmp.h" -#include "read0read.h" -#include "os0file.h" #include "lock0lock.h" -#include "data0data.h" -#include "data0type.h" -#include "que0que.h" #include "pars0pars.h" -#include "mem0mem.h" -#include "log0log.h" #include "ut0sort.h" -#include "handler0alter.h" -#include "fts0fts.h" -#include "fts0types.h" -#include "fts0priv.h" #include "row0ftsort.h" +#include "row0import.h" +#include "handler0alter.h" +#include "ha_prototypes.h" /* Ignore posix_fadvise() on those platforms where it does not exist */ #if defined __WIN__ @@ -69,8 +47,6 @@ Completed by Sunny Bains and Marko Makela #ifdef UNIV_DEBUG /** Set these in order ot enable debug printout. */ /* @{ */ -/** Log the outcome of each row_merge_cmp() call, comparing records. */ -static ibool row_merge_print_cmp; /** Log each record read from temporary file. */ static ibool row_merge_print_read; /** Log each record write to temporary file. 
*/ @@ -86,39 +62,23 @@ static ibool row_merge_print_block_write; #endif /* UNIV_DEBUG */ /* Whether to disable file system cache */ -UNIV_INTERN char srv_disable_sort_file_cache; - -/********************************************************************//** -Read sorted file containing index data tuples and insert these data -tuples to the index -@return DB_SUCCESS or error number */ -static -ulint -row_merge_insert_index_tuples( -/*==========================*/ - trx_t* trx, /*!< in: transaction */ - dict_index_t* index, /*!< in: index */ - dict_table_t* table, /*!< in: new table */ - ulint zip_size,/*!< in: compressed page size of - the old table, or 0 if uncompressed */ - int fd, /*!< in: file descriptor */ - row_merge_block_t* block); /*!< in/out: file buffer */ +UNIV_INTERN char srv_disable_sort_file_cache; #ifdef UNIV_DEBUG /******************************************************//** Display a merge tuple. */ -static +static __attribute__((nonnull)) void row_merge_tuple_print( /*==================*/ FILE* f, /*!< in: output stream */ - const dfield_t* entry, /*!< in: tuple to print */ + const mtuple_t* entry, /*!< in: tuple to print */ ulint n_fields)/*!< in: number of fields in the tuple */ { ulint j; for (j = 0; j < n_fields; j++) { - const dfield_t* field = &entry[j]; + const dfield_t* field = &entry->fields[j]; if (dfield_is_null(field)) { fputs("\n NULL;", f); @@ -141,16 +101,54 @@ row_merge_tuple_print( #endif /* UNIV_DEBUG */ /******************************************************//** +Encode an index record. */ +static __attribute__((nonnull)) +void +row_merge_buf_encode( +/*=================*/ + byte** b, /*!< in/out: pointer to + current end of output buffer */ + const dict_index_t* index, /*!< in: index */ + const mtuple_t* entry, /*!< in: index fields + of the record to encode */ + ulint n_fields) /*!< in: number of fields + in the entry */ +{ + ulint size; + ulint extra_size; + + size = rec_get_converted_size_temp( + index, entry->fields, n_fields, &extra_size); + ut_ad(size >= extra_size); + + /* Encode extra_size + 1 */ + if (extra_size + 1 < 0x80) { + *(*b)++ = (byte) (extra_size + 1); + } else { + ut_ad((extra_size + 1) < 0x8000); + *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8)); + *(*b)++ = (byte) (extra_size + 1); + } + + rec_convert_dtuple_to_temp(*b + extra_size, index, + entry->fields, n_fields); + + *b += size; +} + +/******************************************************//** Allocate a sort buffer. 
@return own: sort buffer */ -static +static __attribute__((malloc, nonnull)) row_merge_buf_t* row_merge_buf_create_low( /*=====================*/ mem_heap_t* heap, /*!< in: heap where allocated */ dict_index_t* index, /*!< in: secondary index */ - ulint max_tuples, /*!< in: maximum number of data tuples */ - ulint buf_size) /*!< in: size of the buffer, in bytes */ + ulint max_tuples, /*!< in: maximum number of + data tuples */ + ulint buf_size) /*!< in: size of the buffer, + in bytes */ { row_merge_buf_t* buf; @@ -162,7 +160,7 @@ row_merge_buf_create_low( buf->heap = heap; buf->index = index; buf->max_tuples = max_tuples; - buf->tuples = static_cast<const dfield_t**>( + buf->tuples = static_cast<mtuple_t*>( ut_malloc(2 * max_tuples * sizeof *buf->tuples)); buf->tmp_tuples = buf->tuples + max_tuples; @@ -204,13 +202,11 @@ row_merge_buf_empty( /*================*/ row_merge_buf_t* buf) /*!< in,own: sort buffer */ { - ulint buf_size; + ulint buf_size = sizeof *buf; ulint max_tuples = buf->max_tuples; mem_heap_t* heap = buf->heap; dict_index_t* index = buf->index; - void* tuple = buf->tuples; - - buf_size = (sizeof *buf);; + mtuple_t* tuples = buf->tuples; mem_heap_empty(heap); @@ -218,7 +214,7 @@ row_merge_buf_empty( buf->heap = heap; buf->index = index; buf->max_tuples = max_tuples; - buf->tuples = static_cast<const dfield_t**>(tuple); + buf->tuples = tuples; buf->tmp_tuples = buf->tuples + max_tuples; return(buf); @@ -230,7 +226,7 @@ UNIV_INTERN void row_merge_buf_free( /*===============*/ - row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */ + row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */ { ut_free(buf->tuples); mem_heap_free(buf->heap); @@ -244,19 +240,18 @@ ulint row_merge_buf_add( /*==============*/ row_merge_buf_t* buf, /*!< in/out: sort buffer */ - dict_index_t* fts_index,/*!< fts index to be - created */ + dict_index_t* fts_index,/*!< in: fts index to be created */ + const dict_table_t* old_table,/*!< in: original table */ fts_psort_t* psort_info, /*!< in: parallel sort info */ - const dtuple_t* row, /*!< in: row in clustered index */ + const dtuple_t* row, /*!< in: table row */ const row_ext_t* ext, /*!< in: cache of externally stored column prefixes, or NULL */ doc_id_t* doc_id) /*!< in/out: Doc ID if we are creating FTS index */ - { ulint i; const dict_index_t* index; - dfield_t* entry; + mtuple_t* entry; dfield_t* field; const dict_field_t* ifield; ulint n_fields; @@ -267,9 +262,13 @@ row_merge_buf_add( ulint n_row_added = 0; if (buf->n_tuples >= buf->max_tuples) { - return(FALSE); + return(0); } + DBUG_EXECUTE_IF( + "ib_row_merge_buf_add_two", + if (buf->n_tuples >= 2) return(0);); + UNIV_PREFETCH_R(row->fields); /* If we are building FTS index, buf->index points to @@ -279,11 +278,9 @@ row_merge_buf_add( n_fields = dict_index_get_n_fields(index); - entry = static_cast<dfield_t*>( - mem_heap_alloc(buf->heap, n_fields * sizeof *entry)); - - buf->tuples[buf->n_tuples] = entry; - field = entry; + entry = &buf->tuples[buf->n_tuples]; + field = entry->fields = static_cast<dfield_t*>( + mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields)); data_size = 0; extra_size = UT_BITS_IN_BYTES(index->n_nullable); @@ -296,30 +293,13 @@ row_merge_buf_add( ulint col_no; ulint fixed_len; const dfield_t* row_field; - ibool col_adjusted; col = ifield->col; col_no = dict_col_get_no(col); - col_adjusted = FALSE; - - /* If we are creating a FTS index, a new Doc - ID column is being added, so we need to adjust - any column number positioned after this Doc ID */ - if 
(*doc_id > 0 - && DICT_TF2_FLAG_IS_SET(index->table, - DICT_TF2_FTS_ADD_DOC_ID) - && col_no > index->table->fts->doc_col) { - - ut_ad(index->table->fts); - - col_no--; - col_adjusted = TRUE; - } /* Process the Doc ID column */ if (*doc_id > 0 - && col_no == index->table->fts->doc_col - && !col_adjusted) { + && col_no == index->table->fts->doc_col) { fts_write_doc_id((byte*) &write_doc_id, *doc_id); /* Note: field->data now points to a value on the @@ -487,7 +467,7 @@ row_merge_buf_add( ulint extra; size = rec_get_converted_size_temp( - index, entry, n_fields, &extra); + index, entry->fields, n_fields, &extra); ut_ad(data_size + extra_size == size); ut_ad(extra_size == extra); @@ -500,12 +480,6 @@ row_merge_buf_add( of extra_size. */ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80); - /* The following assertion may fail if row_merge_block_t is - declared very small and a PRIMARY KEY is being created with - many prefix columns. In that case, the record may exceed the - page_zip_rec_needs_ext() limit. However, no further columns - will be moved to external storage until the record is inserted - to the clustered index B-tree. */ ut_ad(data_size < srv_sort_buf_size); /* Reserve one byte for the end marker of row_merge_block_t. */ @@ -517,7 +491,7 @@ row_merge_buf_add( buf->n_tuples++; n_row_added++; - field = entry; + field = entry->fields; /* Copy the data fields. */ @@ -530,118 +504,120 @@ row_merge_buf_add( /*************************************************************//** Report a duplicate key. */ -static +UNIV_INTERN void row_merge_dup_report( /*=================*/ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ const dfield_t* entry) /*!< in: duplicate index entry */ { - mrec_buf_t* buf; - const dtuple_t* tuple; - dtuple_t tuple_store; - const rec_t* rec; - const dict_index_t* index = dup->index; - ulint n_fields= dict_index_get_n_fields(index); - mem_heap_t* heap; - ulint* offsets; - ulint n_ext; - - if (dup->n_dup++) { + if (!dup->n_dup++) { /* Only report the first duplicate record, but count all duplicate records. */ - return; + innobase_fields_to_mysql(dup->table, dup->index, entry); } - - /* Convert the tuple to a record and then to MySQL format. */ - heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields) - * sizeof *offsets - + sizeof *buf); - - buf = static_cast<mrec_buf_t*>(mem_heap_alloc(heap, sizeof *buf)); - - tuple = dtuple_from_fields(&tuple_store, entry, n_fields); - n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0; - - rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext); - offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap); - - innobase_rec_to_mysql(dup->table, rec, index, offsets); - - mem_heap_free(heap); } /*************************************************************//** Compare two tuples. 
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */ -static +static __attribute__((warn_unused_result)) int row_merge_tuple_cmp( /*================*/ + ulint n_uniq, /*!< in: number of unique fields */ ulint n_field,/*!< in: number of fields */ - const dfield_t* a, /*!< in: first tuple to be compared */ - const dfield_t* b, /*!< in: second tuple to be compared */ - row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */ + const mtuple_t& a, /*!< in: first tuple to be compared */ + const mtuple_t& b, /*!< in: second tuple to be compared */ + row_merge_dup_t* dup) /*!< in/out: for reporting duplicates, + NULL if non-unique index */ { int cmp; - const dfield_t* field = a; + const dfield_t* af = a.fields; + const dfield_t* bf = b.fields; + ulint n = n_uniq; + + ut_ad(n_uniq > 0); + ut_ad(n_uniq <= n_field); /* Compare the fields of the tuples until a difference is found or we run out of fields to compare. If !cmp at the end, the tuples are equal. */ do { - cmp = cmp_dfield_dfield(a++, b++); - } while (!cmp && --n_field); + cmp = cmp_dfield_dfield(af++, bf++); + } while (!cmp && --n); - if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) { + if (cmp) { + return(cmp); + } + + if (dup) { /* Report a duplicate value error if the tuples are logically equal. NULL columns are logically inequal, although they are equal in the sorting order. Find out if any of the fields are NULL. */ - for (b = field; b != a; b++) { - if (dfield_is_null(b)) { - - goto func_exit; + for (const dfield_t* df = a.fields; df != af; df++) { + if (dfield_is_null(df)) { + goto no_report; } } - row_merge_dup_report(dup, field); + row_merge_dup_report(dup, a.fields); } -func_exit: +no_report: + /* The n_uniq fields were equal, but we compare all fields so + that we will get the same (internal) order as in the B-tree. */ + for (n = n_field - n_uniq + 1; --n; ) { + cmp = cmp_dfield_dfield(af++, bf++); + if (cmp) { + return(cmp); + } + } + + /* This should never be reached, except in a secondary index + when creating a secondary index and a PRIMARY KEY, and there + is a duplicate in the PRIMARY KEY that has not been detected + yet. Internally, an index must never contain duplicates. */ return(cmp); } /** Wrapper for row_merge_tuple_sort() to inject some more context to UT_SORT_FUNCTION_BODY(). -@param a array of tuples that being sorted -@param b aux (work area), same size as tuples[] -@param c lower bound of the sorting area, inclusive -@param d upper bound of the sorting area, inclusive */ -#define row_merge_tuple_sort_ctx(a,b,c,d) \ - row_merge_tuple_sort(n_field, dup, a, b, c, d) +@param tuples array of tuples that being sorted +@param aux work area, same size as tuples[] +@param low lower bound of the sorting area, inclusive +@param high upper bound of the sorting area, inclusive */ +#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \ + row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high) /** Wrapper for row_merge_tuple_cmp() to inject some more context to UT_SORT_FUNCTION_BODY(). @param a first tuple to be compared @param b second tuple to be compared @return 1, 0, -1 if a is greater, equal, less, respectively, than b */ -#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup) +#define row_merge_tuple_cmp_ctx(a,b) \ + row_merge_tuple_cmp(n_uniq, n_field, a, b, dup) /**********************************************************************//** Merge sort the tuple buffer in main memory. 
*/ -static +static __attribute__((nonnull(4,5))) void row_merge_tuple_sort( /*=================*/ + ulint n_uniq, /*!< in: number of unique fields */ ulint n_field,/*!< in: number of fields */ - row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */ - const dfield_t** tuples, /*!< in/out: tuples */ - const dfield_t** aux, /*!< in/out: work area */ + row_merge_dup_t* dup, /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ + mtuple_t* tuples, /*!< in/out: tuples */ + mtuple_t* aux, /*!< in/out: work area */ ulint low, /*!< in: lower bound of the sorting area, inclusive */ ulint high) /*!< in: upper bound of the sorting area, exclusive */ { + ut_ad(n_field > 0); + ut_ad(n_uniq <= n_field); + UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx, tuples, aux, low, high, row_merge_tuple_cmp_ctx); } @@ -653,9 +629,12 @@ void row_merge_buf_sort( /*===============*/ row_merge_buf_t* buf, /*!< in/out: sort buffer */ - row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */ + row_merge_dup_t* dup) /*!< in/out: reporter of duplicates + (NULL if non-unique index) */ { - row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup, + row_merge_tuple_sort(dict_index_get_n_unique(buf->index), + dict_index_get_n_fields(buf->index), + dup, buf->tuples, buf->tmp_tuples, 0, buf->n_tuples); } @@ -674,33 +653,11 @@ row_merge_buf_write( ulint n_fields= dict_index_get_n_fields(index); byte* b = &block[0]; - ulint i; - - for (i = 0; i < buf->n_tuples; i++) { - ulint size; - ulint extra_size; - const dfield_t* entry = buf->tuples[i]; - - size = rec_get_converted_size_temp( - index, entry, n_fields, &extra_size); - ut_ad(size >= extra_size); - - /* Encode extra_size + 1 */ - if (extra_size + 1 < 0x80) { - *b++ = (byte) (extra_size + 1); - } else { - ut_ad((extra_size + 1) < 0x8000); - *b++ = (byte) (0x80 | ((extra_size + 1) >> 8)); - *b++ = (byte) (extra_size + 1); - } - - ut_ad(b + size < &block[srv_sort_buf_size]); - - rec_convert_dtuple_to_temp(b + extra_size, index, - entry, n_fields); - - b += size; + for (ulint i = 0; i < buf->n_tuples; i++) { + const mtuple_t* entry = &buf->tuples[i]; + row_merge_buf_encode(&b, index, entry, n_fields); + ut_ad(b < &block[srv_sort_buf_size]); #ifdef UNIV_DEBUG if (row_merge_print_write) { fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu", @@ -759,36 +716,6 @@ row_merge_heap_create( return(heap); } -/**********************************************************************//** -Search an index object by name and column names. If several indexes match, -return the index with the max id. -@return matching index, NULL if not found */ -static -dict_index_t* -row_merge_dict_table_get_index( -/*===========================*/ - dict_table_t* table, /*!< in: table */ - const merge_index_def_t*index_def) /*!< in: index definition */ -{ - ulint i; - dict_index_t* index; - const char** column_names; - - column_names = static_cast<const char**>( - mem_alloc(index_def->n_fields * sizeof *column_names)); - - for (i = 0; i < index_def->n_fields; ++i) { - column_names[i] = index_def->fields[i].field_name; - } - - index = dict_table_get_index_by_max_id( - table, index_def->name, column_names, index_def->n_fields); - - mem_free((void*) column_names); - - return(index); -} - /********************************************************************//** Read a merge block from the file system. 
@return TRUE if request was successful, FALSE if fail */ @@ -854,10 +781,10 @@ row_merge_write( os_offset_t ofs = buf_len * (os_offset_t) offset; ibool ret; - ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len); - DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE);); + ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len); + #ifdef UNIV_DEBUG if (row_merge_print_block_write) { fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n", @@ -877,7 +804,7 @@ row_merge_write( /********************************************************************//** Read a merge record. @return pointer to next record, or NULL on I/O error or end of list */ -UNIV_INTERN __attribute__((nonnull)) +UNIV_INTERN const byte* row_merge_read_rec( /*===============*/ @@ -953,7 +880,7 @@ err_exit: case. */ avail_size = &block[srv_sort_buf_size] - b; - + ut_ad(avail_size < sizeof *buf); memcpy(*buf, b, avail_size); if (!row_merge_read(fd, ++(*foffs), block)) { @@ -1193,46 +1120,12 @@ row_merge_write_eof( return(&block[0]); } -/*************************************************************//** -Compare two merge records. -@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */ -UNIV_INTERN -int -row_merge_cmp( -/*==========*/ - const mrec_t* mrec1, /*!< in: first merge - record to be compared */ - const mrec_t* mrec2, /*!< in: second merge - record to be compared */ - const ulint* offsets1, /*!< in: first record offsets */ - const ulint* offsets2, /*!< in: second record offsets */ - const dict_index_t* index, /*!< in: index */ - ibool* null_eq) /*!< out: set to TRUE if - found matching null values */ -{ - int cmp; - - cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index, - null_eq); - -#ifdef UNIV_DEBUG - if (row_merge_print_cmp) { - fputs("row_merge_cmp1 ", stderr); - rec_print_comp(stderr, mrec1, offsets1); - fputs("\nrow_merge_cmp2 ", stderr); - rec_print_comp(stderr, mrec2, offsets2); - fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp); - } -#endif /* UNIV_DEBUG */ - - return(cmp); -} /********************************************************************//** Reads clustered index of the table and create temporary files containing the index entries for the indexes to be built. 
@return DB_SUCCESS or error */ -static __attribute__((nonnull)) -ulint +static __attribute__((nonnull(1,2,3,4,6,9,10,16), warn_unused_result)) +dberr_t row_merge_read_clustered_index( /*===========================*/ trx_t* trx, /*!< in: transaction */ @@ -1243,23 +1136,40 @@ row_merge_read_clustered_index( const dict_table_t* new_table,/*!< in: table where indexes are created; identical to old_table unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ dict_index_t** index, /*!< in: indexes to be created */ dict_index_t* fts_sort_idx, - /*!< in: indexes to be created */ - fts_psort_t* psort_info, /*!< in: parallel sort info */ + /*!< in: full-text index to be created, + or NULL */ + fts_psort_t* psort_info, + /*!< in: parallel sort info for + fts_sort_idx creation, or NULL */ merge_file_t* files, /*!< in: temporary files */ + const ulint* key_numbers, + /*!< in: MySQL key numbers to create */ ulint n_index,/*!< in: number of indexes to create */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, + /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence,/*!< in/out: autoinc sequence */ row_merge_block_t* block) /*!< in/out: file buffer */ { dict_index_t* clust_index; /* Clustered index */ mem_heap_t* row_heap; /* Heap memory to create - clustered index records */ + clustered index tuples */ row_merge_buf_t** merge_buf; /* Temporary list for records*/ - btr_pcur_t pcur; /* Persistent cursor on the - clustered index */ + btr_pcur_t pcur; /* Cursor on the clustered + index */ mtr_t mtr; /* Mini transaction */ - ulint err = DB_SUCCESS;/* Return code */ - ulint i; + dberr_t err = DB_SUCCESS;/* Return code */ ulint n_nonnull = 0; /* number of columns changed to NOT NULL */ ulint* nonnull = NULL; /* NOT NULL columns */ @@ -1271,13 +1181,10 @@ row_merge_read_clustered_index( ibool fts_pll_sort = FALSE; ib_int64_t sig_count = 0; - trx->op_info = "reading clustered index"; + ut_ad((old_table == new_table) == !col_map); + ut_ad(!add_cols || col_map); - ut_ad(trx); - ut_ad(old_table); - ut_ad(new_table); - ut_ad(index); - ut_ad(files); + trx->op_info = "reading clustered index"; #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n"); @@ -1288,8 +1195,7 @@ row_merge_read_clustered_index( merge_buf = static_cast<row_merge_buf_t**>( mem_alloc(n_index * sizeof *merge_buf)); - - for (i = 0; i < n_index; i++) { + for (ulint i = 0; i < n_index; i++) { if (index[i]->type & DICT_FTS) { /* We are building a FT index, make sure @@ -1301,14 +1207,14 @@ row_merge_read_clustered_index( merge_buf[i] = row_merge_buf_create(fts_sort_idx); add_doc_id = DICT_TF2_FLAG_IS_SET( - old_table, DICT_TF2_FTS_ADD_DOC_ID); + new_table, DICT_TF2_FTS_ADD_DOC_ID); /* If Doc ID does not exist in the table itself, fetch the first FTS Doc ID */ if (add_doc_id) { fts_get_next_doc_id( (dict_table_t*) new_table, - &doc_id); + &doc_id); ut_ad(doc_id > 0); } @@ -1329,35 +1235,34 @@ row_merge_read_clustered_index( clust_index = dict_table_get_first_index(old_table); btr_pcur_open_at_index_side( - TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); - - if (UNIV_UNLIKELY(old_table != new_table)) { - ulint n_cols = dict_table_get_n_cols(old_table); - - /* A primary key will be created. 
Identify the - columns that were flagged NOT NULL in the new table, - so that we can quickly check that the records in the - (old) clustered index do not violate the added NOT - NULL constraints. */ + true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); - if (!fts_sort_idx) { - ut_a(n_cols == dict_table_get_n_cols(new_table)); - } + if (old_table != new_table) { + /* The table is being rebuilt. Identify the columns + that were flagged NOT NULL in the new table, so that + we can quickly check that the records in the old table + do not violate the added NOT NULL constraints. */ nonnull = static_cast<ulint*>( - mem_alloc(n_cols * sizeof *nonnull)); + mem_alloc(dict_table_get_n_cols(new_table) + * sizeof *nonnull)); - for (i = 0; i < n_cols; i++) { + for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) { if (dict_table_get_nth_col(old_table, i)->prtype & DATA_NOT_NULL) { + continue; + } + const ulint j = col_map[i]; + + if (j == ULINT_UNDEFINED) { + /* The column was dropped. */ continue; } - if (dict_table_get_nth_col(new_table, i)->prtype + if (dict_table_get_nth_col(new_table, j)->prtype & DATA_NOT_NULL) { - - nonnull[n_nonnull++] = i; + nonnull[n_nonnull++] = j; } } @@ -1373,81 +1278,221 @@ row_merge_read_clustered_index( for (;;) { const rec_t* rec; ulint* offsets; - dtuple_t* row = NULL; + const dtuple_t* row; row_ext_t* ext; - ibool has_next = TRUE; + page_cur_t* cur = btr_pcur_get_page_cur(&pcur); - btr_pcur_move_to_next_on_page(&pcur); + page_cur_move_to_next(cur); - /* When switching pages, commit the mini-transaction - in order to release the latch on the old page. */ - - if (btr_pcur_is_after_last_on_page(&pcur)) { + if (page_cur_is_after_last(cur)) { if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { err = DB_INTERRUPTED; trx->error_key_num = 0; goto func_exit; } - /* Store the cursor position on the last user - record on the page. */ - btr_pcur_move_to_prev_on_page(&pcur); - /* Leaf pages must never be empty, unless - this is the only page in the index tree. */ - ut_ad(btr_pcur_is_on_user_rec(&pcur) - || buf_block_get_page_no( - btr_pcur_get_block(&pcur)) - == clust_index->page); - - btr_pcur_store_position(&pcur, &mtr); - mtr_commit(&mtr); - mtr_start(&mtr); - /* Restore position on the record, or its - predecessor if the record was purged - meanwhile. */ - btr_pcur_restore_position(BTR_SEARCH_LEAF, - &pcur, &mtr); - /* Move to the successor of the original record. */ - has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr); + if (online && old_table != new_table) { + err = row_log_table_get_error(clust_index); + if (err != DB_SUCCESS) { + trx->error_key_num = 0; + goto func_exit; + } + } +#ifdef DBUG_OFF +# define dbug_run_purge false +#else /* DBUG_OFF */ + bool dbug_run_purge = false; +#endif /* DBUG_OFF */ + DBUG_EXECUTE_IF( + "ib_purge_on_create_index_page_switch", + dbug_run_purge = true;); + + if (dbug_run_purge + || rw_lock_get_waiters( + dict_index_get_lock(clust_index))) { + /* There are waiters on the clustered + index tree lock, likely the purge + thread. Store and restore the cursor + position, and yield so that scanning a + large table will not starve other + threads. */ + + /* Store the cursor position on the last user + record on the page. */ + btr_pcur_move_to_prev_on_page(&pcur); + /* Leaf pages must never be empty, unless + this is the only page in the index tree. 
*/ + ut_ad(btr_pcur_is_on_user_rec(&pcur) + || buf_block_get_page_no( + btr_pcur_get_block(&pcur)) + == clust_index->page); + + btr_pcur_store_position(&pcur, &mtr); + mtr_commit(&mtr); + + if (dbug_run_purge) { + /* This is for testing + purposes only (see + DBUG_EXECUTE_IF above). We + signal the purge thread and + hope that the purge batch will + complete before we execute + btr_pcur_restore_position(). */ + trx_purge_run(); + os_thread_sleep(1000000); + } + + /* Give the waiters a chance to proceed. */ + os_thread_yield(); + + mtr_start(&mtr); + /* Restore position on the record, or its + predecessor if the record was purged + meanwhile. */ + btr_pcur_restore_position( + BTR_SEARCH_LEAF, &pcur, &mtr); + /* Move to the successor of the + original record. */ + if (!btr_pcur_move_to_next_user_rec( + &pcur, &mtr)) { +end_of_index: + row = NULL; + mtr_commit(&mtr); + mem_heap_free(row_heap); + if (nonnull) { + mem_free(nonnull); + } + goto write_buffers; + } + } else { + ulint next_page_no; + buf_block_t* block; + + next_page_no = btr_page_get_next( + page_cur_get_page(cur), &mtr); + + if (next_page_no == FIL_NULL) { + goto end_of_index; + } + + block = page_cur_get_block(cur); + block = btr_block_get( + buf_block_get_space(block), + buf_block_get_zip_size(block), + next_page_no, BTR_SEARCH_LEAF, + clust_index, &mtr); + + btr_leaf_page_release(page_cur_get_block(cur), + BTR_SEARCH_LEAF, &mtr); + page_cur_set_before_first(block, cur); + page_cur_move_to_next(cur); + + ut_ad(!page_cur_is_after_last(cur)); + } } - if (UNIV_LIKELY(has_next)) { - rec = btr_pcur_get_rec(&pcur); - offsets = rec_get_offsets(rec, clust_index, NULL, - ULINT_UNDEFINED, &row_heap); + rec = page_cur_get_rec(cur); + + offsets = rec_get_offsets(rec, clust_index, NULL, + ULINT_UNDEFINED, &row_heap); + + if (online && new_table != old_table) { + /* When rebuilding the table online, perform a + REPEATABLE READ, so that row_log_table_apply() + will not see a newer state of the table when + applying the log. This is mainly to prevent + false duplicate key errors, because the log + will identify records by the PRIMARY KEY. */ + ut_ad(trx->read_view); + + if (!read_view_sees_trx_id( + trx->read_view, + row_get_rec_trx_id( + rec, clust_index, offsets))) { + rec_t* old_vers; + + row_vers_build_for_consistent_read( + rec, &mtr, clust_index, &offsets, + trx->read_view, &row_heap, + row_heap, &old_vers); + + rec = old_vers; + + if (!rec) { + continue; + } + } - /* Skip delete marked records. */ if (rec_get_deleted_flag( - rec, dict_table_is_comp(old_table))) { + rec, + dict_table_is_comp(old_table))) { + /* This record was deleted in the latest + committed version, or it was deleted and + then reinserted-by-update before purge + kicked in. Skip it. */ continue; } - srv_n_rows_inserted++; + ut_ad(!rec_offs_any_null_extern(rec, offsets)); + } else if (rec_get_deleted_flag( + rec, dict_table_is_comp(old_table))) { + /* Skip delete-marked records. + + Skipping delete-marked records will make the + created indexes unuseable for transactions + whose read views were created before the index + creation completed, but preserving the history + would make it tricky to detect duplicate + keys. */ + continue; + } else if (UNIV_LIKELY_NULL(rec_offs_any_null_extern( + rec, offsets))) { + /* This is essentially a READ UNCOMMITTED to + fetch the most recent version of the record. 
*/ +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + trx_id_t trx_id; + ulint trx_id_offset; + + /* It is possible that the record was + just inserted and the off-page columns + have not yet been written. We will + ignore the record if this is the case, + because it should be covered by the + index->info.online log in that case. */ + + trx_id_offset = clust_index->trx_id_offset; + if (!trx_id_offset) { + trx_id_offset = row_get_trx_id_offset( + clust_index, offsets); + } - /* Build a row based on the clustered index. */ + trx_id = trx_read_trx_id(rec + trx_id_offset); + ut_a(trx_rw_is_active(trx_id, NULL)); + ut_a(trx_undo_trx_id_is_insert(rec + trx_id_offset)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - row = row_build(ROW_COPY_POINTERS, clust_index, - rec, offsets, - new_table, &ext, row_heap); + /* When !online, we are holding an X-lock on + old_table, preventing any inserts. */ + ut_ad(online); + continue; + } - if (UNIV_LIKELY_NULL(nonnull)) { - for (i = 0; i < n_nonnull; i++) { - dfield_t* field - = &row->fields[nonnull[i]]; - dtype_t* field_type - = dfield_get_type(field); + /* Build a row based on the clustered index. */ - ut_a(!(field_type->prtype - & DATA_NOT_NULL)); + row = row_build(ROW_COPY_POINTERS, clust_index, + rec, offsets, new_table, + add_cols, col_map, &ext, row_heap); + ut_ad(row); - if (dfield_is_null(field)) { - err = DB_PRIMARY_KEY_IS_NULL; - trx->error_key_num = 0; - goto func_exit; - } + for (ulint i = 0; i < n_nonnull; i++) { + const dfield_t* field = &row->fields[nonnull[i]]; - field_type->prtype |= DATA_NOT_NULL; - } + ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL); + + if (dfield_is_null(field)) { + err = DB_INVALID_NULL; + trx->error_key_num = 0; + goto func_exit; } } @@ -1458,19 +1503,72 @@ row_merge_read_clustered_index( doc_id = 0; } + if (add_autoinc != ULINT_UNDEFINED) { + + ut_ad(add_autoinc + < dict_table_get_n_user_cols(new_table)); + + const dfield_t* dfield; + + dfield = dtuple_get_nth_field(row, add_autoinc); + if (dfield_is_null(dfield)) { + goto write_buffers; + } + + const dtype_t* dtype = dfield_get_type(dfield); + byte* b = static_cast<byte*>(dfield_get_data(dfield)); + + if (sequence.eof()) { + err = DB_ERROR; + trx->error_key_num = 0; + + ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_AUTOINC_READ_FAILED, "[NULL]"); + + goto func_exit; + } + + ulonglong value = sequence++; + + switch (dtype_get_mtype(dtype)) { + case DATA_INT: { + ibool usign; + ulint len = dfield_get_len(dfield); + + usign = dtype_get_prtype(dtype) & DATA_UNSIGNED; + mach_write_ulonglong(b, value, len, usign); + + break; + } + + case DATA_FLOAT: + mach_float_write( + b, static_cast<float>(value)); + break; + + case DATA_DOUBLE: + mach_double_write( + b, static_cast<double>(value)); + break; + + default: + ut_ad(0); + } + } + +write_buffers: /* Build all entries for all the indexes to be created in a single scan of the clustered index. 
*/ - for (i = 0; i < n_index; i++) { + for (ulint i = 0; i < n_index; i++) { row_merge_buf_t* buf = merge_buf[i]; merge_file_t* file = &files[i]; - const dict_index_t* index = buf->index; ulint rows_added = 0; if (UNIV_LIKELY (row && (rows_added = row_merge_buf_add( - buf, fts_index, psort_info, - row, ext, &doc_id)))) { + buf, fts_index, old_table, + psort_info, row, ext, &doc_id)))) { /* If we are creating FTS index, a single row can generate more @@ -1483,35 +1581,60 @@ row_merge_read_clustered_index( continue; } - if ((!row || !doc_id) - && index->type & DICT_FTS) { + if ((buf->index->type & DICT_FTS) + && (!row || !doc_id)) { continue; } /* The buffer must be sufficiently large - to hold at least one record. */ - ut_ad(buf->n_tuples || !has_next); + to hold at least one record. It may only + be empty when we reach the end of the + clustered index. row_merge_buf_add() + must not have been called in this loop. */ + ut_ad(buf->n_tuples || row == NULL); /* We have enough data tuples to form a block. Sort them and write to disk. */ if (buf->n_tuples) { - if (dict_index_is_unique(index)) { - row_merge_dup_t dup; - dup.index = buf->index; - dup.table = table; - dup.n_dup = 0; + if (dict_index_is_unique(buf->index)) { + row_merge_dup_t dup = { + buf->index, table, col_map, 0}; row_merge_buf_sort(buf, &dup); if (dup.n_dup) { err = DB_DUPLICATE_KEY; - trx->error_key_num = i; - goto func_exit; + trx->error_key_num + = key_numbers[i]; + break; } } else { row_merge_buf_sort(buf, NULL); } + } else if (online && new_table == old_table) { + /* Note the newest transaction that + modified this index when the scan was + completed. We prevent older readers + from accessing this index, to ensure + read consistency. */ + + trx_id_t max_trx_id; + + ut_a(row == NULL); + rw_lock_x_lock( + dict_index_get_lock(buf->index)); + ut_a(dict_index_get_online_status(buf->index) + == ONLINE_INDEX_CREATION); + + max_trx_id = row_log_get_max_trx(buf->index); + + if (max_trx_id > buf->index->trx_id) { + buf->index->trx_id = max_trx_id; + } + + rw_lock_x_unlock( + dict_index_get_lock(buf->index)); } row_merge_buf_write(buf, file, block); @@ -1520,7 +1643,7 @@ row_merge_read_clustered_index( block)) { err = DB_OUT_OF_FILE_SPACE; trx->error_key_num = i; - goto func_exit; + break; } UNIV_MEM_INVALID(&block[0], srv_sort_buf_size); @@ -1533,14 +1656,11 @@ row_merge_read_clustered_index( if (UNIV_UNLIKELY (!(rows_added = row_merge_buf_add( - buf, fts_index, psort_info, row, - ext, &doc_id)))) { + buf, fts_index, old_table, + psort_info, row, ext, + &doc_id)))) { /* An empty buffer should have enough - room for at least one record. - TODO: for FTS index building, we'll - need to prepared for coping with very - large text/blob data in a single row - that could fill up the merge file */ + room for at least one record. 
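The per-index loop above keeps adding tuples to an in-memory buffer; when the buffer fills, it is sorted (reporting duplicates for unique indexes via row_merge_dup_t), written out as one sorted run with row_merge_buf_write(), and the tuple that did not fit is re-added to the emptied buffer. A self-contained sketch of that fill/sort/flush pattern over plain integers; RunWriter and its members are illustrative names only, not InnoDB structures.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

/* Collect keys into a bounded buffer; whenever it fills up, sort it and
emit one sorted run, then carry on with the key that did not fit.
Mirrors row_merge_buf_sort()/row_merge_buf_write() in spirit only. */
class RunWriter {
public:
        explicit RunWriter(std::size_t capacity) : capacity_(capacity) {}

        void add(int key)
        {
                if (buf_.size() == capacity_) {
                        flush();        /* buffer full: sort and write a run */
                }
                /* the key goes into the (possibly just emptied) buffer */
                buf_.push_back(key);
        }

        void flush()
        {
                if (buf_.empty()) {
                        return;
                }
                std::sort(buf_.begin(), buf_.end());
                /* A unique index would report a duplicate here if two
                adjacent keys compared equal (cf. row_merge_dup_t). */
                runs_.push_back(buf_);
                buf_.clear();
        }

        std::size_t n_runs() const { return runs_.size(); }

private:
        std::size_t                     capacity_;
        std::vector<int>                buf_;
        std::vector<std::vector<int>>   runs_;
};

int main()
{
        RunWriter w(3);
        int keys[] = {5, 1, 4, 2, 9, 7, 3};
        for (int k : keys) {
                w.add(k);
        }
        w.flush();      /* final partial run, like the end of the scan */
        std::printf("%zu sorted runs\n", w.n_runs());   /* -> 3 */
        return 0;
}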
*/ ut_error; } @@ -1548,27 +1668,40 @@ row_merge_read_clustered_index( } } - mem_heap_empty(row_heap); + if (row == NULL) { + goto all_done; + } - if (UNIV_UNLIKELY(!has_next)) { + if (err != DB_SUCCESS) { goto func_exit; } + + mem_heap_empty(row_heap); } func_exit: + mtr_commit(&mtr); + mem_heap_free(row_heap); + + if (nonnull) { + mem_free(nonnull); + } + +all_done: #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n"); #endif if (fts_pll_sort) { - for (i = 0; i < fts_sort_pll_degree; i++) { + for (ulint i = 0; i < fts_sort_pll_degree; i++) { psort_info[i].state = FTS_PARENT_COMPLETE; } wait_again: os_event_wait_time_low(fts_parallel_sort_event, 1000000, sig_count); - for (i = 0; i < fts_sort_pll_degree; i++) { - if (psort_info[i].child_status != FTS_CHILD_COMPLETE) { + for (ulint i = 0; i < fts_sort_pll_degree; i++) { + if (psort_info[i].child_status != FTS_CHILD_COMPLETE + && psort_info[i].child_status != FTS_CHILD_EXITING) { sig_count = os_event_reset( fts_parallel_sort_event); goto wait_again; @@ -1579,17 +1712,7 @@ wait_again: #ifdef FTS_INTERNAL_DIAG_PRINT DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n"); #endif - - btr_pcur_close(&pcur); - mtr_commit(&mtr); - mem_heap_free(row_heap); - - if (UNIV_LIKELY_NULL(nonnull)) { - mem_free(nonnull); - } - - - for (i = 0; i < n_index; i++) { + for (ulint i = 0; i < n_index; i++) { row_merge_buf_free(merge_buf[i]); } @@ -1597,10 +1720,13 @@ wait_again: mem_free(merge_buf); + btr_pcur_close(&pcur); + /* Update the next Doc ID we used. Table should be locked, so no concurrent DML */ if (max_doc_id) { - fts_update_next_doc_id(new_table, old_table->name, max_doc_id); + fts_update_next_doc_id( + 0, new_table, old_table->name, max_doc_id); } trx->op_info = ""; @@ -1609,24 +1735,20 @@ wait_again: } /** Write a record via buffer 2 and read the next record to buffer N. -@param M FTS merge info structure -@param N index into array of merge info structure -@param INDEX the FTS index */ - - -/** Write a record via buffer 2 and read the next record to buffer N. @param N number of the buffer (0 or 1) +@param INDEX record descriptor @param AT_END statement to execute at end of input */ -#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \ +#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \ do { \ - b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], &buf[2], b2, \ + b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \ + &buf[2], b2, \ of->fd, &of->offset, \ mrec##N, offsets##N); \ if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \ goto corrupt; \ } \ - b##N = row_merge_read_rec(&block[N * srv_sort_buf_size], &buf[N], \ - b##N, index, \ + b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\ + &buf[N], b##N, INDEX, \ file->fd, foffs##N, \ &mrec##N, offsets##N); \ if (UNIV_UNLIKELY(!b##N)) { \ @@ -1640,11 +1762,12 @@ wait_again: /*************************************************************//** Merge two blocks of records on disk and write a bigger block. 
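The wait_again loop a little further up is a standard fan-in handshake: the parent flags FTS_PARENT_COMPLETE, then sleeps on an event and re-checks until every tokenizer thread reports FTS_CHILD_COMPLETE (or FTS_CHILD_EXITING). A rough standard-C++ analogue of that coordination shape using a condition variable; the thread count, flags and names here are invented for the sketch.

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

/* Parent sets 'parent_done', then waits until every worker has flagged
completion. Loosely mirrors the FTS_PARENT_COMPLETE / FTS_CHILD_COMPLETE
handshake in the sort code above. */
int main()
{
        const int                n_workers = 4;
        std::mutex               m;
        std::condition_variable  cv;
        bool                     parent_done = false;
        std::vector<bool>        child_done(n_workers, false);

        std::vector<std::thread> workers;
        for (int i = 0; i < n_workers; i++) {
                workers.emplace_back([&, i] {
                        std::unique_lock<std::mutex> lk(m);
                        cv.wait(lk, [&] { return parent_done; });
                        child_done[i] = true;   /* report completion */
                        cv.notify_all();
                });
        }

        {
                std::unique_lock<std::mutex> lk(m);
                parent_done = true;             /* "FTS_PARENT_COMPLETE" */
                cv.notify_all();
                cv.wait(lk, [&] {               /* the "wait_again" re-check */
                        for (bool d : child_done) {
                                if (!d) return false;
                        }
                        return true;
                });
        }

        for (std::thread& t : workers) {
                t.join();
        }
        std::printf("all %d workers complete\n", n_workers);
        return 0;
}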
@return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_merge_blocks( /*=============*/ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ const merge_file_t* file, /*!< in: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ @@ -1652,20 +1775,18 @@ row_merge_blocks( source list in the file */ ulint* foffs1, /*!< in/out: offset of second source list in the file */ - merge_file_t* of, /*!< in/out: output file */ - struct TABLE* table) /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ + merge_file_t* of) /*!< in/out: output file */ { mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */ mrec_buf_t* buf; /*!< buffer for handling split mrec in block[] */ const byte* b0; /*!< pointer to block[0] */ - const byte* b1; /*!< pointer to block[1] */ - byte* b2; /*!< pointer to block[2] */ + const byte* b1; /*!< pointer to block[srv_sort_buf_size] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */ - const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */ + const mrec_t* mrec1; /*!< merge rec, points to + block[srv_sort_buf_size] or buf[1] */ ulint* offsets0;/* offsets of mrec0 */ ulint* offsets1;/* offsets of mrec1 */ @@ -1680,7 +1801,7 @@ row_merge_blocks( } #endif /* UNIV_DEBUG */ - heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1); + heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1); /* Write a record and read the next record. Split the output file in two halves, which can be merged on the following pass. */ @@ -1696,10 +1817,13 @@ corrupt: b1 = &block[srv_sort_buf_size]; b2 = &block[2 * srv_sort_buf_size]; - b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd, - foffs0, &mrec0, offsets0); - b1 = row_merge_read_rec(&block[srv_sort_buf_size], &buf[srv_sort_buf_size], b1, index, file->fd, - foffs1, &mrec1, offsets1); + b0 = row_merge_read_rec( + &block[0], &buf[0], b0, dup->index, + file->fd, foffs0, &mrec0, offsets0); + b1 = row_merge_read_rec( + &block[srv_sort_buf_size], + &buf[srv_sort_buf_size], b1, dup->index, + file->fd, foffs1, &mrec1, offsets1); if (UNIV_UNLIKELY(!b0 && mrec0) || UNIV_UNLIKELY(!b1 && mrec1)) { @@ -1707,56 +1831,49 @@ corrupt: } while (mrec0 && mrec1) { - ibool null_eq = FALSE; - switch (row_merge_cmp(mrec0, mrec1, - offsets0, offsets1, index, - &null_eq)) { + switch (cmp_rec_rec_simple( + mrec0, mrec1, offsets0, offsets1, + dup->index, dup->table)) { case 0: - if (UNIV_UNLIKELY - (dict_index_is_unique(index) && !null_eq)) { - innobase_rec_to_mysql(table, mrec0, - index, offsets0); - mem_heap_free(heap); - return(DB_DUPLICATE_KEY); - } - /* fall through */ + mem_heap_free(heap); + return(DB_DUPLICATE_KEY); case -1: - ROW_MERGE_WRITE_GET_NEXT(0, goto merged); + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged); break; case 1: - ROW_MERGE_WRITE_GET_NEXT(1, goto merged); + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged); break; default: ut_error; } - } merged: if (mrec0) { /* append all mrec0 to output */ for (;;) { - ROW_MERGE_WRITE_GET_NEXT(0, goto done0); + ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0); } } done0: if (mrec1) { /* append all mrec1 to output */ for (;;) { - ROW_MERGE_WRITE_GET_NEXT(1, goto done1); + ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1); } } done1: mem_heap_free(heap); - b2 = 
row_merge_write_eof(&block[2 * srv_sort_buf_size], b2, of->fd, &of->offset); + b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset); return(b2 ? DB_SUCCESS : DB_CORRUPTION); } /*************************************************************//** Copy a block of index entries. @return TRUE on success, FALSE on failure */ -static __attribute__((nonnull)) +static __attribute__((nonnull, warn_unused_result)) ibool row_merge_blocks_copy( /*==================*/ @@ -1771,7 +1888,7 @@ row_merge_blocks_copy( mrec_buf_t* buf; /*!< buffer for handling split mrec in block[] */ const byte* b0; /*!< pointer to block[0] */ - byte* b2; /*!< pointer to block[2] */ + byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */ const mrec_t* mrec0; /*!< merge rec, points to block[0] */ ulint* offsets0;/* offsets of mrec0 */ ulint* offsets1;/* dummy offsets */ @@ -1801,8 +1918,8 @@ corrupt: b2 = &block[2 * srv_sort_buf_size]; - b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd, - foffs0, &mrec0, offsets0); + b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, + file->fd, foffs0, &mrec0, offsets0); if (UNIV_UNLIKELY(!b0 && mrec0)) { goto corrupt; @@ -1811,7 +1928,7 @@ corrupt: if (mrec0) { /* append all mrec0 to output */ for (;;) { - ROW_MERGE_WRITE_GET_NEXT(0, goto done0); + ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0); } } done0: @@ -1821,7 +1938,8 @@ done0: (*foffs0)++; mem_heap_free(heap); - return(row_merge_write_eof(&block[2 * srv_sort_buf_size], b2, of->fd, &of->offset) + return(row_merge_write_eof(&block[2 * srv_sort_buf_size], + b2, of->fd, &of->offset) != NULL); } @@ -1829,18 +1947,16 @@ done0: Merge disk files. @return DB_SUCCESS or error code */ static __attribute__((nonnull)) -ulint +dberr_t row_merge( /*======*/ trx_t* trx, /*!< in: transaction */ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ int* tmpfd, /*!< in/out: temporary file handle */ - struct TABLE* table, /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ ulint* num_run,/*!< in/out: Number of runs remain to be merged */ ulint* run_offset) /*!< in/out: Array contains the @@ -1849,7 +1965,7 @@ row_merge( { ulint foffs0; /*!< first input offset */ ulint foffs1; /*!< second input offset */ - ulint error; /*!< error code */ + dberr_t error; /*!< error code */ merge_file_t of; /*!< output file */ const ulint ihalf = run_offset[*num_run / 2]; /*!< half the input file */ @@ -1880,15 +1996,15 @@ row_merge( for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) { - if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + if (trx_is_interrupted(trx)) { return(DB_INTERRUPTED); } /* Remember the offset number for this run */ run_offset[n_run++] = of.offset; - error = row_merge_blocks(index, file, block, - &foffs0, &foffs1, &of, table); + error = row_merge_blocks(dup, file, block, + &foffs0, &foffs1, &of); if (error != DB_SUCCESS) { return(error); @@ -1906,7 +2022,8 @@ row_merge( /* Remember the offset number for this run */ run_offset[n_run++] = of.offset; - if (!row_merge_blocks_copy(index, file, block, &foffs0, &of)) { + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs0, &of)) { return(DB_CORRUPTION); } } @@ -1914,14 +2031,15 @@ row_merge( ut_ad(foffs0 == ihalf); while (foffs1 < file->offset) { - if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { + if 
(trx_is_interrupted(trx)) { return(DB_INTERRUPTED); } /* Remember the offset number for this run */ run_offset[n_run++] = of.offset; - if (!row_merge_blocks_copy(index, file, block, &foffs1, &of)) { + if (!row_merge_blocks_copy(dup->index, file, block, + &foffs1, &of)) { return(DB_CORRUPTION); } } @@ -1959,23 +2077,21 @@ row_merge( Merge disk files. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_sort( /*===========*/ trx_t* trx, /*!< in: transaction */ - const dict_index_t* index, /*!< in: index being created */ + const row_merge_dup_t* dup, /*!< in: descriptor of + index being created */ merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd, /*!< in/out: temporary file handle */ - struct TABLE* table) /*!< in/out: MySQL table, for - reporting erroneous key value - if applicable */ + int* tmpfd) /*!< in/out: temporary file handle */ { - ulint half = file->offset / 2; - ulint num_runs; - ulint* run_offset; - ulint error = DB_SUCCESS; + const ulint half = file->offset / 2; + ulint num_runs; + ulint* run_offset; + dberr_t error = DB_SUCCESS; /* Record the number of merge runs we need to perform */ num_runs = file->offset; @@ -1998,14 +2114,14 @@ row_merge_sort( /* Merge the runs until we have one big run */ do { - error = row_merge(trx, index, file, block, tmpfd, - table, &num_runs, run_offset); - - UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset); + error = row_merge(trx, dup, file, block, tmpfd, + &num_runs, run_offset); if (error != DB_SUCCESS) { break; } + + UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset); } while (num_runs > 1); mem_free(run_offset); @@ -2014,8 +2130,25 @@ row_merge_sort( } /*************************************************************//** +Set blob fields empty */ +static __attribute__((nonnull)) +void +row_merge_set_blob_empty( +/*=====================*/ + dtuple_t* tuple) /*!< in/out: data tuple */ +{ + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { + dfield_t* field = dtuple_get_nth_field(tuple, i); + + if (dfield_is_ext(field)) { + dfield_set_data(field, NULL, 0); + } + } +} + +/*************************************************************//** Copy externally stored columns to the data tuple. */ -static +static __attribute__((nonnull)) void row_merge_copy_blobs( /*=================*/ @@ -2025,10 +2158,9 @@ row_merge_copy_blobs( dtuple_t* tuple, /*!< in/out: data tuple */ mem_heap_t* heap) /*!< in/out: memory heap */ { - ulint i; - ulint n_fields = dtuple_get_n_fields(tuple); + ut_ad(rec_offs_any_extern(offsets)); - for (i = 0; i < n_fields; i++) { + for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) { ulint len; const void* data; dfield_t* field = dtuple_get_nth_field(tuple, i); @@ -2039,11 +2171,12 @@ row_merge_copy_blobs( ut_ad(!dfield_is_null(field)); - /* The table is locked during index creation. - Therefore, externally stored columns cannot possibly - be freed between the time the BLOB pointers are read - (row_merge_read_clustered_index()) and dereferenced - (below). */ + /* During the creation of a PRIMARY KEY, the table is + X-locked, and we skip copying records that have been + marked for deletion. Therefore, externally stored + columns cannot possibly be freed between the time the + BLOB pointers are read (row_merge_read_clustered_index()) + and dereferenced (below). 
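Stepping back to the merge phase above: ROW_MERGE_WRITE_GET_NEXT() writes the smaller of the two current records to the output and advances that input, row_merge_blocks() repeats this for one pair of runs (an equal compare on a unique index is a duplicate-key error), row_merge_blocks_copy() carries over an unpaired run, and row_merge_sort() keeps halving the number of runs until one remains. A compact in-memory sketch of that external-merge shape; the vectors stand in for the on-disk blocks, and none of these names are InnoDB APIs.

#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <vector>

/* Merge two sorted runs into one; equal keys would be a duplicate-key
error for a unique index (cf. cmp_rec_rec_simple() returning 0). */
static std::vector<int> merge_two(const std::vector<int>& a,
                                  const std::vector<int>& b,
                                  bool unique)
{
        std::vector<int> out;
        std::size_t i = 0, j = 0;
        while (i < a.size() && j < b.size()) {
                if (unique && a[i] == b[j]) {
                        throw std::runtime_error("duplicate key");
                }
                out.push_back(a[i] < b[j] ? a[i++] : b[j++]);
        }
        while (i < a.size()) out.push_back(a[i++]);     /* append remainder */
        while (j < b.size()) out.push_back(b[j++]);
        return out;
}

int main()
{
        /* Sorted runs, as produced by the buffer-flush phase. */
        std::vector<std::vector<int>> runs = {{1, 4, 9}, {2, 7}, {3, 5, 8}, {6}};

        /* Merge run pairs until one run remains (cf. row_merge_sort()'s
        "while (num_runs > 1)" loop). */
        while (runs.size() > 1) {
                std::vector<std::vector<int>> next;
                for (std::size_t i = 0; i + 1 < runs.size(); i += 2) {
                        next.push_back(merge_two(runs[i], runs[i + 1], true));
                }
                if (runs.size() % 2) {
                        next.push_back(runs.back());    /* odd run copied over */
                }
                runs.swap(next);
        }

        for (int k : runs[0]) std::printf("%d ", k);    /* 1 2 3 ... 9 */
        std::printf("\n");
        return 0;
}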
*/ data = btr_rec_copy_externally_stored_field( mrec, offsets, zip_size, i, &len, heap); /* Because we have locked the table, any records @@ -2060,54 +2193,38 @@ row_merge_copy_blobs( Read sorted file containing index data tuples and insert these data tuples to the index @return DB_SUCCESS or error number */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_merge_insert_index_tuples( /*==========================*/ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction identifier */ dict_index_t* index, /*!< in: index */ - dict_table_t* table, /*!< in: new table */ - ulint zip_size,/*!< in: compressed page size of - the old table, or 0 if uncompressed */ + const dict_table_t* old_table,/*!< in: old table */ int fd, /*!< in: file descriptor */ row_merge_block_t* block) /*!< in/out: file buffer */ { const byte* b; - que_thr_t* thr; - ins_node_t* node; + mem_heap_t* heap; mem_heap_t* tuple_heap; - mem_heap_t* graph_heap; - ulint error = DB_SUCCESS; + mem_heap_t* ins_heap; + dberr_t error = DB_SUCCESS; ulint foffs = 0; ulint* offsets; + mrec_buf_t* buf; - ut_ad(trx); - ut_ad(index); - ut_ad(table); - + ut_ad(!srv_read_only_mode); ut_ad(!(index->type & DICT_FTS)); - - /* We use the insert query graph as the dummy graph - needed in the row module call */ - - trx->op_info = "inserting index entries"; - - graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t)); - node = ins_node_create(INS_DIRECT, table, graph_heap); - - thr = pars_complete_graph_for_exec(node, trx, graph_heap); - - que_thr_move_to_run_state_for_mysql(thr, trx); + ut_ad(trx_id); tuple_heap = mem_heap_create(1000); { ulint i = 1 + REC_OFFS_HEADER_SIZE + dict_index_get_n_fields(index); - + heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); + ins_heap = mem_heap_create(sizeof *buf + i * sizeof *offsets); offsets = static_cast<ulint*>( - mem_heap_alloc(graph_heap, i * sizeof *offsets)); - + mem_heap_alloc(heap, i * sizeof *offsets)); offsets[0] = i; offsets[1] = dict_index_get_n_fields(index); } @@ -2117,15 +2234,17 @@ row_merge_insert_index_tuples( if (!row_merge_read(fd, foffs, block)) { error = DB_CORRUPTION; } else { - mrec_buf_t* buf; - buf = static_cast<mrec_buf_t*>( - mem_heap_alloc(graph_heap, sizeof *buf)); + mem_heap_alloc(heap, sizeof *buf)); for (;;) { const mrec_t* mrec; dtuple_t* dtuple; ulint n_ext; + big_rec_t* big_rec; + rec_t* rec; + btr_cur_t cursor; + mtr_t mtr; b = row_merge_read_rec(block, buf, b, index, fd, &foffs, &mrec, offsets); @@ -2137,55 +2256,164 @@ row_merge_insert_index_tuples( break; } + dict_index_t* old_index + = dict_table_get_first_index(old_table); + + if (dict_index_is_clust(index) + && dict_index_is_online_ddl(old_index)) { + error = row_log_table_get_error(old_index); + if (error != DB_SUCCESS) { + break; + } + } + dtuple = row_rec_to_index_entry_low( mrec, index, offsets, &n_ext, tuple_heap); - if (UNIV_UNLIKELY(n_ext)) { - row_merge_copy_blobs(mrec, offsets, zip_size, - dtuple, tuple_heap); - } + if (!n_ext) { + /* There are no externally stored columns. */ + } else if (!dict_index_is_online_ddl(old_index)) { + ut_ad(dict_index_is_clust(index)); + /* Modifications to the table are + blocked while we are not rebuilding it + or creating indexes. Off-page columns + can be fetched safely. 
*/ + row_merge_copy_blobs( + mrec, offsets, + dict_table_zip_size(old_table), + dtuple, tuple_heap); + } else { + ut_ad(dict_index_is_clust(index)); - node->row = dtuple; - node->table = table; - node->trx_id = trx->id; + ulint offset = index->trx_id_offset; - ut_ad(dtuple_validate(dtuple)); + if (!offset) { + offset = row_get_trx_id_offset( + index, offsets); + } - do { - thr->run_node = thr; - thr->prev_node = thr->common.parent; + /* Copy the off-page columns while + holding old_index->lock, so + that they cannot be freed by + a rollback of a fresh insert. */ + rw_lock_s_lock(&old_index->lock); + + if (row_log_table_is_rollback( + old_index, + trx_read_trx_id(mrec + offset))) { + /* The row and BLOB could + already be freed. They + will be deleted by + row_undo_ins_remove_clust_rec + when rolling back a fresh + insert. So, no need to retrieve + the off-page column. */ + row_merge_set_blob_empty( + dtuple); + } else { + row_merge_copy_blobs( + mrec, offsets, + dict_table_zip_size(old_table), + dtuple, tuple_heap); + } - error = row_ins_index_entry(index, dtuple, - 0, FALSE, thr); + rw_lock_s_unlock(&old_index->lock); + } - if (UNIV_LIKELY(error == DB_SUCCESS)) { + ut_ad(dtuple_validate(dtuple)); + log_free_check(); - goto next_rec; - } + mtr_start(&mtr); + /* Insert after the last user record. */ + btr_cur_open_at_index_side( + false, index, BTR_MODIFY_LEAF, + &cursor, 0, &mtr); + page_cur_position( + page_rec_get_prev(btr_cur_get_rec(&cursor)), + btr_cur_get_block(&cursor), + btr_cur_get_page_cur(&cursor)); + cursor.flag = BTR_CUR_BINARY; +#ifdef UNIV_DEBUG + /* Check that the records are inserted in order. */ + rec = btr_cur_get_rec(&cursor); + + if (!page_rec_is_infimum(rec)) { + ulint* rec_offsets = rec_get_offsets( + rec, index, offsets, + ULINT_UNDEFINED, &tuple_heap); + ut_ad(cmp_dtuple_rec(dtuple, rec, rec_offsets) + > 0); + } +#endif /* UNIV_DEBUG */ + ulint* ins_offsets = NULL; + + error = btr_cur_optimistic_insert( + BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG, + &cursor, &ins_offsets, &ins_heap, + dtuple, &rec, &big_rec, 0, NULL, &mtr); + + if (error == DB_FAIL) { + ut_ad(!big_rec); + mtr_commit(&mtr); + mtr_start(&mtr); + btr_cur_open_at_index_side( + false, index, BTR_MODIFY_TREE, + &cursor, 0, &mtr); + page_cur_position( + page_rec_get_prev(btr_cur_get_rec( + &cursor)), + btr_cur_get_block(&cursor), + btr_cur_get_page_cur(&cursor)); + + error = btr_cur_pessimistic_insert( + BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG, + &cursor, &ins_offsets, &ins_heap, + dtuple, &rec, &big_rec, 0, NULL, &mtr); + } - thr->lock_state = QUE_THR_LOCK_ROW; + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(&cursor), + btr_cur_get_page_zip(&cursor), + trx_id, &mtr); + } - trx->error_state = static_cast<enum db_err>( - error); + mtr_commit(&mtr); - que_thr_stop_for_mysql(thr); - thr->lock_state = QUE_THR_LOCK_NOLOCK; - } while (row_mysql_handle_errors(&error, trx, - thr, NULL)); + if (UNIV_LIKELY_NULL(big_rec)) { + /* If the system crashes at this + point, the clustered index record will + contain a null BLOB pointer. This + should not matter, because the copied + table will be dropped on crash + recovery anyway. 
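Because the merge output is already in index order, row_merge_insert_index_tuples() above can open the cursor at the rightmost leaf, position it after the last user record, and append each tuple there, trying a leaf-only optimistic insert first and retrying pessimistically (BTR_MODIFY_TREE) only when the page is full. A loose standard-library analogue of why appending pre-sorted keys is cheap: giving std::map an end() hint makes each insertion amortized constant time. The container is only a stand-in for the B-tree, not the actual mechanism.

#include <cstdio>
#include <map>

int main()
{
        std::map<int, int> index;       /* stand-in for the index B-tree */

        /* Keys arrive pre-sorted from the merge phase, so each insert can
        be hinted to land right before end(), i.e. after the last record. */
        int sorted_keys[] = {10, 20, 30, 40, 50};
        for (int k : sorted_keys) {
                index.emplace_hint(index.end(), k, /* payload */ k * 100);
        }

        std::printf("inserted %zu entries, last key %d\n",
                    index.size(), index.rbegin()->first);
        return 0;
}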
*/ + + ut_ad(dict_index_is_clust(index)); + ut_ad(error == DB_SUCCESS); + error = row_ins_index_entry_big_rec( + dtuple, big_rec, + ins_offsets, &ins_heap, + index, NULL, __FILE__, __LINE__); + dtuple_convert_back_big_rec( + index, dtuple, big_rec); + } + + if (error != DB_SUCCESS) { + goto err_exit; + } - goto err_exit; -next_rec: mem_heap_empty(tuple_heap); + mem_heap_empty(ins_heap); } } - que_thr_stop_for_mysql_no_error(thr, trx); err_exit: - que_graph_free(thr->graph); - - trx->op_info = ""; - mem_heap_free(tuple_heap); + mem_heap_free(ins_heap); + mem_heap_free(heap); return(error); } @@ -2194,7 +2422,7 @@ err_exit: Sets an exclusive lock on a table, for the duration of creating indexes. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_merge_lock_table( /*=================*/ trx_t* trx, /*!< in/out: transaction */ @@ -2203,10 +2431,10 @@ row_merge_lock_table( { mem_heap_t* heap; que_thr_t* thr; - ulint err; + dberr_t err; sel_node_t* node; - ut_ad(trx); + ut_ad(!srv_read_only_mode); ut_ad(mode == LOCK_X || mode == LOCK_S); heap = mem_heap_create(512); @@ -2232,7 +2460,7 @@ run_again: err = lock_table(0, table, mode, thr); - trx->error_state =static_cast<enum db_err>( err); + trx->error_state = err; if (UNIV_LIKELY(err == DB_SUCCESS)) { que_thr_stop_for_mysql_no_error(thr, trx); @@ -2240,7 +2468,7 @@ run_again: que_thr_stop_for_mysql(thr); if (err != DB_QUE_THR_SUSPENDED) { - ibool was_lock_wait; + bool was_lock_wait; was_lock_wait = row_mysql_handle_errors( &err, trx, thr, NULL); @@ -2274,105 +2502,312 @@ run_again: } /*********************************************************************//** -Drop an index from the InnoDB system tables. The data dictionary must -have been locked exclusively by the caller, because the transaction -will not be committed. */ -UNIV_INTERN +Drop an index that was created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +static void -row_merge_drop_index( -/*=================*/ - dict_index_t* index, /*!< in: index to be removed */ - dict_table_t* table, /*!< in: table */ - trx_t* trx) /*!< in: transaction handle */ +row_merge_drop_index_dict( +/*======================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + index_id_t index_id)/*!< in: index identifier */ { - db_err err; - pars_info_t* info = pars_info_create(); - - /* We use the private SQL parser of Innobase to generate the - query graphs needed in deleting the dictionary data from system - tables in Innobase. Deleting a row from SYS_INDEXES table also - frees the file segments of the B-tree associated with the index. */ - static const char sql[] = "PROCEDURE DROP_INDEX_PROC () IS\n" "BEGIN\n" - /* Rename the index, so that it will be dropped by - row_merge_drop_temp_indexes() at crash recovery - if the server crashes before this trx is committed. */ - "UPDATE SYS_INDEXES SET NAME=CONCAT('" - TEMP_INDEX_PREFIX_STR "', NAME) WHERE ID = :indexid;\n" - "COMMIT WORK;\n" - /* Drop the field definitions of the index. */ - "DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n" - /* Drop the index definition and the B-tree. 
*/ - "DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" "END;\n"; + dberr_t error; + pars_info_t* info; - ut_ad(index && table && trx); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ - pars_info_add_ull_literal(info, "indexid", index->id); + info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index_id); + trx->op_info = "dropping index from dictionary"; + error = que_eval_sql(info, sql, FALSE, trx); - trx_start_if_not_started_xa(trx); - trx->op_info = "dropping index"; + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; - ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_index_dict " + "failed with error code: %u.\n", (unsigned) error); + } - err = static_cast<db_err>(que_eval_sql(info, sql, FALSE, trx)); + trx->op_info = ""; +} - DBUG_EXECUTE_IF( - "ib_drop_index_too_many_concurrent_trxs", - err = DB_TOO_MANY_CONCURRENT_TRXS; - trx->error_state = err;); +/*********************************************************************//** +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ +UNIV_INTERN +void +row_merge_drop_indexes_dict( +/*========================*/ + trx_t* trx, /*!< in/out: dictionary transaction */ + table_id_t table_id)/*!< in: table identifier */ +{ + static const char sql[] = + "PROCEDURE DROP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" - if (err == DB_SUCCESS) { + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE TABLE_ID=:tableid AND\n" + " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" - /* If it is FTS index, drop from table->fts and also drop - its auxiliary tables */ - if (index->type & DICT_FTS) { - ut_a(table->fts); - fts_drop_index(table, index, trx); - } + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + + "END;\n"; + dberr_t error; + pars_info_t* info; - /* Replace this index with another equivalent index for all - foreign key constraints on this table where this index is - used */ + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ - dict_table_replace_index_in_foreign_list(table, index, trx); - dict_index_remove_from_cache(table, index); + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. 
- } else { + A concurrent purge will be prevented by dict_operation_lock. */ + + info = pars_info_create(); + pars_info_add_ull_literal(info, "tableid", table_id); + trx->op_info = "dropping indexes"; + error = que_eval_sql(info, sql, FALSE, trx); + + if (error != DB_SUCCESS) { /* Even though we ensure that DDL transactions are WAIT and DEADLOCK free, we could encounter other errors e.g., - DB_TOO_MANY_TRANSACTIONS. */ + DB_TOO_MANY_CONCURRENT_TRXS. */ trx->error_state = DB_SUCCESS; ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: row_merge_drop_index failed " - "with error code: %lu.\n", (ulint) err); + fprintf(stderr, " InnoDB: Error: row_merge_drop_indexes_dict " + "failed with error code: %u.\n", (unsigned) error); } trx->op_info = ""; } /*********************************************************************//** -Drop those indexes which were created before an error occurred when -building an index. The data dictionary must have been locked -exclusively by the caller, because the transaction will not be -committed. */ +Drop indexes that were created before an error occurred. +The data dictionary must have been locked exclusively by the caller, +because the transaction will not be committed. */ UNIV_INTERN void row_merge_drop_indexes( /*===================*/ - trx_t* trx, /*!< in: transaction */ - dict_table_t* table, /*!< in: table containing the indexes */ - dict_index_t** index, /*!< in: indexes to drop */ - ulint num_created) /*!< in: number of elements in index[] */ + trx_t* trx, /*!< in/out: dictionary transaction */ + dict_table_t* table, /*!< in/out: table containing the indexes */ + ibool locked) /*!< in: TRUE=table locked, + FALSE=may need to do a lazy drop */ { - ulint key_num; + dict_index_t* index; + dict_index_t* next_index; + + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + + index = dict_table_get_first_index(table); + ut_ad(dict_index_is_clust(index)); + ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE); + + /* the caller should have an open handle to the table */ + ut_ad(table->n_ref_count >= 1); + + /* It is possible that table->n_ref_count > 1 when + locked=TRUE. In this case, all code that should have an open + handle to the table be waiting for the next statement to execute, + or waiting for a meta-data lock. + + A concurrent purge will be prevented by dict_operation_lock. */ + + if (!locked && table->n_ref_count > 1) { + /* We will have to drop the indexes later, when the + table is guaranteed to be no longer in use. Mark the + indexes as incomplete and corrupted, so that other + threads will stop using them. Let dict_table_close() + or crash recovery or the next invocation of + prepare_inplace_alter_table() take care of dropping + the indexes. */ + + while ((index = dict_table_get_next_index(index)) != NULL) { + ut_ad(!dict_index_is_clust(index)); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_ABORTED_DROPPED: + continue; + case ONLINE_INDEX_COMPLETE: + if (*index->name != TEMP_INDEX_PREFIX) { + /* Do nothing to already + published indexes. */ + } else if (index->type & DICT_FTS) { + /* Drop a completed FULLTEXT + index, due to a timeout during + MDL upgrade for + commit_inplace_alter_table(). 
+ Because only concurrent reads + are allowed (and they are not + seeing this index yet) we + are safe to drop the index. */ + dict_index_t* prev = UT_LIST_GET_PREV( + indexes, index); + /* At least there should be + the clustered index before + this one. */ + ut_ad(prev); + ut_a(table->fts); + fts_drop_index(table, index, trx); + /* Since + INNOBASE_SHARE::idx_trans_tbl + is shared between all open + ha_innobase handles to this + table, no thread should be + accessing this dict_index_t + object. Also, we should be + holding LOCK=SHARED MDL on the + table even after the MDL + upgrade timeout. */ + + /* We can remove a DICT_FTS + index from the cache, because + we do not allow ADD FULLTEXT INDEX + with LOCK=NONE. If we allowed that, + we should exclude FTS entries from + prebuilt->ins_node->entry_list + in ins_node_create_entry_list(). */ + dict_index_remove_from_cache( + table, index); + index = prev; + } else { + rw_lock_x_lock( + dict_index_get_lock(index)); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED); + index->type |= DICT_CORRUPT; + table->drop_aborted = TRUE; + goto drop_aborted; + } + continue; + case ONLINE_INDEX_CREATION: + rw_lock_x_lock(dict_index_get_lock(index)); + ut_ad(*index->name == TEMP_INDEX_PREFIX); + row_log_abort_sec(index); + drop_aborted: + rw_lock_x_unlock(dict_index_get_lock(index)); + + DEBUG_SYNC_C("merge_drop_index_after_abort"); + /* covered by dict_sys->mutex */ + MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX); + /* fall through */ + case ONLINE_INDEX_ABORTED: + /* Drop the index tree from the + data dictionary and free it from + the tablespace, but keep the object + in the data dictionary cache. */ + row_merge_drop_index_dict(trx, index->id); + rw_lock_x_lock(dict_index_get_lock(index)); + dict_index_set_online_status( + index, ONLINE_INDEX_ABORTED_DROPPED); + rw_lock_x_unlock(dict_index_get_lock(index)); + table->drop_aborted = TRUE; + continue; + } + ut_error; + } - for (key_num = 0; key_num < num_created; key_num++) { - row_merge_drop_index(index[key_num], table, trx); + return; } + + row_merge_drop_indexes_dict(trx, table->id); + + /* Invalidate all row_prebuilt_t::ins_graph that are referring + to this table. That is, force row_get_prebuilt_insert_row() to + rebuild prebuilt->ins_node->entry_list). */ + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + + next_index = dict_table_get_next_index(index); + + while ((index = next_index) != NULL) { + /* read the next pointer before freeing the index */ + next_index = dict_table_get_next_index(index); + + ut_ad(!dict_index_is_clust(index)); + + if (*index->name == TEMP_INDEX_PREFIX) { + /* If it is FTS index, drop from table->fts + and also drop its auxiliary tables */ + if (index->type & DICT_FTS) { + ut_a(table->fts); + fts_drop_index(table, index, trx); + } + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + /* This state should only be possible + when prepare_inplace_alter_table() fails + after invoking row_merge_create_index(). + In inplace_alter_table(), + row_merge_build_indexes() + should never leave the index in this state. + It would invoke row_log_abort_sec() on + failure. */ + case ONLINE_INDEX_COMPLETE: + /* In these cases, we are able to drop + the index straight. The DROP INDEX was + never deferred. 
*/ + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + /* covered by dict_sys->mutex */ + MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX); + } + + dict_index_remove_from_cache(table, index); + } + } + + table->drop_aborted = FALSE; + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); } /*********************************************************************//** @@ -2382,9 +2817,32 @@ void row_merge_drop_temp_indexes(void) /*=============================*/ { - trx_t* trx; - btr_pcur_t pcur; - mtr_t mtr; + static const char sql[] = + "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n" + "ixid CHAR;\n" + "found INT;\n" + + "DECLARE CURSOR index_cur IS\n" + " SELECT ID FROM SYS_INDEXES\n" + " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n" + "FOR UPDATE;\n" + + "BEGIN\n" + "found := 1;\n" + "OPEN index_cur;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_cur INTO ixid;\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " ELSE\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_cur;\n" + "END;\n"; + trx_t* trx; + dberr_t error; /* Load the table definitions that contain partially defined indexes, so that the data dictionary information can be checked @@ -2392,75 +2850,26 @@ row_merge_drop_temp_indexes(void) trx = trx_allocate_for_background(); trx->op_info = "dropping partially created indexes"; row_mysql_lock_data_dictionary(trx); + /* Ensure that this transaction will be rolled back and locks + will be released, if the server gets killed before the commit + gets written to the redo log. */ + trx_set_dict_operation(trx, TRX_DICT_OP_INDEX); - mtr_start(&mtr); - - btr_pcur_open_at_index_side( - TRUE, - dict_table_get_first_index(dict_sys->sys_indexes), - BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); - - for (;;) { - const rec_t* rec; - const byte* field; - ulint len; - table_id_t table_id; - dict_table_t* table; - - btr_pcur_move_to_next_user_rec(&pcur, &mtr); - - if (!btr_pcur_is_on_user_rec(&pcur)) { - break; - } - - rec = btr_pcur_get_rec(&pcur); - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__NAME, &len); - if (len == UNIV_SQL_NULL || len == 0 - || (char) *field != TEMP_INDEX_PREFIX) { - continue; - } - - /* This is a temporary index. */ + trx->op_info = "dropping indexes"; + error = que_eval_sql(NULL, sql, FALSE, trx); - field = rec_get_nth_field_old( - rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); - if (len != 8) { - /* Corrupted TABLE_ID */ - continue; - } - - table_id = mach_read_from_8(field); - - btr_pcur_store_position(&pcur, &mtr); - btr_pcur_commit_specify_mtr(&pcur, &mtr); - - table = dict_table_open_on_id(table_id, TRUE); - - if (table) { - dict_index_t* index; - dict_index_t* next_index; - - for (index = dict_table_get_first_index(table); - index; index = next_index) { - - next_index = dict_table_get_next_index(index); - - if (*index->name == TEMP_INDEX_PREFIX) { - row_merge_drop_index(index, table, trx); - trx_commit_for_mysql(trx); - } - } - - dict_table_close(table, TRUE); - } + if (error != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. 
*/ + trx->error_state = DB_SUCCESS; - mtr_start(&mtr); - btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: Error: row_merge_drop_temp_indexes " + "failed with error code: %u.\n", (unsigned) error); } - btr_pcur_close(&pcur); - mtr_commit(&mtr); + trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); trx_free_for_background(trx); } @@ -2469,7 +2878,7 @@ row_merge_drop_temp_indexes(void) Creates temporary merge files, and if UNIV_PFS_IO defined, register the file descriptor with Performance Schema. @return file descriptor, or -1 on failure */ -UNIV_INLINE +UNIV_INTERN int row_merge_file_create_low(void) /*===========================*/ @@ -2488,12 +2897,13 @@ row_merge_file_create_low(void) #endif fd = innobase_mysql_tmpfile(); #ifdef UNIV_PFS_IO - register_pfs_file_open_end(locker, fd); + register_pfs_file_open_end(locker, fd); #endif + if (fd < 0) { - fprintf(stderr, - "InnoDB: Error: Cannot create temporary merge file\n"); - return(-1); + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create temporary merge file"); + return -1; } return(fd); } @@ -2508,18 +2918,22 @@ row_merge_file_create( merge_file_t* merge_file) /*!< out: merge file structure */ { merge_file->fd = row_merge_file_create_low(); - if (srv_disable_sort_file_cache) { - os_file_set_nocache(merge_file->fd, "row0merge.c", "sort"); - } merge_file->offset = 0; merge_file->n_rec = 0; + + if (merge_file->fd >= 0) { + if (srv_disable_sort_file_cache) { + os_file_set_nocache(merge_file->fd, + "row0merge.cc", "sort"); + } + } return(merge_file->fd); } /*********************************************************************//** Destroy a merge file. And de-register the file from Performance Schema if UNIV_PFS_IO is defined. */ -UNIV_INLINE +UNIV_INTERN void row_merge_file_destroy_low( /*=======================*/ @@ -2532,7 +2946,9 @@ row_merge_file_destroy_low( fd, 0, PSI_FILE_CLOSE, __FILE__, __LINE__); #endif - close(fd); + if (fd >= 0) { + close(fd); + } #ifdef UNIV_PFS_IO register_pfs_file_io_end(locker, 0); #endif @@ -2543,8 +2959,10 @@ UNIV_INTERN void row_merge_file_destroy( /*===================*/ - merge_file_t* merge_file) /*!< out: merge file structure */ + merge_file_t* merge_file) /*!< in/out: merge file structure */ { + ut_ad(!srv_read_only_mode); + if (merge_file->fd != -1) { row_merge_file_destroy_low(merge_file->fd); merge_file->fd = -1; @@ -2552,173 +2970,109 @@ row_merge_file_destroy( } /*********************************************************************//** -Determine the precise type of a column that is added to a tem -if a column must be constrained NOT NULL. -@return col->prtype, possibly ORed with DATA_NOT_NULL */ -UNIV_INLINE -ulint -row_merge_col_prtype( -/*=================*/ - const dict_col_t* col, /*!< in: column */ - const char* col_name, /*!< in: name of the column */ - const merge_index_def_t*index_def) /*!< in: the index definition - of the primary key */ -{ - ulint prtype = col->prtype; - ulint i; - - ut_ad(index_def->ind_type & DICT_CLUSTERED); - - if (prtype & DATA_NOT_NULL) { - - return(prtype); - } - - /* All columns that are included - in the PRIMARY KEY must be NOT NULL. */ - - for (i = 0; i < index_def->n_fields; i++) { - if (!strcmp(col_name, index_def->fields[i].field_name)) { - return(prtype | DATA_NOT_NULL); - } - } - - return(prtype); -} - -/*********************************************************************//** -Create a temporary table for creating a primary key, using the definition -of an existing table. 
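Back in row_merge_file_create_low() above, the sort runs live in an anonymous temporary file obtained from innobase_mysql_tmpfile(), optionally opened without OS caching when srv_disable_sort_file_cache is set. The portable C analogue is tmpfile(), which hands back an already-unlinked file that disappears on close; a minimal sketch, not the server's implementation.

#include <cstdio>

int main()
{
        /* Anonymous temp file: already unlinked, so it cannot leak. */
        std::FILE* f = std::tmpfile();
        if (!f) {
                std::fprintf(stderr, "cannot create temporary merge file\n");
                return 1;
        }

        /* Write a "run", rewind and read it back, as the merge passes do
        when spilling sorted runs to disk. */
        const char run[] = "sorted run bytes";
        std::fwrite(run, 1, sizeof run, f);
        std::rewind(f);

        char buf[sizeof run];
        if (std::fread(buf, 1, sizeof buf, f) == sizeof buf) {
                std::printf("read back: %s\n", buf);
        }

        std::fclose(f); /* the file is removed automatically */
        return 0;
}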
-@return table, or NULL on error */ +Rename an index in the dictionary that was created. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. +@return DB_SUCCESS if all OK */ UNIV_INTERN -dict_table_t* -row_merge_create_temporary_table( -/*=============================*/ - const char* table_name, /*!< in: new table name */ - const merge_index_def_t*index_def, /*!< in: the index definition - of the primary key */ - const dict_table_t* table, /*!< in: old table definition */ - trx_t* trx) /*!< in/out: transaction - (sets error_state) */ +dberr_t +row_merge_rename_index_to_add( +/*==========================*/ + trx_t* trx, /*!< in/out: transaction */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ { - ulint i; - dict_table_t* new_table = NULL; - ulint n_cols = dict_table_get_n_user_cols(table); - ulint error; - mem_heap_t* heap = mem_heap_create(1000); - ulint num_col; - - ut_ad(table_name); - ut_ad(index_def); - ut_ad(table); - ut_ad(mutex_own(&dict_sys->mutex)); - - num_col = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID) - ? n_cols + 1 - : n_cols; - - new_table = dict_mem_table_create( - table_name, 0, num_col, table->flags, table->flags2); - - for (i = 0; i < n_cols; i++) { - const dict_col_t* col; - const char* col_name; + dberr_t err = DB_SUCCESS; + pars_info_t* info = pars_info_create(); - col = dict_table_get_nth_col(table, i); - col_name = dict_table_get_col_name(table, i); + /* We use the private SQL parser of Innobase to generate the + query graphs needed in renaming indexes. */ - dict_mem_table_add_col(new_table, heap, col_name, col->mtype, - row_merge_col_prtype(col, col_name, - index_def), - col->len); - } + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" + "END;\n"; - /* Add the FTS doc_id hidden column */ - if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { - fts_add_doc_id_column(new_table); - new_table->fts->doc_col = n_cols; - } + ut_ad(trx); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - error = row_create_table_for_mysql(new_table, trx); - mem_heap_free(heap); + trx->op_info = "renaming index to add"; - if (error != DB_SUCCESS) { - trx->error_state = static_cast<enum db_err>(error); - new_table = NULL; - } else { - dict_table_t* temp_table; + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); - /* We need to bump up the table ref count and before we can - use it we need to open the table. */ + err = que_eval_sql(info, rename_index, FALSE, trx); - temp_table = dict_table_open_on_name_no_stats( - new_table->name, TRUE, DICT_ERR_IGNORE_NONE); + if (err != DB_SUCCESS) { + /* Even though we ensure that DDL transactions are WAIT + and DEADLOCK free, we could encounter other errors e.g., + DB_TOO_MANY_CONCURRENT_TRXS. */ + trx->error_state = DB_SUCCESS; - ut_a(new_table == temp_table); + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: row_merge_rename_index_to_add " + "failed with error code: %u.\n", (unsigned) err); } - return(new_table); + trx->op_info = ""; + + return(err); } /*********************************************************************//** -Rename the temporary indexes in the dictionary to permanent ones. 
The -data dictionary must have been locked exclusively by the caller, -because the transaction will not be committed. +Rename an index in the dictionary that is to be dropped. The data +dictionary must have been locked exclusively by the caller, because +the transaction will not be committed. @return DB_SUCCESS if all OK */ UNIV_INTERN -ulint -row_merge_rename_indexes( -/*=====================*/ +dberr_t +row_merge_rename_index_to_drop( +/*===========================*/ trx_t* trx, /*!< in/out: transaction */ - dict_table_t* table) /*!< in/out: table with new indexes */ + table_id_t table_id, /*!< in: table identifier */ + index_id_t index_id) /*!< in: index identifier */ { - db_err err = DB_SUCCESS; + dberr_t err; pars_info_t* info = pars_info_create(); + ut_ad(!srv_read_only_mode); + /* We use the private SQL parser of Innobase to generate the query graphs needed in renaming indexes. */ - static const char* sql = - "PROCEDURE RENAME_INDEXES_PROC () IS\n" + static const char rename_index[] = + "PROCEDURE RENAME_INDEX_PROC () IS\n" "BEGIN\n" - "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n" - "WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='" - TEMP_INDEX_PREFIX_STR "';\n" + "UPDATE SYS_INDEXES SET NAME=CONCAT('" + TEMP_INDEX_PREFIX_STR "',NAME)\n" + "WHERE TABLE_ID = :tableid AND ID = :indexid;\n" "END;\n"; - ut_ad(table); ut_ad(trx); ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX); - trx->op_info = "renaming indexes"; + trx->op_info = "renaming index to drop"; - pars_info_add_ull_literal(info, "tableid", table->id); + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_ull_literal(info, "indexid", index_id); - err = static_cast<db_err>(que_eval_sql(info, sql, FALSE, trx)); + err = que_eval_sql(info, rename_index, FALSE, trx); - DBUG_EXECUTE_IF( - "ib_rename_indexes_too_many_concurrent_trxs", - err = DB_TOO_MANY_CONCURRENT_TRXS; - trx->error_state = static_cast<db_err>(err);); - - if (err == DB_SUCCESS) { - dict_index_t* index = dict_table_get_first_index(table); - do { - if (*index->name == TEMP_INDEX_PREFIX) { - index->name++; - } - index = dict_table_get_next_index(index); - } while (index); - } else { + if (err != DB_SUCCESS) { /* Even though we ensure that DDL transactions are WAIT and DEADLOCK free, we could encounter other errors e.g., - DB_TOO_MANY_TRANSACTIONS. */ - + DB_TOO_MANY_CONCURRENT_TRXS. */ trx->error_state = DB_SUCCESS; ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: row_merge_rename_indexes " - "failed with error code: %lu.\n", (ulint) err); + fprintf(stderr, + " InnoDB: Error: row_merge_rename_index_to_drop " + "failed with error code: %u.\n", (unsigned) err); } trx->op_info = ""; @@ -2727,12 +3081,39 @@ row_merge_rename_indexes( } /*********************************************************************//** +Provide a new pathname for a table that is being renamed if it belongs to +a file-per-table tablespace. The caller is responsible for freeing the +memory allocated for the return value. 
+@return new pathname of tablespace file, or NULL if space = 0 */ +UNIV_INTERN +char* +row_make_new_pathname( +/*==================*/ + dict_table_t* table, /*!< in: table to be renamed */ + const char* new_name) /*!< in: new name */ +{ + char* new_path; + char* old_path; + + ut_ad(table->space != TRX_SYS_SPACE); + + old_path = fil_space_get_first_path(table->space); + ut_a(old_path); + + new_path = os_file_make_new_pathname(old_path, new_name); + + mem_free(old_path); + + return(new_path); +} + +/*********************************************************************//** Rename the tables in the data dictionary. The data dictionary must have been locked exclusively by the caller, because the transaction will not be committed. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_merge_rename_tables( /*====================*/ dict_table_t* old_table, /*!< in/out: old table, renamed to @@ -2742,28 +3123,32 @@ row_merge_rename_tables( const char* tmp_name, /*!< in: new name for old_table */ trx_t* trx) /*!< in: transaction handle */ { - ulint err = DB_ERROR; + dberr_t err = DB_ERROR; pars_info_t* info; char old_name[MAX_FULL_NAME_LEN + 1]; + ut_ad(!srv_read_only_mode); ut_ad(old_table != new_table); ut_ad(mutex_own(&dict_sys->mutex)); - ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); + ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE); /* store the old/current name to an automatic variable */ if (strlen(old_table->name) + 1 <= sizeof(old_name)) { memcpy(old_name, old_table->name, strlen(old_table->name) + 1); } else { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: too long table name: '%s', " - "max length is %d\n", old_table->name, - MAX_FULL_NAME_LEN); + ib_logf(IB_LOG_LEVEL_ERROR, + "Too long table name: '%s', max length is %d", + old_table->name, MAX_FULL_NAME_LEN); ut_error; } trx->op_info = "renaming tables"; + DBUG_EXECUTE_IF( + "ib_rebuild_cannot_rename", + err = DB_ERROR; goto err_exit;); + /* We use the private SQL parser of Innobase to generate the query graphs needed in updating the dictionary data in system tables. */ @@ -2782,8 +3167,63 @@ row_merge_rename_tables( " WHERE NAME = :new_name;\n" "END;\n", FALSE, trx); - if (err != DB_SUCCESS) { + /* Update SYS_TABLESPACES and SYS_DATAFILES if the old + table is in a non-system tablespace where space > 0. */ + if (err == DB_SUCCESS + && old_table->space != TRX_SYS_SPACE + && !old_table->ibd_file_missing) { + /* Make pathname to update SYS_DATAFILES. */ + char* tmp_path = row_make_new_pathname(old_table, tmp_name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "tmp_name", tmp_name); + pars_info_add_str_literal(info, "tmp_path", tmp_path); + pars_info_add_int4_literal(info, "old_space", + (lint) old_table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_OLD_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :tmp_name\n" + " WHERE SPACE = :old_space;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :tmp_path\n" + " WHERE SPACE = :old_space;\n" + "END;\n", FALSE, trx); + + mem_free(tmp_path); + } + + /* Update SYS_TABLESPACES and SYS_DATAFILES if the new + table is in a non-system tablespace where space > 0. */ + if (err == DB_SUCCESS && new_table->space != TRX_SYS_SPACE) { + /* Make pathname to update SYS_DATAFILES. 
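row_make_new_pathname() above rebuilds the .ibd path for a file-per-table tablespace from the old datafile path and the new table name, via fil_space_get_first_path() and os_file_make_new_pathname(). A hedged std::string approximation of the common same-directory case; the real helper also deals with the database directory component and with ownership of the returned buffer, so treat this only as an illustration of the idea.

#include <cstdio>
#include <string>

/* Keep the directory of 'old_path' but name the file after 'new_name'
(given as "db/table"), producing ".../<table>.ibd". Illustrative only. */
static std::string make_new_pathname(const std::string& old_path,
                                     const std::string& new_name)
{
        std::string::size_type dir_end = old_path.find_last_of("/\\");
        std::string dir = (dir_end == std::string::npos)
                ? std::string()
                : old_path.substr(0, dir_end + 1);

        std::string::size_type name_start = new_name.find_last_of("/\\");
        std::string table = (name_start == std::string::npos)
                ? new_name
                : new_name.substr(name_start + 1);

        return dir + table + ".ibd";
}

int main()
{
        std::printf("%s\n",
                    make_new_pathname("/var/lib/mysql/test/t1.ibd",
                                      "test/t2").c_str());
        /* -> /var/lib/mysql/test/t2.ibd */
        return 0;
}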
*/ + char* old_path = row_make_new_pathname(new_table, old_name); + + info = pars_info_create(); + + pars_info_add_str_literal(info, "old_name", old_name); + pars_info_add_str_literal(info, "old_path", old_path); + pars_info_add_int4_literal(info, "new_space", + (lint) new_table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_NEW_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :old_name\n" + " WHERE SPACE = :new_space;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :old_path\n" + " WHERE SPACE = :new_space;\n" + "END;\n", FALSE, trx); + + mem_free(old_path); + } + if (err != DB_SUCCESS) { goto err_exit; } @@ -2812,13 +3252,39 @@ row_merge_rename_tables( /* The following calls will also rename the .ibd data files if the tables are stored in a single-table tablespace */ - if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE) - || !dict_table_rename_in_cache(new_table, old_name, FALSE)) { + err = dict_table_rename_in_cache(old_table, tmp_name, FALSE); - err = DB_ERROR; - goto err_exit; + if (err == DB_SUCCESS) { + + ut_ad(dict_table_is_discarded(old_table) + == dict_table_is_discarded(new_table)); + + err = dict_table_rename_in_cache(new_table, old_name, FALSE); + + if (err != DB_SUCCESS) { + + if (dict_table_rename_in_cache( + old_table, old_name, FALSE) + != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot undo the rename in cache " + "from %s to %s", old_name, tmp_name); + } + + goto err_exit; + } + + if (dict_table_is_discarded(new_table)) { + + err = row_import_update_discarded_flag( + trx, new_table->id, true, true); + } } + DBUG_EXECUTE_IF("ib_rebuild_cannot_load_fk", + err = DB_ERROR; goto err_exit;); + err = dict_load_foreigns(old_name, FALSE, TRUE); if (err != DB_SUCCESS) { @@ -2836,8 +3302,8 @@ err_exit: /*********************************************************************//** Create and execute a query graph for creating an index. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_merge_create_index_graph( /*=========================*/ trx_t* trx, /*!< in: trx */ @@ -2847,7 +3313,7 @@ row_merge_create_index_graph( ind_node_t* node; /*!< Index creation node */ mem_heap_t* heap; /*!< Memory heap */ que_thr_t* thr; /*!< Query thread */ - ulint err; + dberr_t err; ut_ad(trx); ut_ad(table); @@ -2856,7 +3322,7 @@ row_merge_create_index_graph( heap = mem_heap_create(512); index->table = table; - node = ind_create_graph_create(index, heap); + node = ind_create_graph_create(index, heap, false); thr = pars_complete_graph_for_exec(node, trx, heap); ut_a(thr == que_fork_start_command( @@ -2880,14 +3346,16 @@ row_merge_create_index( /*===================*/ trx_t* trx, /*!< in/out: trx (sets error_state) */ dict_table_t* table, /*!< in: the index is on this table */ - const merge_index_def_t*index_def) + const index_def_t* index_def) /*!< in: the index definition */ { dict_index_t* index; - ulint err; + dberr_t err; ulint n_fields = index_def->n_fields; ulint i; + ut_ad(!srv_read_only_mode); + /* Create the index prototype, using the passed in def, this is not a persistent operation. We pass 0 as the space id, and determine at a lower level the space id where to store the table. 
*/ @@ -2898,10 +3366,11 @@ row_merge_create_index( ut_a(index); for (i = 0; i < n_fields; i++) { - merge_index_field_t* ifield = &index_def->fields[i]; + index_field_t* ifield = &index_def->fields[i]; - dict_mem_index_add_field(index, ifield->field_name, - ifield->prefix_len); + dict_mem_index_add_field( + index, dict_table_get_col_name(table, ifield->col_no), + ifield->prefix_len); } /* Add the index to SYS_INDEXES, using the index prototype. */ @@ -2909,15 +3378,14 @@ row_merge_create_index( if (err == DB_SUCCESS) { - index = row_merge_dict_table_get_index( - table, index_def); + index = dict_table_get_index_on_name(table, index_def->name); ut_a(index); /* Note the id of the transaction that created this index, we use it to restrict readers from accessing this index, to ensure read consistency. */ - index->trx_id = trx->id; + ut_ad(index->trx_id == trx->id); } else { index = NULL; } @@ -2934,35 +3402,46 @@ row_merge_is_index_usable( const trx_t* trx, /*!< in: transaction */ const dict_index_t* index) /*!< in: index to check */ { + if (!dict_index_is_clust(index) + && dict_index_is_online_ddl(index)) { + /* Indexes that are being created are not useable. */ + return(FALSE); + } + return(!dict_index_is_corrupted(index) - && (!trx->read_view - || read_view_sees_trx_id(trx->read_view, index->trx_id))); + && (dict_table_is_temporary(index->table) + || !trx->read_view + || read_view_sees_trx_id(trx->read_view, index->trx_id))); } /*********************************************************************//** -Drop the old table. +Drop a table. The caller must have ensured that the background stats +thread is not processing the table. This can be done by calling +dict_stats_wait_bg_to_stop_using_tables() after locking the dictionary and +before calling this function. @return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_drop_table( /*=================*/ trx_t* trx, /*!< in: transaction */ dict_table_t* table) /*!< in: table to drop */ { + ut_ad(!srv_read_only_mode); + /* There must be no open transactions on the table. */ ut_a(table->n_ref_count == 0); - return(row_drop_table_for_mysql(table->name, trx, FALSE)); + return(row_drop_table_for_mysql(table->name, trx, false, false)); } - /*********************************************************************//** Build indexes on a table by reading a clustered index, creating a temporary file containing index entries, merge sorting these index entries and inserting sorted index entries to indexes. 
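Looking back at row_merge_is_index_usable() above: a transaction may use a freshly built index only if the index is not still under online creation, is not corrupted, and (for non-temporary tables) the transaction's read view can see the transaction that created it. A toy model of that visibility rule, assuming a deliberately simplified read view in which everything below the snapshot's low limit is visible; real read views also carry the set of transactions active at snapshot time, and all names here are invented.

#include <cstdio>

typedef unsigned long long trx_id_t;

/* Grossly simplified read view: the transaction sees everything that was
committed before 'low_limit_id'. */
struct ToyReadView {
        trx_id_t        low_limit_id;
        bool sees(trx_id_t id) const { return id < low_limit_id; }
};

struct ToyIndex {
        trx_id_t        creator_trx_id; /* transaction that built the index */
        bool            being_built;    /* still under online creation */
        bool            corrupted;
};

static bool index_usable(const ToyReadView& view, const ToyIndex& index)
{
        return !index.being_built
                && !index.corrupted
                && view.sees(index.creator_trx_id);
}

int main()
{
        ToyReadView     old_reader = {100};             /* snapshot at id 100 */
        ToyIndex        new_index  = {150, false, false};/* built by trx 150 */

        /* The older reader must not use the index: its snapshot predates
        the index, and the build skipped delete-marked history. */
        std::printf("usable: %s\n",
                    index_usable(old_reader, new_index) ? "yes" : "no");
        return 0;
}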
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_merge_build_indexes( /*====================*/ trx_t* trx, /*!< in: transaction */ @@ -2971,54 +3450,59 @@ row_merge_build_indexes( dict_table_t* new_table, /*!< in: table where indexes are created; identical to old_table unless creating a PRIMARY KEY */ + bool online, /*!< in: true if creating indexes + online */ dict_index_t** indexes, /*!< in: indexes to be created */ + const ulint* key_numbers, /*!< in: MySQL key numbers */ ulint n_indexes, /*!< in: size of indexes[] */ - struct TABLE* table) /*!< in/out: MySQL table, for + struct TABLE* table, /*!< in/out: MySQL table, for reporting erroneous key value if applicable */ + const dtuple_t* add_cols, /*!< in: default values of + added columns, or NULL */ + const ulint* col_map, /*!< in: mapping of old column + numbers to new ones, or NULL + if old_table == new_table */ + ulint add_autoinc, /*!< in: number of added + AUTO_INCREMENT column, or + ULINT_UNDEFINED if none is added */ + ib_sequence_t& sequence) /*!< in: autoinc instance if + add_autoinc != ULINT_UNDEFINED */ { merge_file_t* merge_files; row_merge_block_t* block; ulint block_size; ulint i; ulint j; - ulint error; - int tmpfd = -1; + dberr_t error; + int tmpfd; dict_index_t* fts_sort_idx = NULL; fts_psort_t* psort_info = NULL; fts_psort_t* merge_info = NULL; ib_int64_t sig_count = 0; - ut_ad(trx); - ut_ad(old_table); - ut_ad(new_table); - ut_ad(indexes); - ut_ad(n_indexes); - - trx_start_if_not_started_xa(trx); + ut_ad(!srv_read_only_mode); + ut_ad((old_table == new_table) == !col_map); + ut_ad(!add_cols || col_map); /* Allocate memory for merge file data structure and initialize fields */ - merge_files = static_cast<merge_file_t*>( - mem_alloc(n_indexes * sizeof *merge_files)); - block_size = 3 * srv_sort_buf_size; block = static_cast<row_merge_block_t*>( os_mem_alloc_large(&block_size)); - /* Initialize all the merge file descriptors, so that we - don't call row_merge_file_destroy() on uninitialized - merge file descriptor */ - - for (i = 0; i < n_indexes; i++) { - merge_files[i].fd = -1; + if (block == NULL) { + return(DB_OUT_OF_MEMORY); } - for (i = 0; i < n_indexes; i++) { + trx_start_if_not_started_xa(trx); - if (row_merge_file_create(&merge_files[i]) < 0) - { + merge_files = static_cast<merge_file_t*>( + mem_alloc(n_indexes * sizeof *merge_files)); + + for (i = 0; i < n_indexes; i++) { + if (row_merge_file_create(&merge_files[i]) < 0) { error = DB_OUT_OF_MEMORY; goto func_exit; } @@ -3031,19 +3515,24 @@ row_merge_build_indexes( we need to build a "fts sort index" indexing on above three 'fields' */ fts_sort_idx = row_merge_create_fts_sort_index( - indexes[i], old_table, - &opt_doc_id_size); - - row_fts_psort_info_init(trx, table, new_table, - fts_sort_idx, opt_doc_id_size, - &psort_info, &merge_info); + indexes[i], old_table, &opt_doc_id_size); + + row_merge_dup_t* dup = static_cast<row_merge_dup_t*>( + ut_malloc(sizeof *dup)); + dup->index = fts_sort_idx; + dup->table = table; + dup->col_map = col_map; + dup->n_dup = 0; + + row_fts_psort_info_init( + trx, dup, new_table, opt_doc_id_size, + &psort_info, &merge_info); } } tmpfd = row_merge_file_create_low(); - if (tmpfd < 0) - { + if (tmpfd < 0) { error = DB_OUT_OF_MEMORY; goto func_exit; } @@ -3056,31 +3545,61 @@ row_merge_build_indexes( secondary index entries for merge sort */ error = row_merge_read_clustered_index( - trx, table, old_table, new_table, indexes, - fts_sort_idx, psort_info, merge_files, n_indexes, block); + trx, table, old_table, new_table, 
online, indexes, + fts_sort_idx, psort_info, merge_files, key_numbers, + n_indexes, add_cols, col_map, + add_autoinc, sequence, block); if (error != DB_SUCCESS) { goto func_exit; } + DEBUG_SYNC_C("row_merge_after_scan"); + /* Now we have files containing index entries ready for sorting and inserting. */ for (i = 0; i < n_indexes; i++) { - dict_index_t* sort_idx; - - sort_idx = (indexes[i]->type & DICT_FTS) - ? fts_sort_idx - : indexes[i]; + dict_index_t* sort_idx = indexes[i]; if (indexes[i]->type & DICT_FTS) { os_event_t fts_parallel_merge_event; + bool all_exit = false; + ulint trial_count = 0; + + sort_idx = fts_sort_idx; + + /* Now all children should complete, wait + a bit until they all finish using event */ + while (!all_exit && trial_count < 10000) { + all_exit = true; + + for (j = 0; j < fts_sort_pll_degree; + j++) { + if (psort_info[j].child_status + != FTS_CHILD_EXITING) { + all_exit = false; + os_thread_sleep(1000); + break; + } + } + trial_count++; + } + + if (!all_exit) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Not all child sort threads exited" + " when creating FTS index '%s'", + indexes[i]->name); + } fts_parallel_merge_event - = merge_info[0].psort_common->sort_event; + = merge_info[0].psort_common->merge_event; if (FTS_PLL_MERGE) { + trial_count = 0; + all_exit = false; os_event_reset(fts_parallel_merge_event); row_fts_start_parallel_merge(merge_info); wait_again: @@ -3090,33 +3609,64 @@ wait_again: for (j = 0; j < FTS_NUM_AUX_INDEX; j++) { if (merge_info[j].child_status - != FTS_CHILD_COMPLETE) { + != FTS_CHILD_COMPLETE + && merge_info[j].child_status + != FTS_CHILD_EXITING) { sig_count = os_event_reset( fts_parallel_merge_event); goto wait_again; } } + + /* Now all children should complete, wait + a bit until they all finish using event */ + while (!all_exit && trial_count < 10000) { + all_exit = true; + + for (j = 0; j < FTS_NUM_AUX_INDEX; + j++) { + if (merge_info[j].child_status + != FTS_CHILD_EXITING) { + all_exit = false; + os_thread_sleep(1000); + break; + } + } + trial_count++; + } + + if (!all_exit) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Not all child merge threads" + " exited when creating FTS" + " index '%s'", + indexes[i]->name); + } } else { + /* This cannot report duplicates; an + assertion would fail in that case. */ error = row_fts_merge_insert( sort_idx, new_table, psort_info, 0); } +#ifdef FTS_INTERNAL_DIAG_PRINT + DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); +#endif } else { - error = row_merge_sort(trx, sort_idx, &merge_files[i], - block, &tmpfd, table); + row_merge_dup_t dup = { + sort_idx, table, col_map, 0}; + + error = row_merge_sort( + trx, &dup, &merge_files[i], + block, &tmpfd); if (error == DB_SUCCESS) { error = row_merge_insert_index_tuples( - trx, sort_idx, new_table, - dict_table_zip_size(old_table), + trx->id, sort_idx, old_table, merge_files[i].fd, block); } - -#ifdef FTS_INTERNAL_DIAG_PRINT - DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n"); -#endif } /* Close the temporary file to free up space. */ @@ -3124,10 +3674,20 @@ wait_again: if (indexes[i]->type & DICT_FTS) { row_fts_psort_info_destroy(psort_info, merge_info); + } else if (error != DB_SUCCESS || !online) { + /* Do not apply any online log. 
*/ + } else if (old_table != new_table) { + ut_ad(!sort_idx->online_log); + ut_ad(sort_idx->online_status + == ONLINE_INDEX_COMPLETE); + } else { + DEBUG_SYNC_C("row_log_apply_before"); + error = row_log_apply(trx, sort_idx, table); + DEBUG_SYNC_C("row_log_apply_after"); } if (error != DB_SUCCESS) { - trx->error_key_num = i; + trx->error_key_num = key_numbers[i]; goto func_exit; } @@ -3148,7 +3708,7 @@ func_exit: DBUG_EXECUTE_IF( "ib_build_indexes_too_many_concurrent_trxs", error = DB_TOO_MANY_CONCURRENT_TRXS; - trx->error_state = static_cast<db_err>(error);); + trx->error_state = error;); row_merge_file_destroy_low(tmpfd); @@ -3163,5 +3723,45 @@ func_exit: mem_free(merge_files); os_mem_free_large(block, block_size); + DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID); + + if (online && old_table == new_table && error != DB_SUCCESS) { + /* On error, flag all online secondary index creation + as aborted. */ + for (i = 0; i < n_indexes; i++) { + ut_ad(!(indexes[i]->type & DICT_FTS)); + ut_ad(*indexes[i]->name == TEMP_INDEX_PREFIX); + ut_ad(!dict_index_is_clust(indexes[i])); + + /* Completed indexes should be dropped as + well, and indexes whose creation was aborted + should be dropped from the persistent + storage. However, at this point we can only + set some flags in the not-yet-published + indexes. These indexes will be dropped later + in row_merge_drop_indexes(), called by + rollback_inplace_alter_table(). */ + + switch (dict_index_get_online_status(indexes[i])) { + case ONLINE_INDEX_COMPLETE: + break; + case ONLINE_INDEX_CREATION: + rw_lock_x_lock( + dict_index_get_lock(indexes[i])); + row_log_abort_sec(indexes[i]); + indexes[i]->type |= DICT_CORRUPT; + rw_lock_x_unlock( + dict_index_get_lock(indexes[i])); + new_table->drop_aborted = TRUE; + /* fall through */ + case ONLINE_INDEX_ABORTED_DROPPED: + case ONLINE_INDEX_ABORTED: + MONITOR_MUTEX_INC( + &dict_sys->mutex, + MONITOR_BACKGROUND_DROP_INDEX); + } + } + } + return(error); } diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 7a07833fa16..f46d202eed8 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -30,6 +30,9 @@ Created 9/17/2000 Heikki Tuuri #include "row0mysql.ic" #endif +#include <debug_sync.h> +#include <my_dbug.h> + #include "row0ins.h" #include "row0merge.h" #include "row0sel.h" @@ -42,6 +45,7 @@ Created 9/17/2000 Heikki Tuuri #include "dict0load.h" #include "dict0boot.h" #include "dict0stats.h" +#include "dict0stats_bg.h" #include "trx0roll.h" #include "trx0purge.h" #include "trx0rec.h" @@ -54,16 +58,16 @@ Created 9/17/2000 Heikki Tuuri #include "ibuf0ibuf.h" #include "fts0fts.h" #include "fts0types.h" -#include "srv0mon.h" +#include "srv0start.h" +#include "row0import.h" +#include "m_string.h" +#include "my_sys.h" /** Provide optional 4.x backwards compatibility for 5.0 and above */ UNIV_INTERN ibool row_rollback_on_timeout = FALSE; /** Chain node of the list of tables to drop in the background. */ -typedef struct row_mysql_drop_struct row_mysql_drop_t; - -/** Chain node of the list of tables to drop in the background. */ -struct row_mysql_drop_struct{ +struct row_mysql_drop_t{ char* table_name; /*!< table name */ UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list; /*!< list chain node */ @@ -82,7 +86,7 @@ more. Protected by row_drop_list_mutex. */ static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list; /** Mutex protecting the background table drop list. 
*/ -static mutex_t row_drop_list_mutex; +static ib_mutex_t row_drop_list_mutex; /** Flag: has row_mysql_drop_list been initialized? */ static ibool row_mysql_drop_list_inited = FALSE; @@ -570,21 +574,21 @@ next_column: /****************************************************************//** Handles user errors and lock waits detected by the database engine. -@return TRUE if it was a lock wait and we should continue running the +@return true if it was a lock wait and we should continue running the query thread and in that case the thr is ALREADY in the running state. */ UNIV_INTERN -ibool +bool row_mysql_handle_errors( /*====================*/ - ulint* new_err,/*!< out: possible new error encountered in + dberr_t* new_err,/*!< out: possible new error encountered in lock wait, or if no new error, the value of trx->error_state at the entry of this function */ trx_t* trx, /*!< in: transaction */ - que_thr_t* thr, /*!< in: query thread */ - trx_savept_t* savept) /*!< in: savepoint or NULL */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ { - ulint err; + dberr_t err; handle_new_error: err = trx->error_state; @@ -612,6 +616,7 @@ handle_new_error: case DB_READ_ONLY: case DB_FTS_INVALID_DOCID: case DB_INTERRUPTED: + case DB_DICT_CHANGED: if (savept) { /* Roll back the latest, possibly incomplete insertion or update */ @@ -631,7 +636,7 @@ handle_new_error: *new_err = err; - return(TRUE); + return(true); case DB_DEADLOCK: case DB_LOCK_TABLE_FULL: @@ -648,6 +653,7 @@ handle_new_error: " a new data file to\n" "InnoDB: my.cnf and restart the database.\n", stderr); + ut_ad(0); exit(1); case DB_CORRUPTION: @@ -686,7 +692,7 @@ handle_new_error: trx->error_state = DB_SUCCESS; - return(FALSE); + return(false); } /********************************************************************//** @@ -774,7 +780,7 @@ row_create_prebuilt( prebuilt->clust_ref = ref; - prebuilt->autoinc_error = 0; + prebuilt->autoinc_error = DB_SUCCESS; prebuilt->autoinc_offset = 0; /* Default to 1, we will set the actual value later in @@ -883,7 +889,7 @@ row_prebuilt_free( mem_free(base); } - dict_table_close(prebuilt->table, dict_locked); + dict_table_close(prebuilt->table, dict_locked, TRUE); mem_heap_free(prebuilt->heap); } @@ -950,44 +956,62 @@ row_get_prebuilt_insert_row( row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL handle */ { - ins_node_t* node; - dtuple_t* row; - dict_table_t* table = prebuilt->table; + dict_table_t* table = prebuilt->table; ut_ad(prebuilt && table && prebuilt->trx); - if (prebuilt->ins_node == NULL) { - - /* Not called before for this handle: create an insert node - and query graph to the prebuilt struct */ + if (prebuilt->ins_node != 0) { - node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + /* Check if indexes have been dropped or added and we + may need to rebuild the row insert template. 
*/ - prebuilt->ins_node = node; + if (prebuilt->trx_id == table->def_trx_id + && UT_LIST_GET_LEN(prebuilt->ins_node->entry_list) + == UT_LIST_GET_LEN(table->indexes)) { - if (prebuilt->ins_upd_rec_buff == NULL) { - prebuilt->ins_upd_rec_buff = static_cast<byte*>( - mem_heap_alloc( - prebuilt->heap, - prebuilt->mysql_row_len)); + return(prebuilt->ins_node->row); } - row = dtuple_create(prebuilt->heap, - dict_table_get_n_cols(table)); + ut_ad(prebuilt->trx_id < table->def_trx_id); - dict_table_copy_types(row, table); + que_graph_free_recursive(prebuilt->ins_graph); - ins_node_set_new_row(node, row); + prebuilt->ins_graph = 0; + } - prebuilt->ins_graph = static_cast<que_fork_t*>( - que_node_get_parent( - pars_complete_graph_for_exec( - node, - prebuilt->trx, prebuilt->heap))); + /* Create an insert node and query graph to the prebuilt struct */ - prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + ins_node_t* node; + + node = ins_node_create(INS_DIRECT, table, prebuilt->heap); + + prebuilt->ins_node = node; + + if (prebuilt->ins_upd_rec_buff == 0) { + prebuilt->ins_upd_rec_buff = static_cast<byte*>( + mem_heap_alloc( + prebuilt->heap, + prebuilt->mysql_row_len)); } + dtuple_t* row; + + row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table)); + + dict_table_copy_types(row, table); + + ins_node_set_new_row(node, row); + + prebuilt->ins_graph = static_cast<que_fork_t*>( + que_node_get_parent( + pars_complete_graph_for_exec( + node, + prebuilt->trx, prebuilt->heap))); + + prebuilt->ins_graph->state = QUE_FORK_ACTIVE; + + prebuilt->trx_id = table->def_trx_id; + return(prebuilt->ins_node->row); } @@ -1000,23 +1024,41 @@ row_update_statistics_if_needed( /*============================*/ dict_table_t* table) /*!< in: table */ { - ulint counter; + ib_uint64_t counter; + ib_uint64_t n_rows; + + if (!table->stat_initialized) { + DBUG_EXECUTE_IF( + "test_upd_stats_if_needed_not_inited", + fprintf(stderr, "test_upd_stats_if_needed_not_inited " + "was executed\n"); + ); + return; + } - counter = table->stat_modified_counter; + counter = table->stat_modified_counter++; + n_rows = dict_table_get_n_rows(table); - table->stat_modified_counter = counter + 1; + if (dict_stats_is_persistent_enabled(table)) { + if (counter > n_rows / 10 /* 10% */ + && dict_stats_auto_recalc_is_enabled(table)) { + + dict_stats_recalc_pool_add(table); + table->stat_modified_counter = 0; + } + return; + } /* Calculate new statistics if 1 / 16 of table has been modified - since the last time a statistics batch was run, or if - stat_modified_counter > 2 000 000 000 (to avoid wrap-around). + since the last time a statistics batch was run. We calculate statistics at most every 16th round, since we may have a counter table which is very small and updated very often. */ - if (counter > 2000000000 - || ((ib_int64_t) counter > 16 + table->stat_n_rows / 16)) { + if (counter > 16 + n_rows / 16 /* 6.25% */) { ut_ad(!mutex_own(&dict_sys->mutex)); - dict_stats_update(table, DICT_STATS_FETCH, FALSE); + /* this will reset table->stat_modified_counter to 0 */ + dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT); } } @@ -1028,7 +1070,7 @@ It is not compatible with another AUTO_INC or exclusive lock on the table. 
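/* Editor's illustration, not part of the patch: the two recalculation
thresholds applied by row_update_statistics_if_needed() above, worked out
for a table of one million rows. The constants come directly from the code
above; the variable names exist only for this example. */
	ib_uint64_t	n_rows = 1000000;

	/* Persistent statistics: an auto-recalc is queued once more than
	10% of the rows have been modified since the last recalculation. */
	ib_uint64_t	persistent_threshold = n_rows / 10;	/* 100000 */

	/* Transient statistics: recalculated inline once more than
	16 + n_rows / 16 (about 6.25%) rows have been modified. */
	ib_uint64_t	transient_threshold = 16 + n_rows / 16;	/* 62516 */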
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_lock_table_autoinc_for_mysql( /*=============================*/ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL @@ -1038,7 +1080,7 @@ row_lock_table_autoinc_for_mysql( ins_node_t* node = prebuilt->ins_node; const dict_table_t* table = prebuilt->table; que_thr_t* thr; - ulint err; + dberr_t err; ibool was_lock_wait; ut_ad(trx); @@ -1053,10 +1095,8 @@ row_lock_table_autoinc_for_mysql( trx->op_info = "setting auto-inc lock"; - if (node == NULL) { - row_get_prebuilt_insert_row(prebuilt); - node = prebuilt->ins_node; - } + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; /* We use the insert query graph as the dummy graph needed in the lock module call */ @@ -1076,7 +1116,7 @@ run_again: err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr); - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); @@ -1089,21 +1129,21 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } que_thr_stop_for_mysql_no_error(thr, trx); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** Sets a table lock on the table mentioned in prebuilt. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_lock_table_for_mysql( /*=====================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL @@ -1117,7 +1157,7 @@ row_lock_table_for_mysql( { trx_t* trx = prebuilt->trx; que_thr_t* thr; - ulint err; + dberr_t err; ibool was_lock_wait; ut_ad(trx); @@ -1157,7 +1197,7 @@ run_again: thr); } - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { que_thr_stop_for_mysql(thr); @@ -1170,21 +1210,21 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } que_thr_stop_for_mysql_no_error(thr, trx); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** Does an insert for MySQL. 
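/* Editor's sketch, not part of the patch: the lock-wait retry idiom used by
the row_lock_*_for_mysql() functions above and by the insert and update
paths later in this file. 'node', 'thr', 'err' and 'savept' are assumed to
be set up as in those functions; the step call is a placeholder for
whichever operation is being retried. */
run_again:
	thr->run_node = node;
	thr->prev_node = node;

	row_upd_step(thr);	/* or row_ins_step(), lock_table(), ... */

	err = trx->error_state;

	if (err != DB_SUCCESS) {
		que_thr_stop_for_mysql(thr);

		/* row_mysql_handle_errors() returns true only for a lock
		wait that has been resolved; the thread is then already back
		in the running state and the same node can be re-run. */
		if (row_mysql_handle_errors(&err, trx, thr, &savept)) {
			goto run_again;
		}
	} else {
		que_thr_stop_for_mysql_no_error(thr, trx);
	}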
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_insert_for_mysql( /*=================*/ byte* mysql_rec, /*!< in: row in the MySQL format */ @@ -1193,7 +1233,7 @@ row_insert_for_mysql( { trx_savept_t savept; que_thr_t* thr; - ulint err; + dberr_t err; ibool was_lock_wait; trx_t* trx = prebuilt->trx; ins_node_t* node = prebuilt->ins_node; @@ -1201,24 +1241,23 @@ row_insert_for_mysql( ut_ad(trx); - if (table->ibd_file_missing) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error:\n" - "InnoDB: MySQL is trying to use a table handle" - " but the .ibd file for\n" - "InnoDB: table %s does not exist.\n" - "InnoDB: Have you deleted the .ibd file" - " from the database directory under\n" - "InnoDB: the MySQL datadir, or have you" - " used DISCARD TABLESPACE?\n" - "InnoDB: Look from\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n" - "InnoDB: how you can resolve the problem.\n", + if (dict_table_is_discarded(prebuilt->table)) { + ib_logf(IB_LOG_LEVEL_ERROR, + "The table %s doesn't have a corresponding " + "tablespace, it was discarded.", prebuilt->table->name); - return(DB_ERROR); - } - if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) { + return(DB_TABLESPACE_DELETED); + + } else if (prebuilt->table->ibd_file_missing) { + + ib_logf(IB_LOG_LEVEL_ERROR, + ".ibd file is missing for table %s", + prebuilt->table->name); + + return(DB_TABLESPACE_NOT_FOUND); + + } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" "InnoDB: table handle. Magic n %lu, table name ", @@ -1229,9 +1268,7 @@ row_insert_for_mysql( mem_analyze_corruption(prebuilt); ut_error; - } - - if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) { + } else if (srv_created_new_raw || srv_force_recovery) { fputs("InnoDB: A new raw disk partition was initialized or\n" "InnoDB: innodb_force_recovery is on: we do not allow\n" "InnoDB: database modifications by the user. Shut down\n" @@ -1249,10 +1286,8 @@ row_insert_for_mysql( trx_start_if_not_started_xa(trx); - if (node == NULL) { - row_get_prebuilt_insert_row(prebuilt); - node = prebuilt->ins_node; - } + row_get_prebuilt_insert_row(prebuilt); + node = prebuilt->ins_node; row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec); @@ -1290,12 +1325,14 @@ error_exit: thr->lock_state = QUE_THR_LOCK_NOLOCK; if (was_lock_wait) { + ut_ad(node->state == INS_NODE_INSERT_ENTRIES + || node->state == INS_NODE_ALLOC_ROW_ID); goto run_again; } trx->op_info = ""; - return((int) err); + return(err); } if (dict_table_has_fts_index(table)) { @@ -1353,19 +1390,18 @@ error_exit: que_thr_stop_for_mysql_no_error(thr, trx); - table->stat_n_rows++; + srv_stats.n_rows_inserted.add((size_t)trx->id, 1); - srv_n_rows_inserted++; - - if (prebuilt->table->stat_n_rows == 0) { - /* Avoid wrap-over */ - table->stat_n_rows--; - } + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_inc(table); row_update_statistics_if_needed(table); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** @@ -1490,7 +1526,7 @@ row_fts_do_update( Handles FTS matters for an update or a delete. NOTE: should not be called if the table does not have an FTS index. 
.*/ static -ulint +dberr_t row_fts_update_or_delete( /*=====================*/ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL @@ -1530,16 +1566,18 @@ void init_fts_doc_id_for_ref( /*====================*/ dict_table_t* table, /*!< in: table */ - ulint depth) /*!< in: recusive call depth */ + ulint* depth) /*!< in: recusive call depth */ { dict_foreign_t* foreign; foreign = UT_LIST_GET_FIRST(table->referenced_list); - depth++; + table->fk_max_recusive_level = 0; + + (*depth)++; /* Limit on tables involved in cascading delete/update */ - if (depth > FK_MAX_CASCADE_DEL) { + if (*depth > FK_MAX_CASCADE_DEL) { return; } @@ -1563,7 +1601,7 @@ init_fts_doc_id_for_ref( Does an update or delete of a row for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_update_for_mysql( /*=================*/ byte* mysql_rec, /*!< in: the row to be updated, in @@ -1572,7 +1610,7 @@ row_update_for_mysql( handle */ { trx_savept_t savept; - ulint err; + dberr_t err; que_thr_t* thr; ibool was_lock_wait; dict_index_t* clust_index; @@ -1580,6 +1618,7 @@ row_update_for_mysql( upd_node_t* node; dict_table_t* table = prebuilt->table; trx_t* trx = prebuilt->trx; + ulint fk_depth = 0; ut_ad(prebuilt && trx); UT_NOT_USED(mysql_rec); @@ -1626,14 +1665,26 @@ row_update_for_mysql( return(DB_ERROR); } + DEBUG_SYNC_C("innodb_row_update_for_mysql_begin"); + trx->op_info = "updating or deleting"; row_mysql_delay_if_needed(); - init_fts_doc_id_for_ref(table, 0); - trx_start_if_not_started_xa(trx); + if (dict_table_is_referenced_by_foreign_key(table)) { + /* Share lock the data dictionary to prevent any + table dictionary (for foreign constraint) change. + This is similar to row_ins_check_foreign_constraint + check protect by the dictionary lock as well. + In the future, this can be removed once the Foreign + key MDL is implemented */ + row_mysql_freeze_data_dictionary(trx); + init_fts_doc_id_for_ref(table, &fk_depth); + row_mysql_unfreeze_data_dictionary(trx); + } + node = prebuilt->upd_node; clust_index = dict_table_get_first_index(table); @@ -1683,10 +1734,13 @@ run_again: trx->error_state = DB_SUCCESS; trx->op_info = ""; - return((int) err); + return(err); } thr->lock_state= QUE_THR_LOCK_ROW; + + DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error"); + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, &savept); thr->lock_state= QUE_THR_LOCK_NOLOCK; @@ -1697,7 +1751,7 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } que_thr_stop_for_mysql_no_error(thr, trx); @@ -1707,18 +1761,20 @@ run_again: err = row_fts_update_or_delete(prebuilt); if (err != DB_SUCCESS) { trx->op_info = ""; - return((int) err); + return(err); } } if (node->is_delete) { - if (prebuilt->table->stat_n_rows > 0) { - prebuilt->table->stat_n_rows--; - } + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(prebuilt->table); - srv_n_rows_deleted++; + srv_stats.n_rows_deleted.add((size_t)trx->id, 1); } else { - srv_n_rows_updated++; + srv_stats.n_rows_updated.add((size_t)trx->id, 1); } /* We update table statistics only if it is a DELETE or UPDATE @@ -1730,7 +1786,7 @@ run_again: trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** @@ -1744,7 +1800,7 @@ prebuilt->clust_pcur. 
Thus, this implements a 'mini-rollback' that releases the latest clustered index record lock we set. @return error code or DB_SUCCESS */ UNIV_INTERN -int +void row_unlock_for_mysql( /*=================*/ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL @@ -1770,8 +1826,7 @@ row_unlock_for_mysql( "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n" "InnoDB: this session is not using" " READ COMMITTED isolation level.\n"); - - return(DB_SUCCESS); + return; } trx->op_info = "unlock_row"; @@ -1863,15 +1918,13 @@ no_unlock: } trx->op_info = ""; - - return(DB_SUCCESS); } /**********************************************************************//** Does a cascaded delete or set null in a foreign key operation. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_update_cascade_for_mysql( /*=========================*/ que_thr_t* thr, /*!< in: query thread */ @@ -1879,7 +1932,7 @@ row_update_cascade_for_mysql( or set null operation */ dict_table_t* table) /*!< in: table where we do the operation */ { - ulint err; + dberr_t err; trx_t* trx; trx = thr_get_trx(thr); @@ -1890,12 +1943,14 @@ row_update_cascade_for_mysql( thr->fk_cascade_depth++; if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) { - return (DB_FOREIGN_EXCEED_MAX_CASCADE); + return(DB_FOREIGN_EXCEED_MAX_CASCADE); } run_again: thr->run_node = node; thr->prev_node = node; + DEBUG_SYNC_C("foreign_constraint_update_cascade"); + row_upd_step(thr); /* The recursive call for cascading update/delete happens @@ -1937,13 +1992,15 @@ run_again: } if (node->is_delete) { - if (table->stat_n_rows > 0) { - table->stat_n_rows--; - } + /* Not protected by dict_table_stats_lock() for performance + reasons, we would rather get garbage in stat_n_rows (which is + just an estimate anyway) than protecting the following code + with a latch. */ + dict_table_n_rows_dec(table); - srv_n_rows_deleted++; + srv_stats.n_rows_deleted.add((size_t)trx->id, 1); } else { - srv_n_rows_updated++; + srv_stats.n_rows_updated.add((size_t)trx->id, 1); } row_update_statistics_if_needed(table); @@ -1981,7 +2038,7 @@ row_mysql_freeze_data_dictionary_func( { ut_a(trx->dict_operation_lock_mode == 0); - rw_lock_s_lock_func(&dict_operation_lock, 0, file, line); + rw_lock_s_lock_inline(&dict_operation_lock, 0, file, line); trx->dict_operation_lock_mode = RW_S_LATCH; } @@ -1994,6 +2051,8 @@ row_mysql_unfreeze_data_dictionary( /*===============================*/ trx_t* trx) /*!< in/out: transaction */ { + ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); + ut_a(trx->dict_operation_lock_mode == RW_S_LATCH); rw_lock_s_unlock(&dict_operation_lock); @@ -2018,7 +2077,7 @@ row_mysql_lock_data_dictionary_func( /* Serialize data dictionary operations with dictionary mutex: no deadlocks or lock waits can occur then in these operations */ - rw_lock_x_lock_func(&dict_operation_lock, 0, file, line); + rw_lock_x_lock_inline(&dict_operation_lock, 0, file, line); trx->dict_operation_lock_mode = RW_X_LATCH; mutex_enter(&(dict_sys->mutex)); @@ -2032,6 +2091,8 @@ row_mysql_unlock_data_dictionary( /*=============================*/ trx_t* trx) /*!< in/out: transaction */ { + ut_ad(lock_trx_has_sys_table_locks(trx) == NULL); + ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); /* Serialize data dictionary operations with dictionary mutex: @@ -2052,19 +2113,21 @@ InnoDB will try to invoke mem_validate(). On failure the transaction will be rolled back and the 'table' object will be freed. 
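/* Editor's sketch, not part of the patch: how the two dictionary
protection levels defined above are meant to be paired. S mode ("freeze")
only keeps the dictionary from changing under the caller; X mode ("lock")
also takes dict_sys->mutex and is what the DDL paths in this file use.
Both release functions now assert that the transaction holds no locks on
SYS_ tables. */

	/* Reader side: e.g. walking foreign key metadata, as
	row_update_for_mysql() does above. */
	row_mysql_freeze_data_dictionary(trx);
	/* ... read-only use of dict_sys ... */
	row_mysql_unfreeze_data_dictionary(trx);

	/* Writer side: exclusive access for dictionary operations. */
	row_mysql_lock_data_dictionary(trx);
	/* ... modify dict_sys and the SYS_ tables ... */
	row_mysql_unlock_data_dictionary(trx);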
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_create_table_for_mysql( /*=======================*/ dict_table_t* table, /*!< in, own: table definition - (will be freed) */ - trx_t* trx) /*!< in: transaction handle */ + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: if true, commit the transaction */ { tab_node_t* node; mem_heap_t* heap; que_thr_t* thr; const char* table_name; ulint table_name_len; - ulint err; + dberr_t err; #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -2072,6 +2135,11 @@ row_create_table_for_mysql( ut_ad(mutex_own(&(dict_sys->mutex))); ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH); + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_start_of_row_create_table_for_mysql", + goto err_exit; + ); + if (srv_created_new_raw) { fputs("InnoDB: A new raw disk partition was initialized:\n" "InnoDB: we do not allow database modifications" @@ -2080,7 +2148,10 @@ row_create_table_for_mysql( " is replaced with raw.\n", stderr); err_exit: dict_mem_table_free(table); - trx_commit_for_mysql(trx); + + if (commit) { + trx_commit_for_mysql(trx); + } return(DB_ERROR); } @@ -2117,23 +2188,23 @@ err_exit: /* The lock timeout monitor thread also takes care of InnoDB monitor prints */ - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } else if (STR_EQ(table_name, table_name_len, S_innodb_lock_monitor)) { srv_print_innodb_monitor = TRUE; srv_print_innodb_lock_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } else if (STR_EQ(table_name, table_name_len, S_innodb_tablespace_monitor)) { srv_print_innodb_tablespace_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); } else if (STR_EQ(table_name, table_name_len, S_innodb_table_monitor)) { srv_print_innodb_table_monitor = TRUE; - os_event_set(srv_timeout_event); + os_event_set(lock_sys->timeout_event); #ifdef UNIV_MEM_DEBUG } else if (STR_EQ(table_name, table_name_len, S_innodb_mem_validate)) { @@ -2152,12 +2223,21 @@ err_exit: #endif /* UNIV_MEM_DEBUG */ } - heap = mem_heap_create(512); - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + case TRX_DICT_OP_TABLE: + break; + case TRX_DICT_OP_INDEX: + /* If the transaction was previously flagged as + TRX_DICT_OP_INDEX, we should be creating auxiliary + tables for full-text indexes. */ + ut_ad(strstr(table->name, "/FTS_") != NULL); + } - node = tab_create_graph_create(table, heap); + node = tab_create_graph_create(table, heap, commit); thr = pars_complete_graph_for_exec(node, trx, heap); @@ -2168,6 +2248,29 @@ err_exit: err = trx->error_state; + if (table->space != TRX_SYS_SPACE) { + ut_a(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE)); + + /* Update SYS_TABLESPACES and SYS_DATAFILES if a new + tablespace was created. */ + if (err == DB_SUCCESS) { + char* path; + path = fil_space_get_first_path(table->space); + + err = dict_create_add_tablespace_to_dictionary( + table->space, table->name, + fil_space_get_flags(table->space), + path, trx, commit); + + mem_free(path); + } + + if (err != DB_SUCCESS) { + /* We must delete the link file. 
*/ + fil_delete_link_file(table->name); + } + } + switch (err) { case DB_SUCCESS: break; @@ -2181,8 +2284,8 @@ err_exit: ut_print_name(stderr, trx, TRUE, table->name); fputs(" because tablespace full\n", stderr); - if (dict_table_open_on_name_no_stats( - table->name, FALSE, DICT_ERR_IGNORE_NONE)) { + if (dict_table_open_on_name(table->name, TRUE, FALSE, + DICT_ERR_IGNORE_NONE)) { /* Make things easy for the drop table code. */ @@ -2190,10 +2293,13 @@ err_exit: dict_table_move_from_lru_to_non_lru(table); } - dict_table_close(table, FALSE); + dict_table_close(table, TRUE, FALSE); row_drop_table_for_mysql(table->name, trx, FALSE); - trx_commit_for_mysql(trx); + + if (commit) { + trx_commit_for_mysql(trx); + } } else { dict_mem_table_free(table); } @@ -2203,7 +2309,12 @@ err_exit: case DB_TOO_MANY_CONCURRENT_TRXS: /* We already have .ibd file here. it should be deleted. */ - if (table->space && !fil_delete_tablespace(table->space)) { + if (table->space + && fil_delete_tablespace( + table->space, + BUF_REMOVE_FLUSH_NO_WRITE) + != DB_SUCCESS) { + ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: not able to" @@ -2215,10 +2326,8 @@ err_exit: /* fall through */ case DB_DUPLICATE_KEY: + case DB_TABLESPACE_EXISTS: default: - /* We may also get err == DB_ERROR if the .ibd file for the - table already exists */ - trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); dict_mem_table_free(table); @@ -2229,7 +2338,7 @@ err_exit: trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** @@ -2238,7 +2347,7 @@ to create an index results in dropping the whole table! This is no problem currently as all indexes must be created at the same time as the table. @return error number or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_create_index_for_mysql( /*=======================*/ dict_index_t* index, /*!< in, own: index definition @@ -2254,13 +2363,13 @@ row_create_index_for_mysql( ind_node_t* node; mem_heap_t* heap; que_thr_t* thr; - ulint err; + dberr_t err; ulint i; ulint len; char* table_name; char* index_name; dict_table_t* table; - ibool is_fts = FALSE; + ibool is_fts; #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -2277,8 +2386,8 @@ row_create_index_for_mysql( is_fts = (index->type == DICT_FTS); - table = dict_table_open_on_name_no_stats(table_name, TRUE, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name(table_name, TRUE, TRUE, + DICT_ERR_IGNORE_NONE); trx_start_if_not_started_xa(trx); @@ -2292,6 +2401,11 @@ row_create_index_for_mysql( len = ut_max(len, field_lengths[i]); } + DBUG_EXECUTE_IF( + "ib_create_table_fail_at_create_index", + len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1; + ); + /* Column or prefix length exceeds maximum column length */ if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) { err = DB_TOO_BIG_INDEX_COL; @@ -2308,7 +2422,7 @@ row_create_index_for_mysql( /* Note that the space id where we store the index is inherited from the table in dict_build_index_def_step() in dict0crea.cc. 
*/ - node = ind_create_graph_create(index, heap); + node = ind_create_graph_create(index, heap, true); thr = pars_complete_graph_for_exec(node, trx, heap); @@ -2332,7 +2446,7 @@ row_create_index_for_mysql( } error_handling: - dict_table_close(table, TRUE); + dict_table_close(table, TRUE, FALSE); if (err != DB_SUCCESS) { /* We have special error handling here */ @@ -2353,7 +2467,7 @@ error_handling: mem_free(table_name); mem_free(index_name); - return((int) err); + return(err); } /*********************************************************************//** @@ -2366,7 +2480,7 @@ fields than mentioned in the constraint. Check also that foreign key constraints which reference this table are ok. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_table_add_foreign_constraints( /*==============================*/ trx_t* trx, /*!< in: transaction */ @@ -2383,7 +2497,7 @@ row_table_add_foreign_constraints( code DB_CANNOT_ADD_CONSTRAINT if any foreign keys are found. */ { - ulint err; + dberr_t err; ut_ad(mutex_own(&(dict_sys->mutex))); #ifdef UNIV_SYNC_DEBUG @@ -2399,6 +2513,12 @@ row_table_add_foreign_constraints( err = dict_create_foreign_constraints(trx, sql_string, sql_length, name, reject_fks); + + DBUG_EXECUTE_IF("ib_table_add_foreign_fail", + err = DB_DUPLICATE_KEY;); + + DEBUG_SYNC_C("table_add_foreign_constraints"); + if (err == DB_SUCCESS) { /* Check that also referencing constraints are ok */ err = dict_load_foreigns(name, FALSE, TRUE); @@ -2418,7 +2538,7 @@ row_table_add_foreign_constraints( trx->error_state = DB_SUCCESS; } - return((int) err); + return(err); } /*********************************************************************//** @@ -2430,12 +2550,12 @@ as a background operation, which is taken care of by the master thread in srv0srv.cc. @return error code or DB_SUCCESS */ static -int +dberr_t row_drop_table_for_mysql_in_background( /*===================================*/ const char* name) /*!< in: table name */ { - ulint error; + dberr_t error; trx_t* trx; trx = trx_allocate_for_background(); @@ -2464,7 +2584,7 @@ row_drop_table_for_mysql_in_background( trx_free_for_background(trx); - return((int) error); + return(error); } /*********************************************************************//** @@ -2498,8 +2618,8 @@ loop: return(n_tables + n_tables_dropped); } - table = dict_table_open_on_name_no_stats(drop->table_name, FALSE, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name(drop->table_name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); if (table == NULL) { /* If for some reason the table has already been dropped @@ -2510,7 +2630,7 @@ loop: ut_a(!table->can_be_evicted); - dict_table_close(table, FALSE); + dict_table_close(table, FALSE, FALSE); if (DB_SUCCESS != row_drop_table_for_mysql_in_background( drop->table_name)) { @@ -2617,356 +2737,429 @@ row_add_table_to_background_drop_list( } /*********************************************************************//** -Discards the tablespace of a table which stored in an .ibd file. Discarding -means that this function deletes the .ibd file and assigns a new table id for -the table. Also the flag table->ibd_file_missing is set TRUE. +Reassigns the table identifier of a table. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int -row_discard_tablespace_for_mysql( -/*=============================*/ - const char* name, /*!< in: table name */ - trx_t* trx) /*!< in: transaction handle */ +dberr_t +row_mysql_table_id_reassign( +/*========================*/ + dict_table_t* table, /*!< in/out: table */ + trx_t* trx, /*!< in/out: transaction */ + table_id_t* new_id) /*!< out: new table id */ { - dict_foreign_t* foreign; - table_id_t new_id; - dict_table_t* table; - ibool success; - ulint err; - pars_info_t* info = NULL; + dberr_t err; + pars_info_t* info = pars_info_create(); - /* How do we prevent crashes caused by ongoing operations on - the table? Old operations could try to access non-existent - pages. + dict_hdr_get_new_id(new_id, NULL, NULL); - 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive - MySQL table lock on the table before we can do DISCARD - TABLESPACE. Then there are no running queries on the table. + /* Remove all locks except the table-level S and X locks. */ + lock_remove_all_on_table(table, FALSE); - 2) Purge and rollback: we assign a new table id for the - table. Since purge and rollback look for the table based on - the table id, they see the table as 'dropped' and discard - their operations. + pars_info_add_ull_literal(info, "old_id", table->id); + pars_info_add_ull_literal(info, "new_id", *new_id); + + err = que_eval_sql( + info, + "PROCEDURE RENUMBER_TABLE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET ID = :new_id\n" + " WHERE ID = :old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" + " WHERE TABLE_ID = :old_id;\n" + "END;\n", FALSE, trx); - 3) Insert buffer: we remove all entries for the tablespace in - the insert buffer tree; as long as the tablespace mem object - does not exist, ongoing insert buffer page merges are - discarded in buf0rea.cc. If we recreate the tablespace mem - object with IMPORT TABLESPACE later, then the tablespace will - have the same id, but the tablespace_version field in the mem - object is different, and ongoing old insert buffer page merges - get discarded. + return(err); +} - 4) Linear readahead and random readahead: we use the same - method as in 3) to discard ongoing operations. +/*********************************************************************//** +Setup the pre-requisites for DISCARD TABLESPACE. It will start the transaction, +acquire the data dictionary lock in X mode and open the table. +@return table instance or 0 if not found. */ +static +dict_table_t* +row_discard_tablespace_begin( +/*=========================*/ + const char* name, /*!< in: table name */ + trx_t* trx) /*!< in: transaction handle */ +{ + trx->op_info = "discarding tablespace"; - 5) FOREIGN KEY operations: if - table->n_foreign_key_checks_running > 0, we do not allow the - discard. We also reserve the data dictionary latch. 
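/* Editor's sketch, not part of the patch: the intended calling pattern for
row_mysql_table_id_reassign() above. The on-disk change through the
internal SQL comes first; only when it succeeds is the in-memory dictionary
cache updated, which is what row_discard_tablespace() does later in this
file. */
	table_id_t	new_id;

	err = row_mysql_table_id_reassign(table, trx, &new_id);

	if (err == DB_SUCCESS) {
		/* Keep the cache consistent with SYS_TABLES, SYS_COLUMNS
		and SYS_INDEXES, which were renumbered above. */
		dict_table_change_id_in_cache(table, new_id);
	}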
*/ + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - trx->op_info = "discarding tablespace"; trx_start_if_not_started_xa(trx); /* Serialize data dictionary operations with dictionary mutex: - no deadlocks can occur then in these operations */ + this is to avoid deadlocks during data dictionary operations */ row_mysql_lock_data_dictionary(trx); - table = dict_table_open_on_name_no_stats(name, TRUE, - DICT_ERR_IGNORE_NONE); - - if (!table) { - err = DB_TABLE_NOT_FOUND; - - goto funct_exit; - } + dict_table_t* table; - if (table->space == 0) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: is in the system tablespace 0" - " which cannot be discarded\n", stderr); - err = DB_ERROR; + table = dict_table_open_on_name( + name, TRUE, FALSE, DICT_ERR_IGNORE_NONE); - goto funct_exit; + if (table) { + dict_stats_wait_bg_to_stop_using_tables(table, NULL, trx); + ut_a(table->space != TRX_SYS_SPACE); + ut_a(table->n_foreign_key_checks_running == 0); } - if (table->n_foreign_key_checks_running > 0) { - - ut_print_timestamp(stderr); - fputs(" InnoDB: You are trying to DISCARD table ", stderr); - ut_print_name(stderr, trx, TRUE, table->name); - fputs("\n" - "InnoDB: though there is a foreign key check" - " running on it.\n" - "InnoDB: Cannot discard the table.\n", - stderr); - - err = DB_ERROR; + return(table); +} - goto funct_exit; - } +/*********************************************************************//** +Do the foreign key constraint checks. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_discard_tablespace_foreign_key_checks( +/*======================================*/ + const trx_t* trx, /*!< in: transaction handle */ + const dict_table_t* table) /*!< in: table to be discarded */ +{ + const dict_foreign_t* foreign; /* Check if the table is referenced by foreign key constraints from some other table (not the table itself) */ - foreign = UT_LIST_GET_FIRST(table->referenced_list); + for (foreign = UT_LIST_GET_FIRST(table->referenced_list); + foreign && foreign->foreign_table == table; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { - while (foreign && foreign->foreign_table == table) { - foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - if (foreign && trx->check_foreigns) { + if (!srv_read_only_mode && foreign && trx->check_foreigns) { FILE* ef = dict_foreign_err_file; /* We only allow discarding a referenced table if FOREIGN_KEY_CHECKS is set to 0 */ - err = DB_CANNOT_DROP_CONSTRAINT; - mutex_enter(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); fputs(" Cannot DISCARD table ", ef); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, table->name); fputs("\n" "because it is referenced by ", ef); ut_print_name(stderr, trx, TRUE, foreign->foreign_table_name); putc('\n', ef); + mutex_exit(&dict_foreign_err_mutex); - goto funct_exit; + return(DB_CANNOT_DROP_CONSTRAINT); } - dict_hdr_get_new_id(&new_id, NULL, NULL); + return(DB_SUCCESS); +} - /* Remove all locks except the table-level S and X locks. */ - lock_remove_all_on_table(table, FALSE); +/*********************************************************************//** +Cleanup after the DISCARD TABLESPACE operation. +@return error code. 
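/* Editor's sketch, not part of the patch: what the deliberately empty loop
body in row_discard_tablespace_foreign_key_checks() above computes - the
first constraint that references this table from a different table.
Self-referencing constraints (foreign->foreign_table == table) are skipped;
DISCARD is refused for external references unless FOREIGN_KEY_CHECKS is 0. */
static
const dict_foreign_t*
example_first_external_referencing_fk(
/*==================================*/
	const dict_table_t*	table)	/*!< in: table to be discarded */
{
	const dict_foreign_t*	foreign;

	for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
	     foreign != 0;
	     foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {

		if (foreign->foreign_table != table) {

			return(foreign);
		}
	}

	return(NULL);
}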
*/ +static +dberr_t +row_discard_tablespace_end( +/*=======================*/ + trx_t* trx, /*!< in/out: transaction handle */ + dict_table_t* table, /*!< in/out: table to be discarded */ + dberr_t err) /*!< in: error code */ +{ + if (table != 0) { + dict_table_close(table, TRUE, FALSE); + } - info = pars_info_create(); + DBUG_EXECUTE_IF("ib_discard_before_commit_crash", + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + DBUG_SUICIDE();); - pars_info_add_str_literal(info, "table_name", name); - pars_info_add_ull_literal(info, "new_id", new_id); + trx_commit_for_mysql(trx); - err = que_eval_sql(info, - "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" - "old_id CHAR;\n" - "BEGIN\n" - "SELECT ID INTO old_id\n" - "FROM SYS_TABLES\n" - "WHERE NAME = :table_name\n" - "LOCK IN SHARE MODE;\n" - "IF (SQL % NOTFOUND) THEN\n" - " COMMIT WORK;\n" - " RETURN;\n" - "END IF;\n" - "UPDATE SYS_TABLES SET ID = :new_id\n" - " WHERE ID = old_id;\n" - "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = old_id;\n" - "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n" - " WHERE TABLE_ID = old_id;\n" - "COMMIT WORK;\n" - "END;\n" - , FALSE, trx); + DBUG_EXECUTE_IF("ib_discard_after_commit_crash", + log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE); + DBUG_SUICIDE();); + + row_mysql_unlock_data_dictionary(trx); + + trx->op_info = ""; + + return(err); +} + +/*********************************************************************//** +Do the DISCARD TABLESPACE operation. +@return DB_SUCCESS or error code. */ +static +dberr_t +row_discard_tablespace( +/*===================*/ + trx_t* trx, /*!< in/out: transaction handle */ + dict_table_t* table) /*!< in/out: table to be discarded */ +{ + dberr_t err; + + /* How do we prevent crashes caused by ongoing operations on + the table? Old operations could try to access non-existent + pages. MySQL will block all DML on the table using MDL and a + DISCARD will not start unless all existing operations on the + table to be discarded are completed. + + 1) Acquire the data dictionary latch in X mode. To prevent any + internal operations that MySQL is not aware off and also for + the internal SQL parser. + + 2) Purge and rollback: we assign a new table id for the + table. Since purge and rollback look for the table based on + the table id, they see the table as 'dropped' and discard + their operations. + + 3) Insert buffer: we remove all entries for the tablespace in + the insert buffer tree. + + 4) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0, + we do not allow the discard. */ + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + table_id_t new_id; + + /* Set the TABLESPACE DISCARD flag in the table definition on disk. 
*/ + + err = row_import_update_discarded_flag(trx, table->id, true, true); if (err != DB_SUCCESS) { - trx->error_state = DB_SUCCESS; - trx_rollback_to_savepoint(trx, NULL); - trx->error_state = DB_SUCCESS; - } else { - dict_table_change_id_in_cache(table, new_id); + return(err); + } - success = fil_discard_tablespace(table->space); + /* Update the index root pages in the system tables, on disk */ - if (!success) { - trx->error_state = DB_SUCCESS; - trx_rollback_to_savepoint(trx, NULL); - trx->error_state = DB_SUCCESS; + err = row_import_update_index_root(trx, table, true, true); - err = DB_ERROR; - } else { - /* Set the flag which tells that now it is legal to - IMPORT a tablespace for this table */ - table->tablespace_discarded = TRUE; - table->ibd_file_missing = TRUE; - } + if (err != DB_SUCCESS) { + return(err); } -funct_exit: + /* Drop all the FTS auxiliary tables. */ + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { - if (table != NULL) { - dict_table_close(table, TRUE); + fts_drop_tables(trx, table); } - trx_commit_for_mysql(trx); + /* Assign a new space ID to the table definition so that purge + can ignore the changes. Update the system table on disk. */ - row_mysql_unlock_data_dictionary(trx); + err = row_mysql_table_id_reassign(table, trx, &new_id); - trx->op_info = ""; + if (err != DB_SUCCESS) { + return(err); + } - return((int) err); + /* Discard the physical file that is used for the tablespace. */ + + err = fil_discard_tablespace(table->space); + + switch(err) { + case DB_SUCCESS: + case DB_IO_ERROR: + case DB_TABLESPACE_NOT_FOUND: + /* All persistent operations successful, update the + data dictionary memory cache. */ + + table->ibd_file_missing = TRUE; + + table->flags2 |= DICT_TF2_DISCARDED; + + dict_table_change_id_in_cache(table, new_id); + + /* Reset the root page numbers. */ + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + index->page = FIL_NULL; + index->space = FIL_NULL; + } + + /* If the tablespace did not already exist or we couldn't + write to it, we treat that as a successful DISCARD. It is + unusable anyway. */ + + err = DB_SUCCESS; + break; + + default: + /* We need to rollback the disk changes, something failed. */ + + trx->error_state = DB_SUCCESS; + + trx_rollback_to_savepoint(trx, NULL); + + trx->error_state = DB_SUCCESS; + } + + return(err); } -/*****************************************************************//** -Imports a tablespace. The space id in the .ibd file must match the space id -of the table in the data dictionary. +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function renames the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set to TRUE. @return error code or DB_SUCCESS */ UNIV_INTERN -int -row_import_tablespace_for_mysql( -/*============================*/ +dberr_t +row_discard_tablespace_for_mysql( +/*=============================*/ const char* name, /*!< in: table name */ trx_t* trx) /*!< in: transaction handle */ { + dberr_t err; dict_table_t* table; - ibool success; - lsn_t current_lsn; - ulint err = DB_SUCCESS; - trx_start_if_not_started_xa(trx); + /* Open the table and start the transaction if not started. 
*/ - trx->op_info = "importing tablespace"; + table = row_discard_tablespace_begin(name, trx); - current_lsn = log_get_lsn(); + if (table == 0) { + err = DB_TABLE_NOT_FOUND; + } else if (table->space == TRX_SYS_SPACE) { + char table_name[MAX_FULL_NAME_LEN + 1]; - /* It is possible, though very improbable, that the lsn's in the - tablespace to be imported have risen above the current system lsn, if - a lengthy purge, ibuf merge, or rollback was performed on a backup - taken with ibbackup. If that is the case, reset page lsn's in the - file. We assume that mysqld was shut down after it performed these - cleanup operations on the .ibd file, so that it stamped the latest lsn - to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file. + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); - TODO: reset also the trx id's in clustered index records and write - a new space id to each data page. That would allow us to import clean - .ibd files from another MySQL installation. */ + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); - success = fil_reset_too_high_lsns(name, current_lsn); + err = DB_ERROR; - if (!success) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", - stderr); + } else if (table->n_foreign_key_checks_running > 0) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_DISCARD_FK_CHECKS_RUNNING, table_name); err = DB_ERROR; - row_mysql_lock_data_dictionary(trx); - table = NULL; + } else { + /* Do foreign key constraint checks. */ - goto funct_exit; - } + err = row_discard_tablespace_foreign_key_checks(trx, table); - /* Serialize data dictionary operations with dictionary mutex: - no deadlocks can occur then in these operations */ + if (err == DB_SUCCESS) { + err = row_discard_tablespace(trx, table); + } + } - row_mysql_lock_data_dictionary(trx); + return(row_discard_tablespace_end(trx, table, err)); +} - table = dict_table_open_on_name_no_stats(name, TRUE, - DICT_ERR_IGNORE_NONE); +/*********************************************************************//** +Sets an exclusive lock on a table. +@return error code or DB_SUCCESS */ +UNIV_INTERN +dberr_t +row_mysql_lock_table( +/*=================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_table_t* table, /*!< in: table to lock */ + enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */ + const char* op_info) /*!< in: string for trx->op_info */ +{ + mem_heap_t* heap; + que_thr_t* thr; + dberr_t err; + sel_node_t* node; - if (!table) { - ut_print_timestamp(stderr); - fputs(" InnoDB: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: does not exist in the InnoDB data dictionary\n" - "InnoDB: in ALTER TABLE ... 
IMPORT TABLESPACE\n", - stderr); + ut_ad(trx); + ut_ad(mode == LOCK_X || mode == LOCK_S); - err = DB_TABLE_NOT_FOUND; + heap = mem_heap_create(512); - goto funct_exit; - } + trx->op_info = op_info; - if (table->space == 0) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: is in the system tablespace 0" - " which cannot be imported\n", stderr); - err = DB_ERROR; + node = sel_node_create(heap); + thr = pars_complete_graph_for_exec(node, trx, heap); + thr->graph->state = QUE_FORK_ACTIVE; - goto funct_exit; - } + /* We use the select query graph as the dummy graph needed + in the lock module call */ - if (!table->tablespace_discarded) { - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: you are trying to" - " IMPORT a tablespace\n" - "InnoDB: ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs(", though you have not called DISCARD on it yet\n" - "InnoDB: during the lifetime of the mysqld process!\n", - stderr); + thr = que_fork_get_first_thr( + static_cast<que_fork_t*>(que_node_get_parent(thr))); - err = DB_ERROR; + que_thr_move_to_run_state_for_mysql(thr, trx); - goto funct_exit; - } +run_again: + thr->run_node = thr; + thr->prev_node = thr->common.parent; - /* Play safe and remove all insert buffer entries, though we should - have removed them already when DISCARD TABLESPACE was called */ + err = lock_table(0, table, mode, thr); - ibuf_delete_for_discarded_space(table->space); + trx->error_state = err; - success = fil_open_single_table_tablespace( - TRUE, table->space, - dict_tf_to_fsp_flags(table->flags), - table->name); - if (success) { - table->ibd_file_missing = FALSE; - table->tablespace_discarded = FALSE; + if (err == DB_SUCCESS) { + que_thr_stop_for_mysql_no_error(thr, trx); } else { - if (table->ibd_file_missing) { - ut_print_timestamp(stderr); - fputs(" InnoDB: cannot find or open in the" - " database directory the .ibd file of\n" - "InnoDB: table ", stderr); - ut_print_name(stderr, trx, TRUE, name); - fputs("\n" - "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n", - stderr); - } + que_thr_stop_for_mysql(thr); - err = DB_ERROR; - } + if (err != DB_QUE_THR_SUSPENDED) { + ibool was_lock_wait; -funct_exit: + was_lock_wait = row_mysql_handle_errors( + &err, trx, thr, NULL); - if (table != NULL) { - dict_table_close(table, TRUE); - } + if (was_lock_wait) { + goto run_again; + } + } else { + que_thr_t* run_thr; + que_node_t* parent; - trx_commit_for_mysql(trx); + parent = que_node_get_parent(thr); - row_mysql_unlock_data_dictionary(trx); + run_thr = que_fork_start_command( + static_cast<que_fork_t*>(parent)); + + ut_a(run_thr == thr); + + /* There was a lock wait but the thread was not + in a ready to run or running state. */ + trx->error_state = DB_LOCK_WAIT; + goto run_again; + } + } + + que_graph_free(thr->graph); trx->op_info = ""; - return((int) err); + return(err); } /*********************************************************************//** Truncates a table for MySQL. 
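/* Editor's usage sketch, not part of the patch: taking an exclusive InnoDB
table lock with row_mysql_lock_table() defined above before a
dictionary-changing operation. The op_info string is only stored in
trx->op_info as a progress label while the lock is being acquired. */
	err = row_mysql_lock_table(trx, table, LOCK_X,
				   "setting table lock for an example DDL");

	if (err != DB_SUCCESS) {
		/* Lock wait timeout, deadlock or interruption:
		propagate the error to the caller. */
		return(err);
	}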
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_truncate_table_for_mysql( /*=========================*/ dict_table_t* table, /*!< in: table handle */ trx_t* trx) /*!< in: transaction handle */ { dict_foreign_t* foreign; - ulint err; + dberr_t err; mem_heap_t* heap; byte* buf; dtuple_t* tuple; @@ -2978,17 +3171,15 @@ row_truncate_table_for_mysql( ulint recreate_space = 0; pars_info_t* info = NULL; ibool has_internal_doc_id; + ulint old_space = table->space; /* How do we prevent crashes caused by ongoing operations on the table? Old operations could try to access non-existent pages. 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive - MySQL table lock on the table before we can do TRUNCATE - TABLE. Then there are no running queries on the table. This is - guaranteed, because in ha_innobase::store_lock(), we do not - weaken the TL_WRITE lock requested by MySQL when executing - SQLCOM_TRUNCATE. + InnoDB table lock on the table before we can do TRUNCATE + TABLE. Then there are no running queries on the table. 2) Purge and rollback: we assign a new table id for the table. Since purge and rollback look for the table based on @@ -3031,9 +3222,15 @@ row_truncate_table_for_mysql( return(DB_ERROR); } - trx->op_info = "truncating table"; + if (dict_table_is_discarded(table)) { + return(DB_TABLESPACE_DELETED); + } else if (table->ibd_file_missing) { + return(DB_TABLESPACE_NOT_FOUND); + } - trx_start_if_not_started_xa(trx); + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + + trx->op_info = "truncating table"; /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ @@ -3049,16 +3246,22 @@ row_truncate_table_for_mysql( ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ + dict_stats_wait_bg_to_stop_using_tables(table, NULL, trx); + /* Check if the table is referenced by foreign key constraints from some other table (not the table itself) */ - foreign = UT_LIST_GET_FIRST(table->referenced_list); + for (foreign = UT_LIST_GET_FIRST(table->referenced_list); + foreign != 0 && foreign->foreign_table == table; + foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) { - while (foreign && foreign->foreign_table == table) { - foreign = UT_LIST_GET_NEXT(referenced_list, foreign); + /* Do nothing. */ } - if (foreign && trx->check_foreigns) { + if (!srv_read_only_mode + && foreign + && trx->check_foreigns) { + FILE* ef = dict_foreign_err_file; /* We only allow truncating a referenced table if @@ -3099,19 +3302,41 @@ row_truncate_table_for_mysql( goto funct_exit; } - /* Remove all locks except the table-level S and X locks. */ + /* Remove all locks except the table-level X lock. */ lock_remove_all_on_table(table, FALSE); + /* Ensure that the table will be dropped by + trx_rollback_active() in case of a crash. */ + trx->table_id = table->id; + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + + /* Assign an undo segment for the transaction, so that the + transaction will be recovered after a crash. */ + + mutex_enter(&trx->undo_mutex); + + err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE); + + mutex_exit(&trx->undo_mutex); + + if (err != DB_SUCCESS) { + + goto funct_exit; + } if (table->space && !table->dir_path_of_temp_table) { /* Discard and create the single-table tablespace. 
*/ ulint space = table->space; ulint flags = fil_space_get_flags(space); + ut_a(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)); + + dict_get_and_save_data_dir_path(table, true); + if (flags != ULINT_UNDEFINED - && fil_discard_tablespace(space)) { + && fil_discard_tablespace(space) == DB_SUCCESS) { dict_index_t* index; @@ -3124,15 +3349,18 @@ row_truncate_table_for_mysql( if (space == ULINT_UNDEFINED || fil_create_new_single_table_tablespace( - space, table->name, FALSE, + space, table->name, + table->data_dir_path, flags, table->flags2, - FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) { + FIL_IBD_FILE_INITIAL_SIZE) + != DB_SUCCESS) { dict_table_x_unlock_indexes(table); - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: TRUNCATE TABLE %s failed to" - " create a new tablespace\n", + + ib_logf(IB_LOG_LEVEL_ERROR, + "TRUNCATE TABLE %s failed to " + "create a new tablespace", table->name); + table->ibd_file_missing = 1; err = DB_ERROR; goto funct_exit; @@ -3240,7 +3468,6 @@ next_rec: mtr_commit(&mtr); mem_heap_free(heap); - /* Done with index truncation, release index tree locks, subsequent work relates to table level metadata change */ dict_table_x_unlock_indexes(table); @@ -3259,21 +3486,21 @@ next_rec: fts_table.name = table->name; fts_table.id = new_id; - err = fts_create_common_tables(trx, &fts_table, table->name, - TRUE); + err = fts_create_common_tables( + trx, &fts_table, table->name, TRUE); - if (err == DB_SUCCESS) { - for (i = 0; i < ib_vector_size(table->fts->indexes); - i++) { - dict_index_t* fts_index; + for (i = 0; + i < ib_vector_size(table->fts->indexes) + && err == DB_SUCCESS; + i++) { - fts_index = static_cast<dict_index_t*>( - ib_vector_getp( - table->fts->indexes, i)); + dict_index_t* fts_index; - fts_create_index_tables_low( - trx, fts_index, table->name, new_id); - } + fts_index = static_cast<dict_index_t*>( + ib_vector_getp(table->fts->indexes, i)); + + err = fts_create_index_tables_low( + trx, fts_index, table->name, new_id); } if (err != DB_SUCCESS) { @@ -3287,34 +3514,64 @@ next_rec: fputs("\n", stderr); goto funct_exit; + } else { + ut_ad(trx->state != TRX_STATE_NOT_STARTED); } } info = pars_info_create(); - pars_info_add_int4_literal(info, "space", (lint) table->space); + pars_info_add_int4_literal(info, "new_space", (lint) table->space); pars_info_add_ull_literal(info, "old_id", table->id); pars_info_add_ull_literal(info, "new_id", new_id); err = que_eval_sql(info, - "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n" "BEGIN\n" "UPDATE SYS_TABLES" - " SET ID = :new_id, SPACE = :space\n" + " SET ID = :new_id, SPACE = :new_space\n" " WHERE ID = :old_id;\n" "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n" " WHERE TABLE_ID = :old_id;\n" "UPDATE SYS_INDEXES" - " SET TABLE_ID = :new_id, SPACE = :space\n" + " SET TABLE_ID = :new_id, SPACE = :new_space\n" " WHERE TABLE_ID = :old_id;\n" - "COMMIT WORK;\n" "END;\n" , FALSE, trx); + if (err == DB_SUCCESS && old_space != table->space) { + info = pars_info_create(); + + pars_info_add_int4_literal(info, "old_space", (lint) old_space); + + pars_info_add_int4_literal( + info, "new_space", (lint) table->space); + + err = que_eval_sql(info, + "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET SPACE = :new_space\n" + " WHERE SPACE = :old_space;\n" + "UPDATE SYS_DATAFILES" + " SET SPACE = :new_space" + " WHERE SPACE = :old_space;\n" + "END;\n" + , FALSE, trx); + } + DBUG_EXECUTE_IF("ib_ddl_crash_before_fts_truncate", err = DB_ERROR;); + if (err != 
DB_SUCCESS) { trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; + + /* Update system table failed. Table in memory metadata + could be in an inconsistent state, mark the in-memory + table->corrupted to be true. In the long run, this should + be fixed by atomic truncate table */ + table->corrupted = true; + ut_print_timestamp(stderr); fputs(" InnoDB: Unable to assign a new identifier to table ", stderr); @@ -3323,30 +3580,40 @@ next_rec: "InnoDB: after truncating it. Background processes" " may corrupt the table!\n", stderr); - /* Fail to update the table id, so drop the new + /* Failed to update the table id, so drop the new FTS auxiliary tables */ if (has_internal_doc_id) { - dict_table_t fts_table; + ut_ad(trx->state == TRX_STATE_NOT_STARTED); + + table_id_t id = table->id; - fts_table.name = table->name; - fts_table.id = new_id; + table->id = new_id; - fts_drop_tables(trx, &fts_table); + fts_drop_tables(trx, table); + + table->id = id; + + ut_ad(trx->state != TRX_STATE_NOT_STARTED); } err = DB_ERROR; } else { /* Drop the old FTS index */ if (has_internal_doc_id) { + ut_ad(trx->state != TRX_STATE_NOT_STARTED); fts_drop_tables(trx, table); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); } + DBUG_EXECUTE_IF("ib_truncate_crash_after_fts_drop", + DBUG_SUICIDE();); + dict_table_change_id_in_cache(table, new_id); /* Reset the Doc ID in cache to 0 */ if (has_internal_doc_id && table->fts->cache) { table->fts->fts_status |= TABLE_DICT_LOCKED; - fts_update_next_doc_id(table, NULL, 0); + fts_update_next_doc_id(trx, table, NULL, 0); fts_cache_clear(table->fts->cache, TRUE); fts_cache_init(table->fts->cache); table->fts->fts_status &= ~TABLE_DICT_LOCKED; @@ -3364,16 +3631,13 @@ funct_exit: row_mysql_unlock_data_dictionary(trx); - /* We are supposed to recalc and save the stats only - on ANALYZE, but it also makes sense to do so on TRUNCATE */ - dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT_SILENT, - FALSE); + dict_stats_update(table, DICT_STATS_EMPTY_TABLE); trx->op_info = ""; srv_wake_master_thread(); - return((int) err); + return(err); } /*********************************************************************//** @@ -3385,23 +3649,29 @@ by the transaction, the transaction will be committed. Otherwise, the data dictionary will remain locked. @return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_drop_table_for_mysql( /*=====================*/ const char* name, /*!< in: table name */ trx_t* trx, /*!< in: transaction handle */ - ibool drop_db)/*!< in: TRUE=dropping whole database */ + bool drop_db,/*!< in: true=dropping whole database */ + bool nonatomic) + /*!< in: whether it is permitted + to release and reacquire dict_operation_lock */ { + dberr_t err; dict_foreign_t* foreign; dict_table_t* table; - dict_index_t* index; + ibool print_msg; ulint space_id; - ulint err; - const char* table_name; + char* filepath = NULL; + const char* tablename_minus_db; + char* tablename = NULL; + bool ibd_file_missing; ulint namelen; - ibool locked_dictionary = FALSE; - ibool fts_bg_thread_exited = FALSE; + bool locked_dictionary = false; pars_info_t* info = NULL; + mem_heap_t* heap = NULL; ut_a(name != NULL); @@ -3419,19 +3689,19 @@ row_drop_table_for_mysql( Certain table names starting with 'innodb_' have their special meaning regardless of the database name. Thus, we need to ignore the database name prefix in the comparisons. 
*/ - table_name = strchr(name, '/'); + tablename_minus_db = strchr(name, '/'); - if (table_name) { - table_name++; + if (tablename_minus_db) { + tablename_minus_db++; } else { /* Ancillary FTS tables don't have '/' characters. */ - table_name = name; + tablename_minus_db = name; } - namelen = strlen(table_name) + 1; + namelen = strlen(tablename_minus_db) + 1; if (namelen == sizeof S_innodb_monitor - && !memcmp(table_name, S_innodb_monitor, + && !memcmp(tablename_minus_db, S_innodb_monitor, sizeof S_innodb_monitor)) { /* Table name equals "innodb_monitor": @@ -3440,17 +3710,17 @@ row_drop_table_for_mysql( srv_print_innodb_monitor = FALSE; srv_print_innodb_lock_monitor = FALSE; } else if (namelen == sizeof S_innodb_lock_monitor - && !memcmp(table_name, S_innodb_lock_monitor, + && !memcmp(tablename_minus_db, S_innodb_lock_monitor, sizeof S_innodb_lock_monitor)) { srv_print_innodb_monitor = FALSE; srv_print_innodb_lock_monitor = FALSE; } else if (namelen == sizeof S_innodb_tablespace_monitor - && !memcmp(table_name, S_innodb_tablespace_monitor, + && !memcmp(tablename_minus_db, S_innodb_tablespace_monitor, sizeof S_innodb_tablespace_monitor)) { srv_print_innodb_tablespace_monitor = FALSE; } else if (namelen == sizeof S_innodb_table_monitor - && !memcmp(table_name, S_innodb_table_monitor, + && !memcmp(tablename_minus_db, S_innodb_table_monitor, sizeof S_innodb_table_monitor)) { srv_print_innodb_table_monitor = FALSE; @@ -3461,7 +3731,10 @@ row_drop_table_for_mysql( trx->op_info = "dropping table"; - trx_start_if_not_started(trx); + /* This function is called recursively via fts_drop_tables(). */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx_start_for_ddl(trx, TRX_DICT_OP_TABLE); + } if (trx->dict_operation_lock_mode != RW_X_LATCH) { /* Prevent foreign key checks etc. while we are dropping the @@ -3469,17 +3742,17 @@ row_drop_table_for_mysql( row_mysql_lock_data_dictionary(trx); - locked_dictionary = TRUE; + locked_dictionary = true; + nonatomic = true; } -retry: ut_ad(mutex_own(&(dict_sys->mutex))); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - table = dict_table_open_on_name_no_stats( - name, TRUE, + table = dict_table_open_on_name( + name, TRUE, FALSE, static_cast<dict_err_ignore_t>( DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT)); @@ -3502,34 +3775,53 @@ retry: goto funct_exit; } - if (table->fts) { - fts_t* fts = table->fts; + /* Turn on this drop bit before we could release the dictionary + latch */ + table->to_be_dropped = true; - /* It is possible that background 'Add' thread fts_add_thread() - just gets called and the fts_optimize_thread() - is processing deleted records. There could be undetected - deadlock between threads synchronization and dict_sys_mutex - since fts_parse_sql() requires dict_sys->mutex. Ask the - background thread to exit before proceeds to drop table to - avoid undetected deadlocks */ - row_mysql_unlock_data_dictionary(trx); + if (nonatomic) { + /* This trx did not acquire any locks on dictionary + table records yet. Thus it is safe to release and + reacquire the data dictionary latches. */ + if (table->fts) { + ut_ad(!table->fts->add_wq); + ut_ad(lock_trx_has_sys_table_locks(trx) == 0); - if (fts->add_wq && (!fts_bg_thread_exited)) { - /* Wait for any background threads accessing the table - to exit. 
*/ - mutex_enter(&fts->bg_threads_mutex); - fts->fts_status |= BG_THREAD_STOP; + row_mysql_unlock_data_dictionary(trx); + fts_optimize_remove_table(table); + row_mysql_lock_data_dictionary(trx); + } - dict_table_wait_for_bg_threads_to_exit(table, 250000); + /* Do not bother to deal with persistent stats for temp + tables since we know temp tables do not use persistent + stats. */ + if (!dict_table_is_temporary(table)) { + dict_stats_wait_bg_to_stop_using_tables( + table, NULL, trx); + } + } - mutex_exit(&fts->bg_threads_mutex); + /* make sure background stats thread is not running on the table */ + ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)); - row_mysql_lock_data_dictionary(trx); - fts_bg_thread_exited = TRUE; - goto retry; - } else { - fts_optimize_remove_table(table); - row_mysql_lock_data_dictionary(trx); + /* Delete the link file if used. */ + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + fil_delete_link_file(name); + } + + if (!dict_table_is_temporary(table)) { + + dict_stats_recalc_pool_del(table); + + /* Remove stats for this table and all of its indexes from the + persistent storage if it exists and if there are stats for this + table in there. This function creates its own trx and commits + it. */ + char errstr[1024]; + err = dict_stats_drop_table(name, errstr, sizeof(errstr)); + + if (err != DB_SUCCESS) { + ib_logf(IB_LOG_LEVEL_WARN, "%s", errstr); } } @@ -3540,7 +3832,7 @@ retry: dict_table_move_from_lru_to_non_lru(table); } - dict_table_close(table, TRUE); + dict_table_close(table, TRUE, FALSE); /* Check if the table is referenced by foreign key constraints from some other table (not the table itself) */ @@ -3552,7 +3844,9 @@ check_next_foreign: foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - if (foreign && trx->check_foreigns + if (!srv_read_only_mode + && foreign + && trx->check_foreigns && !(drop_db && dict_tables_have_same_db( name, foreign->foreign_table_name_lookup))) { FILE* ef = dict_foreign_err_file; @@ -3589,16 +3883,16 @@ check_next_foreign: if (table->n_foreign_key_checks_running > 0) { - const char* table_name = table->name; + const char* save_tablename = table->name; ibool added; - added = row_add_table_to_background_drop_list(table_name); + added = row_add_table_to_background_drop_list(save_tablename); if (added) { ut_print_timestamp(stderr); fputs(" InnoDB: You are trying to drop table ", stderr); - ut_print_name(stderr, trx, TRUE, table_name); + ut_print_name(stderr, trx, TRUE, save_tablename); fputs("\n" "InnoDB: though there is a" " foreign key check running on it.\n" @@ -3663,23 +3957,54 @@ check_next_foreign: goto funct_exit; } + /* The "to_be_dropped" marks table that is to be dropped, but + has not been dropped, instead, was put in the background drop + list due to being used by concurrent DML operations. Clear it + here since there are no longer any concurrent activities on it, + and it is free to be dropped */ + table->to_be_dropped = false; + /* If we get this far then the table to be dropped must not have any table or record locks on it. */ ut_a(!lock_table_has_locks(table)); - trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); - trx->table_id = table->id; + switch (trx_get_dict_operation(trx)) { + case TRX_DICT_OP_NONE: + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx->table_id = table->id; + case TRX_DICT_OP_TABLE: + break; + case TRX_DICT_OP_INDEX: + /* If the transaction was previously flagged as + TRX_DICT_OP_INDEX, we should be dropping auxiliary + tables for full-text indexes. 
*/ + ut_ad(strstr(table->name, "/FTS_") != NULL); + } /* Mark all indexes unavailable in the data dictionary cache before starting to drop the table. */ - for (index = dict_table_get_first_index(table); + unsigned* page_no; + unsigned* page_nos; + heap = mem_heap_create( + 200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos); + tablename = mem_heap_strdup(heap, name); + + page_no = page_nos = static_cast<unsigned*>( + mem_heap_alloc( + heap, + UT_LIST_GET_LEN(table->indexes) * sizeof *page_no)); + + for (dict_index_t* index = dict_table_get_first_index(table); index != NULL; index = dict_table_get_next_index(index)) { rw_lock_x_lock(dict_index_get_lock(index)); - ut_ad(!index->to_be_dropped); - index->to_be_dropped = TRUE; + /* Save the page numbers so that we can restore them + if the operation fails. */ + *page_no++ = index->page; + /* Mark the index unusable. */ + index->page = FIL_NULL; rw_lock_x_unlock(dict_index_get_lock(index)); } @@ -3698,6 +4023,7 @@ check_next_foreign: "table_id CHAR;\n" "index_id CHAR;\n" "foreign_id CHAR;\n" + "space_id INT;\n" "found INT;\n" "DECLARE CURSOR cur_fk IS\n" @@ -3720,6 +4046,12 @@ check_next_foreign: "IF (SQL % NOTFOUND) THEN\n" " RETURN;\n" "END IF;\n" + "SELECT SPACE INTO space_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = :table_name;\n" + "IF (SQL % NOTFOUND) THEN\n" + " RETURN;\n" + "END IF;\n" "found := 1;\n" "SELECT ID INTO sys_foreign_id\n" "FROM SYS_TABLES\n" @@ -3762,56 +4094,90 @@ check_next_foreign: " END IF;\n" "END LOOP;\n" "CLOSE cur_idx;\n" + "DELETE FROM SYS_TABLESPACES\n" + "WHERE SPACE = space_id;\n" + "DELETE FROM SYS_DATAFILES\n" + "WHERE SPACE = space_id;\n" "DELETE FROM SYS_COLUMNS\n" "WHERE TABLE_ID = table_id;\n" "DELETE FROM SYS_TABLES\n" - "WHERE ID = table_id;\n" + "WHERE NAME = :table_name;\n" "END;\n" , FALSE, trx); switch (err) { - ibool is_temp; - mem_heap_t* heap; + ibool is_temp; case DB_SUCCESS: - - heap = mem_heap_create(200); - /* Clone the name, in case it has been allocated from table->heap, which will be freed by dict_table_remove_from_cache(table) below. */ - name = mem_heap_strdup(heap, name); space_id = table->space; + ibd_file_missing = table->ibd_file_missing; - is_temp = table->flags2 & DICT_TF2_TEMPORARY; + is_temp = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY); + + /* If there is a temp path then the temp flag is set. + However, during recovery, we might have a temp flag but + not know the temp path */ ut_a(table->dir_path_of_temp_table == NULL || is_temp); + if (dict_table_is_discarded(table) + || table->ibd_file_missing) { + /* Do not attempt to drop known-to-be-missing + tablespaces. */ + space_id = 0; + } + + /* We do not allow temporary tables with a remote path. 
*/ + ut_a(!(is_temp && DICT_TF_HAS_DATA_DIR(table->flags))); + + if (space_id && DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, true); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else if (table->dir_path_of_temp_table) { + filepath = fil_make_ibd_name( + table->dir_path_of_temp_table, true); + } else { + filepath = fil_make_ibd_name(tablename, false); + } if (dict_table_has_fts_index(table) || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { ut_ad(table->n_ref_count == 0); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); err = fts_drop_tables(trx, table); if (err != DB_SUCCESS) { ut_print_timestamp(stderr); - fprintf(stderr," InnoDB: Error: (%lu) not " + fprintf(stderr," InnoDB: Error: (%s) not " "able to remove ancillary FTS tables " - "for table ", err); - ut_print_name(stderr, trx, TRUE, name); + "for table ", ut_strerr(err)); + ut_print_name(stderr, trx, TRUE, tablename); fputs("\n", stderr); goto funct_exit; } + } + /* The table->fts flag can be set on the table for which + the cluster index is being rebuilt. Such table might not have + DICT_TF2_FTS flag set. So keep this out of above + dict_table_has_fts_index condition */ + if (table->fts) { fts_free(table); } dict_table_remove_from_cache(table); - if (dict_load_table(name, TRUE, DICT_ERR_IGNORE_NONE) != NULL) { + if (dict_load_table(tablename, TRUE, + DICT_ERR_IGNORE_NONE) != NULL) { ut_print_timestamp(stderr); fputs(" InnoDB: Error: not able to remove table ", stderr); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fputs(" from the dictionary cache!\n", stderr); err = DB_ERROR; } @@ -3819,23 +4185,46 @@ check_next_foreign: /* Do not drop possible .ibd tablespace if something went wrong: we do not want to delete valuable data of the user */ - if (err == DB_SUCCESS && space_id > 0) { - if (!fil_space_for_table_exists_in_mem( - space_id, name, FALSE, !is_temp)) { + /* Don't spam the log if we can't find the tablespace of + a temp table or if the tablesace has been discarded. */ + print_msg = !(is_temp || ibd_file_missing); + + if (err == DB_SUCCESS && space_id > TRX_SYS_SPACE) { + if (!is_temp + && !fil_space_for_table_exists_in_mem( + space_id, tablename, FALSE, + print_msg, false, NULL, 0)) { + /* This might happen if we are dropping a + discarded tablespace */ err = DB_SUCCESS; + if (print_msg) { + char msg_tablename[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + msg_tablename, sizeof(tablename), + tablename, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Removed the table %s from " + "InnoDB's data dictionary", + msg_tablename); + } + + /* Force a delete of any discarded + or temporary files. 
*/ + + fil_delete_file(filepath); + + } else if (fil_delete_tablespace( + space_id, + BUF_REMOVE_FLUSH_NO_WRITE) + != DB_SUCCESS) { fprintf(stderr, "InnoDB: We removed now the InnoDB" " internal data dictionary entry\n" "InnoDB: of table "); - ut_print_name(stderr, trx, TRUE, name); - fprintf(stderr, ".\n"); - } else if (!fil_delete_tablespace(space_id)) { - fprintf(stderr, - "InnoDB: We removed now the InnoDB" - " internal data dictionary entry\n" - "InnoDB: of table "); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fprintf(stderr, ".\n"); ut_print_timestamp(stderr); @@ -3843,13 +4232,12 @@ check_next_foreign: " InnoDB: Error: not able to" " delete tablespace %lu of table ", (ulong) space_id); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fputs("!\n", stderr); err = DB_ERROR; } } - mem_heap_free(heap); break; case DB_OUT_OF_FILE_SPACE: @@ -3874,7 +4262,7 @@ check_next_foreign: fprintf(stderr, "InnoDB: unknown error code %lu" " while dropping table:", (ulong) err); - ut_print_name(stderr, trx, TRUE, name); + ut_print_name(stderr, trx, TRUE, tablename); fprintf(stderr, ".\n"); trx->error_state = DB_SUCCESS; @@ -3884,16 +4272,25 @@ check_next_foreign: /* Mark all indexes available in the data dictionary cache again. */ - for (index = dict_table_get_first_index(table); + page_no = page_nos; + + for (dict_index_t* index = dict_table_get_first_index(table); index != NULL; index = dict_table_get_next_index(index)) { rw_lock_x_lock(dict_index_get_lock(index)); - index->to_be_dropped = FALSE; + ut_a(index->page == FIL_NULL); + index->page = *page_no++; rw_lock_x_unlock(dict_index_get_lock(index)); } } funct_exit: + if (heap) { + mem_heap_free(heap); + } + if (filepath) { + mem_free(filepath); + } if (locked_dictionary) { trx_commit_for_mysql(trx); @@ -3905,7 +4302,7 @@ funct_exit: srv_wake_master_thread(); - return((int) err); + return(err); } /*********************************************************************//** @@ -3929,9 +4326,9 @@ row_mysql_drop_temp_tables(void) mtr_start(&mtr); btr_pcur_open_at_index_side( - TRUE, + true, dict_table_get_first_index(dict_sys->sys_tables), - BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); for (;;) { const rec_t* rec; @@ -3950,6 +4347,8 @@ row_mysql_drop_temp_tables(void) ROW_FORMAT=REDUNDANT. */ rec = btr_pcur_get_rec(&pcur); field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + field = rec_get_nth_field_old( rec, DICT_FLD__SYS_TABLES__N_COLS, &len); if (len != 4 || !(mach_read_from_4(field) & DICT_N_COLS_COMPACT)) { @@ -4003,15 +4402,15 @@ row_mysql_drop_temp_tables(void) Drop all foreign keys in a database, see Bug#18942. Called at the end of row_drop_database_for_mysql(). @return error code or DB_SUCCESS */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t drop_all_foreign_keys_in_db( /*========================*/ const char* name, /*!< in: database name which ends to '/' */ trx_t* trx) /*!< in: transaction handle */ { pars_info_t* pinfo; - ulint err; + dberr_t err; ut_a(name[strlen(name) - 1] == '/'); @@ -4063,22 +4462,24 @@ drop_all_foreign_keys_in_db( Drops a database for MySQL. 
@return error code or DB_SUCCESS */ UNIV_INTERN -int +dberr_t row_drop_database_for_mysql( /*========================*/ const char* name, /*!< in: database name which ends to '/' */ trx_t* trx) /*!< in: transaction handle */ { - dict_table_t* table; - char* table_name; - int err = DB_SUCCESS; - ulint namelen = strlen(name); + dict_table_t* table; + char* table_name; + dberr_t err = DB_SUCCESS; + ulint namelen = strlen(name); ut_a(name != NULL); ut_a(name[namelen - 1] == '/'); trx->op_info = "dropping database"; + trx_set_dict_operation(trx, TRX_DICT_OP_TABLE); + trx_start_if_not_started_xa(trx); loop: row_mysql_lock_data_dictionary(trx); @@ -4086,11 +4487,29 @@ loop: while ((table_name = dict_get_first_table_name_in_db(name))) { ut_a(memcmp(table_name, name, namelen) == 0); - table = dict_table_open_on_name_no_stats(table_name, TRUE, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name( + table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>( + DICT_ERR_IGNORE_INDEX_ROOT + | DICT_ERR_IGNORE_CORRUPT)); - ut_a(table); - ut_a(!table->can_be_evicted); + if (!table) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot load table %s from InnoDB internal " + "data dictionary during drop database", + table_name); + mem_free(table_name); + err = DB_TABLE_NOT_FOUND; + break; + + } + + if (row_is_mysql_tmp_table_name(table->name)) { + /* There could be an orphan temp table left from + interupted alter table rebuild operation */ + dict_table_close(table, TRUE, FALSE); + } else { + ut_a(!table->can_be_evicted || table->ibd_file_missing); + } /* Wait until MySQL does not have any queries running on the table */ @@ -4121,8 +4540,8 @@ loop: if (err != DB_SUCCESS) { fputs("InnoDB: DROP DATABASE ", stderr); ut_print_name(stderr, trx, TRUE, name); - fprintf(stderr, " failed with error %lu for table ", - (ulint) err); + fprintf(stderr, " failed with error (%s) for table ", + ut_strerr(err)); ut_print_name(stderr, trx, TRUE, table_name); putc('\n', stderr); mem_free(table_name); @@ -4135,7 +4554,7 @@ loop: if (err == DB_SUCCESS) { /* after dropping all tables try to drop all leftover foreign keys in case orphaned ones exist */ - err = (int) drop_all_foreign_keys_in_db(name, trx); + err = drop_all_foreign_keys_in_db(name, trx); if (err != DB_SUCCESS) { fputs("InnoDB: DROP DATABASE ", stderr); @@ -4157,9 +4576,9 @@ loop: /*********************************************************************//** Checks if a table name contains the string "/#sql" which denotes temporary tables in MySQL. -@return TRUE if temporary table */ -static -ibool +@return true if temporary table */ +UNIV_INTERN __attribute__((warn_unused_result)) +bool row_is_mysql_tmp_table_name( /*========================*/ const char* name) /*!< in: table name in the form @@ -4172,8 +4591,8 @@ row_is_mysql_tmp_table_name( /****************************************************************//** Delete a single constraint. @return error code or DB_SUCCESS */ -static -int +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_delete_constraint_low( /*======================*/ const char* id, /*!< in: constraint id */ @@ -4183,7 +4602,7 @@ row_delete_constraint_low( pars_info_add_str_literal(info, "id", id); - return((int) que_eval_sql(info, + return(que_eval_sql(info, "PROCEDURE DELETE_CONSTRAINT () IS\n" "BEGIN\n" "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n" @@ -4195,8 +4614,8 @@ row_delete_constraint_low( /****************************************************************//** Delete a single constraint. 
@return error code or DB_SUCCESS */ -static -int +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_delete_constraint( /*==================*/ const char* id, /*!< in: constraint id */ @@ -4205,7 +4624,7 @@ row_delete_constraint( mem_heap_t* heap, /*!< in: memory heap */ trx_t* trx) /*!< in: transaction handle */ { - ulint err; + dberr_t err; /* New format constraints have ids <databasename>/<constraintname>. */ err = row_delete_constraint_low( @@ -4222,29 +4641,30 @@ row_delete_constraint( err = row_delete_constraint_low(id, trx); } - return((int) err); + return(err); } /*********************************************************************//** Renames a table for MySQL. @return error code or DB_SUCCESS */ UNIV_INTERN -ulint +dberr_t row_rename_table_for_mysql( /*=======================*/ const char* old_name, /*!< in: old table name */ const char* new_name, /*!< in: new table name */ - trx_t* trx, /*!< in: transaction handle */ - ibool commit) /*!< in: if TRUE then commit trx */ + trx_t* trx, /*!< in/out: transaction */ + bool commit) /*!< in: whether to commit trx */ { dict_table_t* table = NULL; ibool dict_locked = FALSE; - ulint err = DB_ERROR; + dberr_t err = DB_ERROR; mem_heap_t* heap = NULL; const char** constraints_to_drop = NULL; ulint n_constraints_to_drop = 0; ibool old_is_tmp, new_is_tmp; pars_info_t* info = NULL; + int retry; ut_a(old_name != NULL); ut_a(new_name != NULL); @@ -4279,8 +4699,8 @@ row_rename_table_for_mysql( dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH; - table = dict_table_open_on_name_no_stats(old_name, dict_locked, - DICT_ERR_IGNORE_NONE); + table = dict_table_open_on_name(old_name, dict_locked, FALSE, + DICT_ERR_IGNORE_NONE); if (!table) { err = DB_TABLE_NOT_FOUND; @@ -4299,18 +4719,19 @@ row_rename_table_for_mysql( "InnoDB: " REFMAN "innodb-troubleshooting.html\n", stderr); goto funct_exit; - } else if (table->ibd_file_missing) { + + } else if (table->ibd_file_missing + && !dict_table_is_discarded(table)) { + err = DB_TABLE_NOT_FOUND; - ut_print_timestamp(stderr); - fputs(" InnoDB: Error: table ", stderr); - ut_print_name(stderr, trx, TRUE, old_name); - fputs(" does not have an .ibd file" - " in the database directory.\n" - "InnoDB: You can look for further help from\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n", - stderr); + ib_logf(IB_LOG_LEVEL_ERROR, + "Table %s does not have an .ibd file in the database " + "directory. See " REFMAN "innodb-troubleshooting.html", + old_name); + goto funct_exit; + } else if (new_is_tmp) { /* MySQL is doing an ALTER TABLE command and it renames the original table to a temporary table name. We want to preserve @@ -4329,27 +4750,75 @@ row_rename_table_for_mysql( } } + /* Is a foreign key check running on this table? */ + for (retry = 0; retry < 100 + && table->n_foreign_key_checks_running > 0; ++retry) { + row_mysql_unlock_data_dictionary(trx); + os_thread_yield(); + row_mysql_lock_data_dictionary(trx); + } + + if (table->n_foreign_key_checks_running > 0) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: in ALTER TABLE ", stderr); + ut_print_name(stderr, trx, TRUE, old_name); + fprintf(stderr, "\n" + "InnoDB: a FOREIGN KEY check is running.\n" + "InnoDB: Cannot rename table.\n"); + err = DB_TABLE_IN_FK_CHECK; + goto funct_exit; + } + /* We use the private SQL parser of Innobase to generate the query graphs needed in updating the dictionary data from system tables. 
*/ info = pars_info_create(); pars_info_add_str_literal(info, "new_table_name", new_name); - pars_info_add_str_literal(info, "old_table_name", old_name); err = que_eval_sql(info, "PROCEDURE RENAME_TABLE () IS\n" "BEGIN\n" - "UPDATE SYS_TABLES SET NAME = :new_table_name\n" + "UPDATE SYS_TABLES" + " SET NAME = :new_table_name\n" " WHERE NAME = :old_table_name;\n" "END;\n" , FALSE, trx); - if (err != DB_SUCCESS) { + /* SYS_TABLESPACES and SYS_DATAFILES track non-system tablespaces + which have space IDs > 0. */ + if (err == DB_SUCCESS + && table->space != TRX_SYS_SPACE + && !table->ibd_file_missing) { + /* Make a new pathname to update SYS_DATAFILES. */ + char* new_path = row_make_new_pathname(table, new_name); + + info = pars_info_create(); + pars_info_add_str_literal(info, "new_table_name", new_name); + pars_info_add_str_literal(info, "new_path_name", new_path); + pars_info_add_int4_literal(info, "space_id", table->space); + + err = que_eval_sql(info, + "PROCEDURE RENAME_SPACE () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLESPACES" + " SET NAME = :new_table_name\n" + " WHERE SPACE = :space_id;\n" + "UPDATE SYS_DATAFILES" + " SET PATH = :new_path_name\n" + " WHERE SPACE = :space_id;\n" + "END;\n" + , FALSE, trx); + + mem_free(new_path); + } + if (err != DB_SUCCESS) { goto end; - } else if (!new_is_tmp) { + } + + if (!new_is_tmp) { /* Rename all constraints. */ info = pars_info_create(); @@ -4486,12 +4955,12 @@ end: /* The following call will also rename the .ibd data file if the table is stored in a single-table tablespace */ - if (!dict_table_rename_in_cache(table, new_name, - !new_is_tmp)) { + err = dict_table_rename_in_cache( + table, new_name, !new_is_tmp); + if (err != DB_SUCCESS) { trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; - err = DB_ERROR; goto funct_exit; } @@ -4527,8 +4996,8 @@ end: stderr); } - ut_a(dict_table_rename_in_cache(table, - old_name, FALSE)); + ut_a(DB_SUCCESS == dict_table_rename_in_cache( + table, old_name, FALSE)); trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; @@ -4545,7 +5014,7 @@ end: funct_exit: if (table != NULL) { - dict_table_close(table, dict_locked); + dict_table_close(table, dict_locked, FALSE); } if (commit) { @@ -4565,9 +5034,9 @@ funct_exit: Checks that the index contains entries in an ascending order, unique constraint is not broken, and calculates the number of index entries in the read view of the current transaction. -@return TRUE if ok */ +@return true if ok */ UNIV_INTERN -ibool +bool row_check_index_for_mysql( /*======================*/ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct @@ -4582,7 +5051,7 @@ row_check_index_for_mysql( byte* buf; ulint ret; rec_t* rec; - ibool is_ok = TRUE; + bool is_ok = true; int cmp; ibool contains_null; ulint i; @@ -4595,10 +5064,20 @@ row_check_index_for_mysql( *n_rows = 0; - /* Full Text index are implemented by auxiliary tables, - not the B-tree */ - if (index->type & DICT_FTS) { - return(TRUE); + if (dict_index_is_clust(index)) { + /* The clustered index of a table is always available. + During online ALTER TABLE that rebuilds the table, the + clustered index in the old table will have + index->online_log pointing to the new table. All + indexes of the old table will remain valid and the new + table will be unaccessible to MySQL until the + completion of the ALTER TABLE. 
*/ + } else if (dict_index_is_online_ddl(index) + || (index->type & DICT_FTS)) { + /* Full Text index are implemented by auxiliary tables, + not the B-tree. We also skip secondary indexes that are + being created online. */ + return(true); } buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE)); @@ -4679,7 +5158,7 @@ not_ok: "InnoDB: record ", stderr); rec_print_new(stderr, rec, offsets); putc('\n', stderr); - is_ok = FALSE; + is_ok = false; } else if (dict_index_is_unique(index) && !contains_null && matched_fields @@ -4709,9 +5188,8 @@ not_ok: mem_heap_empty(heap); - prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, - index, offsets, - &n_ext, heap); + prev_entry = row_rec_to_index_entry( + rec, index, offsets, &n_ext, heap); if (UNIV_LIKELY_NULL(tmp_heap)) { mem_heap_free(tmp_heap); @@ -4725,9 +5203,9 @@ not_ok: /*********************************************************************//** Determines if a table is a magic monitor table. -@return TRUE if monitor table */ +@return true if monitor table */ UNIV_INTERN -ibool +bool row_is_magic_monitor_table( /*=======================*/ const char* table_name) /*!< in: name of the table, in the @@ -4758,7 +5236,7 @@ row_mysql_init(void) { mutex_create( row_drop_list_mutex_key, - &row_drop_list_mutex, SYNC_NO_ORDER_CHECK); + &row_drop_list_mutex, SYNC_NO_ORDER_CHECK); UT_LIST_INIT(row_mysql_drop_list); diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index ab28b396920..ee603be453a 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,8 +42,10 @@ Created 3/14/1997 Heikki Tuuri #include "row0upd.h" #include "row0vers.h" #include "row0mysql.h" +#include "row0log.h" #include "log0log.h" #include "srv0mon.h" +#include "srv0start.h" /************************************************************************* IMPORTANT NOTE: Any operation that generates redo MUST check that there @@ -110,119 +112,134 @@ row_purge_reposition_pcur( return(node->found_clust); } +/** Status of row_purge_remove_clust() */ +enum row_purge_status { + ROW_PURGE_DONE, /*!< The row has been removed. */ + ROW_PURGE_FAIL, /*!< The purge was not successful. */ + ROW_PURGE_SUSPEND/*!< Cannot purge now, due to online rebuild. */ +}; + /***********************************************************//** Removes a delete marked clustered index record if possible. -@return TRUE if success, or if not found, or if modified after the -delete marking */ -static -ibool +@retval ROW_PURGE_DONE if the row was not found, or it was successfully removed +@retval ROW_PURGE_FAIL if the row was modified after the delete marking +@retval ROW_PURGE_SUSPEND if the row refers to an off-page column and +an online ALTER TABLE (table rebuild) is in progress. 
*/ +static __attribute__((nonnull, warn_unused_result)) +enum row_purge_status row_purge_remove_clust_if_poss_low( /*===============================*/ - purge_node_t* node, /*!< in: row purge node */ + purge_node_t* node, /*!< in/out: row purge node */ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { - dict_index_t* index; - btr_pcur_t* pcur; - btr_cur_t* btr_cur; - ibool success; - ulint err; - mtr_t mtr; - rec_t* rec; - mem_heap_t* heap = NULL; - ulint offsets_[REC_OFFS_NORMAL_SIZE]; + dict_index_t* index; + enum row_purge_status status = ROW_PURGE_DONE; + mtr_t mtr; + rec_t* rec; + mem_heap_t* heap = NULL; + ulint* offsets; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs_init(offsets_); - index = dict_table_get_first_index(node->table); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ - pcur = &node->pcur; - btr_cur = btr_pcur_get_btr_cur(pcur); + index = dict_table_get_first_index(node->table); log_free_check(); mtr_start(&mtr); - success = row_purge_reposition_pcur(mode, node, &mtr); - - if (!success) { - /* The record is already removed */ - - btr_pcur_commit_specify_mtr(pcur, &mtr); - - return(TRUE); + if (!row_purge_reposition_pcur(mode, node, &mtr)) { + /* The record was already removed. */ + goto func_exit; } - rec = btr_pcur_get_rec(pcur); + rec = btr_pcur_get_rec(&node->pcur); - if (node->roll_ptr != row_get_rec_roll_ptr( - rec, index, rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap))) { - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - /* Someone else has modified the record later: do not remove */ - btr_pcur_commit_specify_mtr(pcur, &mtr); + offsets = rec_get_offsets( + rec, index, offsets_, ULINT_UNDEFINED, &heap); - return(TRUE); + if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) { + /* Someone else has modified the record later: do not remove */ + goto func_exit; } - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); + if (dict_index_get_online_status(index) == ONLINE_INDEX_CREATION + && rec_offs_any_extern(offsets)) { + status = ROW_PURGE_SUSPEND; + goto func_exit; } if (mode == BTR_MODIFY_LEAF) { - success = btr_cur_optimistic_delete(btr_cur, &mtr); + status = btr_cur_optimistic_delete( + btr_pcur_get_btr_cur(&node->pcur), 0, &mtr) + ? ROW_PURGE_DONE : ROW_PURGE_FAIL; } else { + dberr_t err; ut_ad(mode == BTR_MODIFY_TREE); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, - RB_NONE, &mtr); + btr_cur_pessimistic_delete( + &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0, + RB_NONE, &mtr); - if (err == DB_SUCCESS) { - success = TRUE; - } else if (err == DB_OUT_OF_FILE_SPACE) { - success = FALSE; - } else { + switch (err) { + case DB_SUCCESS: + break; + case DB_OUT_OF_FILE_SPACE: + status = ROW_PURGE_FAIL; + break; + default: ut_error; } } - btr_pcur_commit_specify_mtr(pcur, &mtr); +func_exit: + if (heap) { + mem_heap_free(heap); + } - return(success); + btr_pcur_commit_specify_mtr(&node->pcur, &mtr); + + return(status); } /***********************************************************//** Removes a clustered index record if it has not been modified after the delete -marking. */ -static -void +marking. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended, either because of +running out of file space or because the row refers to an off-page +column and an online ALTER TABLE (table rebuild) is in progress. 
*/ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_remove_clust_if_poss( /*===========================*/ - purge_node_t* node) /*!< in: row purge node */ + purge_node_t* node) /*!< in/out: row purge node */ { - ibool success; - ulint n_tries = 0; - - /* fputs("Purge: Removing clustered record\n", stderr); */ - - success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF); - if (success) { - - return; + switch (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) { + case ROW_PURGE_DONE: + return(true); + case ROW_PURGE_SUSPEND: + return(false); + case ROW_PURGE_FAIL: + break; } -retry: - success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE); - /* The delete operation may fail if we have little - file space left: TODO: easiest to crash the database - and restart with more file space */ - if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) { - n_tries++; - - os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); - - goto retry; + for (ulint n_tries = 0; + n_tries < BTR_CUR_RETRY_DELETE_N_TIMES; + n_tries++) { + switch (row_purge_remove_clust_if_poss_low( + node, BTR_MODIFY_TREE)) { + case ROW_PURGE_DONE: + return(true); + case ROW_PURGE_SUSPEND: + return(false); + case ROW_PURGE_FAIL: + os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME); + } } - ut_a(success); + return(false); } /***********************************************************//** @@ -234,21 +251,21 @@ is newer than the purge view. NOTE: This function should only be called by the purge thread, only while holding a latch on the leaf page of the secondary index entry (or keeping the buffer pool watch on the page). It is possible that -this function first returns TRUE and then FALSE, if a user transaction +this function first returns true and then false, if a user transaction inserts a record that the secondary index entry would refer to. However, in that case, the user transaction would also re-insert the secondary index entry after purge has removed it and released the leaf page latch. -@return TRUE if the secondary index record can be purged */ +@return true if the secondary index record can be purged */ UNIV_INTERN -ibool +bool row_purge_poss_sec( /*===============*/ purge_node_t* node, /*!< in/out: row purge node */ dict_index_t* index, /*!< in: secondary index */ const dtuple_t* entry) /*!< in: secondary index entry */ { - ibool can_delete; + bool can_delete; mtr_t mtr; ut_ad(!dict_index_is_clust(index)); @@ -268,7 +285,7 @@ row_purge_poss_sec( Removes a secondary index entry if possible, by modifying the index tree. Does not try to buffer the delete. @return TRUE if success or if not found */ -static +static __attribute__((nonnull, warn_unused_result)) ibool row_purge_remove_sec_if_poss_tree( /*==============================*/ @@ -279,13 +296,35 @@ row_purge_remove_sec_if_poss_tree( btr_pcur_t pcur; btr_cur_t* btr_cur; ibool success = TRUE; - ulint err; + dberr_t err; mtr_t mtr; enum row_search_result search_result; log_free_check(); mtr_start(&mtr); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + mtr_x_lock(dict_index_get_lock(index), &mtr); + + if (dict_index_is_online_ddl(index)) { + /* Online secondary index creation will not + copy any delete-marked records. Therefore + there is nothing to be purged. We must also + skip the purge when a completed index is + dropped by rollback_inplace_alter_table(). 
*/ + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } + search_result = row_search_index_entry(index, entry, BTR_MODIFY_TREE, &pcur, &mtr); @@ -327,7 +366,7 @@ row_purge_remove_sec_if_poss_tree( & rec_get_info_bits(btr_cur_get_rec(btr_cur), dict_table_is_comp(index->table))); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, RB_NONE, &mtr); switch (UNIV_EXPECT(err, DB_SUCCESS)) { case DB_SUCCESS: @@ -342,6 +381,7 @@ row_purge_remove_sec_if_poss_tree( func_exit: btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(success); @@ -350,9 +390,10 @@ func_exit: /*************************************************************** Removes a secondary index entry without modifying the index tree, if possible. -@return TRUE if success or if not found */ -static -ibool +@retval true if success or if not found +@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_remove_sec_if_poss_leaf( /*==============================*/ purge_node_t* node, /*!< in: row purge node */ @@ -361,12 +402,40 @@ row_purge_remove_sec_if_poss_leaf( { mtr_t mtr; btr_pcur_t pcur; + ulint mode; enum row_search_result search_result; + bool success = true; log_free_check(); mtr_start(&mtr); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + mtr_s_lock(dict_index_get_lock(index), &mtr); + + if (dict_index_is_online_ddl(index)) { + /* Online secondary index creation will not + copy any delete-marked records. Therefore + there is nothing to be purged. We must also + skip the purge when a completed index is + dropped by rollback_inplace_alter_table(). */ + goto func_exit_no_pcur; + } + + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED | BTR_DELETE; + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + + mode = BTR_MODIFY_LEAF | BTR_DELETE; + } + /* Set the purge node for the call to row_purge_poss_sec(). */ pcur.btr_cur.purge_node = node; /* Set the query thread, so that ibuf_insert_low() will be @@ -374,10 +443,9 @@ row_purge_remove_sec_if_poss_leaf( pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node)); search_result = row_search_index_entry( - index, entry, BTR_MODIFY_LEAF | BTR_DELETE, &pcur, &mtr); + index, entry, mode, &pcur, &mtr); switch (search_result) { - ibool success; case ROW_FOUND: /* Before attempting to purge a record, check if it is safe to do so. */ @@ -390,11 +458,10 @@ row_purge_remove_sec_if_poss_leaf( btr_cur_get_rec(btr_cur), dict_table_is_comp(index->table))); - if (!btr_cur_optimistic_delete(btr_cur, &mtr)) { + if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { /* The index entry could not be deleted. */ - success = FALSE; - goto func_exit; + success = false; } } /* fall through (the index entry is still needed, @@ -405,9 +472,8 @@ row_purge_remove_sec_if_poss_leaf( /* The deletion was buffered. */ case ROW_NOT_FOUND: /* The index entry does not exist, nothing to do. 
*/ - success = TRUE; - func_exit: btr_pcur_close(&pcur); + func_exit_no_pcur: mtr_commit(&mtr); return(success); } @@ -418,19 +484,26 @@ row_purge_remove_sec_if_poss_leaf( /***********************************************************//** Removes a secondary index entry if possible. */ -UNIV_INLINE +UNIV_INLINE __attribute__((nonnull(1,2))) void row_purge_remove_sec_if_poss( /*=========================*/ purge_node_t* node, /*!< in: row purge node */ dict_index_t* index, /*!< in: index */ - dtuple_t* entry) /*!< in: index entry */ + const dtuple_t* entry) /*!< in: index entry */ { ibool success; ulint n_tries = 0; /* fputs("Purge: Removing secondary record\n", stderr); */ + if (!entry) { + /* The node->row must have lacked some fields of this + index. This is possible when the undo log record was + written before this index was created. */ + return; + } + if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) { return; @@ -454,18 +527,18 @@ retry: } /***********************************************************//** -Purges a delete marking of a record. */ -static -void +Purges a delete marking of a record. +@retval true if the row was not found, or it was successfully removed +@retval false the purge needs to be suspended, either because of +running out of file space or because the row refers to an off-page +column and an online ALTER TABLE (table rebuild) is in progress. */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_del_mark( /*===============*/ - purge_node_t* node) /*!< in: row purge node */ + purge_node_t* node) /*!< in/out: row purge node */ { mem_heap_t* heap; - dtuple_t* entry; - dict_index_t* index; - - ut_ad(node); heap = mem_heap_create(1024); @@ -477,13 +550,11 @@ row_purge_del_mark( break; } - index = node->index; - if (node->index->type != DICT_FTS) { - /* Build the index entry */ - entry = row_build_index_entry(node->row, NULL, index, heap); - ut_a(entry); - row_purge_remove_sec_if_poss(node, index, entry); + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, heap); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); } node->index = dict_table_get_next_index(node->index); @@ -491,14 +562,15 @@ row_purge_del_mark( mem_heap_free(heap); - row_purge_remove_clust_if_poss(node); + return(row_purge_remove_clust_if_poss(node)); } /***********************************************************//** Purges an update of an existing record. Also purges an update of a delete -marked record if that record contained an externally stored field. */ -static -void +marked record if that record contained an externally stored field. 
+@return true if purged, false if skipped */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_upd_exist_or_extern_func( /*===============================*/ #ifdef UNIV_DEBUG @@ -508,16 +580,24 @@ row_purge_upd_exist_or_extern_func( trx_undo_rec_t* undo_rec) /*!< in: record to purge */ { mem_heap_t* heap; - dtuple_t* entry; - dict_index_t* index; - ibool is_insert; - ulint rseg_id; - ulint page_no; - ulint offset; - ulint i; - mtr_t mtr; - ut_ad(node); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); +#endif /* UNIV_SYNC_DEBUG */ + + if (dict_index_get_online_status(dict_table_get_first_index( + node->table)) + == ONLINE_INDEX_CREATION) { + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { + + const upd_field_t* ufield + = upd_get_nth_field(node->update, i); + + if (dfield_is_ext(&ufield->new_val)) { + return(false); + } + } + } if (node->rec_type == TRX_UNDO_UPD_DEL_REC || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { @@ -534,15 +614,13 @@ row_purge_upd_exist_or_extern_func( break; } - index = node->index; - if (row_upd_changes_ord_field_binary(node->index, node->update, thr, NULL, NULL)) { /* Build the older version of the index entry */ - entry = row_build_index_entry(node->row, NULL, - index, heap); - ut_a(entry); - row_purge_remove_sec_if_poss(node, index, entry); + dtuple_t* entry = row_build_index_entry_low( + node->row, NULL, node->index, heap); + row_purge_remove_sec_if_poss(node, node->index, entry); + mem_heap_empty(heap); } node->index = dict_table_get_next_index(node->index); @@ -552,7 +630,7 @@ row_purge_upd_exist_or_extern_func( skip_secondaries: /* Free possible externally stored fields */ - for (i = 0; i < upd_get_n_fields(node->update); i++) { + for (ulint i = 0; i < upd_get_n_fields(node->update); i++) { const upd_field_t* ufield = upd_get_nth_field(node->update, i); @@ -562,6 +640,12 @@ skip_secondaries: buf_block_t* block; ulint internal_offset; byte* data_field; + dict_index_t* index; + ibool is_insert; + ulint rseg_id; + ulint page_no; + ulint offset; + mtr_t mtr; /* We use the fact that new_val points to undo_rec and get thus the offset of @@ -590,9 +674,17 @@ skip_secondaries: index tree */ index = dict_table_get_first_index(node->table); - mtr_x_lock(dict_index_get_lock(index), &mtr); - +#ifdef UNIV_DEBUG + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_CREATION: + case ONLINE_INDEX_ABORTED_DROPPED: + ut_ad(0); + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_ABORTED: + break; + } +#endif /* UNIV_DEBUG */ /* NOTE: we must also acquire an X-latch to the root page of the tree. We will need it when we free pages from the tree. If the tree is of height 1, @@ -622,6 +714,8 @@ skip_secondaries: mtr_commit(&mtr); } } + + return(true); } #ifdef UNIV_DEBUG @@ -634,14 +728,14 @@ skip_secondaries: /***********************************************************//** Parses the row reference and other info in a modify undo log record. 
-@return TRUE if purge operation required */ +@return true if purge operation required */ static -ibool +bool row_purge_parse_undo_rec( /*=====================*/ purge_node_t* node, /*!< in: row undo node */ trx_undo_rec_t* undo_rec, /*!< in: record to purge */ - ibool* updated_extern, /*!< out: TRUE if an externally + bool* updated_extern, /*!< out: true if an externally stored field was updated */ que_thr_t* thr) /*!< in: query thread */ { @@ -665,40 +759,29 @@ row_purge_parse_undo_rec( if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) { - return(FALSE); + return(false); } ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, &info_bits); node->table = NULL; - if (type == TRX_UNDO_UPD_EXIST_REC - && node->cmpl_info & UPD_NODE_NO_ORD_CHANGE - && !(*updated_extern)) { - - /* Purge requires no changes to indexes: we may return */ - - return(FALSE); - } - /* Prevent DROP TABLE etc. from running when we are doing the purge for this row */ - rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__); + rw_lock_s_lock_inline(&dict_operation_lock, 0, __FILE__, __LINE__); - node->table = dict_table_open_on_id(table_id, FALSE); + node->table = dict_table_open_on_id(table_id, FALSE, FALSE); if (node->table == NULL) { -err_exit: /* The table has been dropped: no need to do purge */ - rw_lock_s_unlock_gen(&dict_operation_lock, 0); - return(FALSE); + goto err_exit; } if (node->table->ibd_file_missing) { /* We skip purge of missing .ibd files */ - dict_table_close(node->table, FALSE); + dict_table_close(node->table, FALSE, FALSE); node->table = NULL; @@ -708,12 +791,22 @@ err_exit: clust_index = dict_table_get_first_index(node->table); if (clust_index == NULL) { + /* The table was corrupt in the data dictionary. + dict_set_corrupted() works on an index, and + we do not have an index to call it with. */ +close_exit: + dict_table_close(node->table, FALSE, FALSE); +err_exit: + rw_lock_s_unlock(&dict_operation_lock); + return(false); + } - dict_table_close(node->table, FALSE); - - /* The table was corrupt in the data dictionary */ + if (type == TRX_UNDO_UPD_EXIST_REC + && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) + && !*updated_extern) { - goto err_exit; + /* Purge requires no changes to indexes: we may return */ + goto close_exit; } ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref), @@ -734,13 +827,14 @@ err_exit: node->heap); } - return(TRUE); + return(true); } /***********************************************************//** -Purges the parsed record. */ -static -void +Purges the parsed record. 
+@return true if purged, false if skipped */ +static __attribute__((nonnull, warn_unused_result)) +bool row_purge_record_func( /*==================*/ purge_node_t* node, /*!< in: row purge node */ @@ -748,10 +842,11 @@ row_purge_record_func( #ifdef UNIV_DEBUG const que_thr_t*thr, /*!< in: query thread */ #endif /* UNIV_DEBUG */ - ibool updated_extern) /*!< in: TRUE if external columns + bool updated_extern) /*!< in: whether external columns were updated */ { dict_index_t* clust_index; + bool purged = true; clust_index = dict_table_get_first_index(node->table); @@ -759,7 +854,10 @@ row_purge_record_func( switch (node->rec_type) { case TRX_UNDO_DEL_MARK_REC: - row_purge_del_mark(node); + purged = row_purge_del_mark(node); + if (!purged) { + break; + } MONITOR_INC(MONITOR_N_DEL_ROW_PURGE); break; default: @@ -768,20 +866,25 @@ row_purge_record_func( } /* fall through */ case TRX_UNDO_UPD_EXIST_REC: - row_purge_upd_exist_or_extern(thr, node, undo_rec); + purged = row_purge_upd_exist_or_extern(thr, node, undo_rec); + if (!purged) { + break; + } MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN); break; } if (node->found_clust) { btr_pcur_close(&node->pcur); + node->found_clust = FALSE; } if (node->table != NULL) { - dict_table_close(node->table, FALSE); + dict_table_close(node->table, FALSE, FALSE); node->table = NULL; } + return(purged); } #ifdef UNIV_DEBUG @@ -804,18 +907,24 @@ row_purge( trx_undo_rec_t* undo_rec, /*!< in: record to purge */ que_thr_t* thr) /*!< in: query thread */ { - ut_ad(node); - ut_ad(thr); - if (undo_rec != &trx_purge_dummy_rec) { - ibool updated_extern; + bool updated_extern; - if (row_purge_parse_undo_rec( - node, undo_rec, &updated_extern, thr)) { + while (row_purge_parse_undo_rec( + node, undo_rec, &updated_extern, thr)) { - row_purge_record(node, undo_rec, thr, updated_extern); + bool purged = row_purge_record( + node, undo_rec, thr, updated_extern); + + rw_lock_s_unlock(&dict_operation_lock); + + if (purged + || srv_shutdown_state != SRV_SHUTDOWN_NONE) { + return; + } - rw_lock_s_unlock_gen(&dict_operation_lock, 0); + /* Retry the purge in a second. */ + os_thread_sleep(1000000); } } } diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc new file mode 100644 index 00000000000..72e0bf43d77 --- /dev/null +++ b/storage/innobase/row/row0quiesce.cc @@ -0,0 +1,702 @@ +/***************************************************************************** + +Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file row/row0quiesce.cc +Quiesce a tablespace. + +Created 2012-02-08 by Sunny Bains. 
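
Stepping back to the row0purge.cc hunks above before the new row0quiesce.cc file continues: row_purge_upd_exist_or_extern() and row_purge_record() now report whether the record was actually purged, and they refuse to free externally stored columns while the clustered index is in ONLINE_INDEX_CREATION; row_purge() becomes a loop that releases dict_operation_lock and retries the whole parse/purge cycle after a one-second sleep until the record is purged or shutdown begins. A compact sketch of that retry shape, using stand-in callables rather than the InnoDB functions themselves:

        // Reader's sketch of the new retry loop in row_purge(); all names here
        // are simplified stand-ins for the InnoDB calls shown in the diff above.
        #include <chrono>
        #include <functional>
        #include <thread>

        void purge_with_retry(
                const std::function<bool()>& parse_undo_rec,   // re-acquires the dict S-lock
                const std::function<bool()>& purge_record,     // true = record purged
                const std::function<void()>& release_dict_lock,
                const std::function<bool()>& shutting_down)
        {
                // Keep retrying while the undo record can still be parsed
                // (i.e. the table has not been dropped in the meantime).
                while (parse_undo_rec()) {
                        bool purged = purge_record();   // may refuse to free BLOBs while
                                                        // an online index build is running
                        release_dict_lock();

                        if (purged || shutting_down()) {
                                return;
                        }
                        // Same back-off as os_thread_sleep(1000000) in the diff.
                        std::this_thread::sleep_for(std::chrono::seconds(1));
                }
        }
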
+*******************************************************/ + +#include "row0quiesce.h" +#include "row0mysql.h" + +#ifdef UNIV_NONINL +#include "row0quiesce.ic" +#endif + +#include "ibuf0ibuf.h" +#include "srv0start.h" +#include "trx0purge.h" + +/*********************************************************************//** +Write the meta data (index user fields) config file. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_index_fields( +/*===========================*/ + const dict_index_t* index, /*!< in: write the meta data for + this index */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte row[sizeof(ib_uint32_t) * 2]; + + for (ulint i = 0; i < index->n_fields; ++i) { + byte* ptr = row; + const dict_field_t* field = &index->fields[i]; + + mach_write_to_4(ptr, field->prefix_len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, field->fixed_len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_9", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index fields."); + + return(DB_IO_ERROR); + } + + /* Include the NUL byte in the length. */ + ib_uint32_t len = strlen(field->name) + 1; + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_10", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(field->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index column."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file index information. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_indexes( +/*======================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + { + byte row[sizeof(ib_uint32_t)]; + + /* Write the number of indexes in the table. */ + mach_write_to_4(row, UT_LIST_GET_LEN(table->indexes)); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_11", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index count."); + + return(DB_IO_ERROR); + } + } + + dberr_t err = DB_SUCCESS; + + /* Write the index meta data. 
*/ + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0 && err == DB_SUCCESS; + index = UT_LIST_GET_NEXT(indexes, index)) { + + byte* ptr; + byte row[sizeof(index_id_t) + + sizeof(ib_uint32_t) * 8]; + + ptr = row; + + ut_ad(sizeof(index_id_t) == 8); + mach_write_to_8(ptr, index->id); + ptr += sizeof(index_id_t); + + mach_write_to_4(ptr, index->space); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->page); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->type); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->trx_id_offset); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_user_defined_cols); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_uniq); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_nullable); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, index->n_fields); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_12", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index meta-data."); + + return(DB_IO_ERROR); + } + + /* Write the length of the index name. + NUL byte is included in the length. */ + ib_uint32_t len = strlen(index->name) + 1; + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_1", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(index->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing index name."); + + return(DB_IO_ERROR); + } + + err = row_quiesce_write_index_fields(index, file, thd); + } + + return(err); +} + +/*********************************************************************//** +Write the meta data (table columns) config file. Serialise the contents of +dict_col_t structure, along with the column name. All fields are serialized +as ib_uint32_t. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_table( +/*====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + dict_col_t* col; + byte row[sizeof(ib_uint32_t) * 7]; + + col = table->cols; + + for (ulint i = 0; i < table->n_cols; ++i, ++col) { + byte* ptr = row; + + mach_write_to_4(ptr, col->prtype); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->mtype); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->len); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->mbminmaxlen); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ind); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->ord_part); + ptr += sizeof(ib_uint32_t); + + mach_write_to_4(ptr, col->max_prefix); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_2", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table column data."); + + return(DB_IO_ERROR); + } + + /* Write out the column name as [len, byte array]. The len + includes the NUL byte. */ + ib_uint32_t len; + const char* col_name; + + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + + /* Include the NUL byte in the length. 
*/ + len = strlen(col_name) + 1; + ut_a(len > 1); + + mach_write_to_4(row, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_3", + close(fileno(file));); + + if (fwrite(row, 1, sizeof(len), file) != sizeof(len) + || fwrite(col_name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing column name."); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the meta data config file header. +@return DB_SUCCESS or error code. */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_header( +/*=====================*/ + const dict_table_t* table, /*!< in: write the meta data for + this table */ + FILE* file, /*!< in: file to write to */ + THD* thd) /*!< in/out: session */ +{ + byte value[sizeof(ib_uint32_t)]; + + /* Write the meta-data version number. */ + mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing meta-data version number."); + + return(DB_IO_ERROR); + } + + /* Write the server hostname. */ + ib_uint32_t len; + const char* hostname = server_get_hostname(); + + /* Play it safe and check for NULL. */ + if (hostname == 0) { + static const char NullHostname[] = "Hostname unknown"; + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to determine server hostname."); + + hostname = NullHostname; + } + + /* The server hostname includes the NUL byte. */ + len = strlen(hostname) + 1; + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(hostname, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing hostname."); + + return(DB_IO_ERROR); + } + + /* The table name includes the NUL byte. */ + ut_a(table->name != 0); + len = strlen(table->name) + 1; + + /* Write the table name. */ + mach_write_to_4(value, len); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file));); + + if (fwrite(&value, 1, sizeof(value), file) != sizeof(value) + || fwrite(table->name, 1, len, file) != len) { + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table name."); + + return(DB_IO_ERROR); + } + + byte row[sizeof(ib_uint32_t) * 3]; + + /* Write the next autoinc value. */ + mach_write_to_8(row, table->autoinc); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file));); + + if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table autoinc value."); + + return(DB_IO_ERROR); + } + + byte* ptr = row; + + /* Write the system page size. */ + mach_write_to_4(ptr, UNIV_PAGE_SIZE); + ptr += sizeof(ib_uint32_t); + + /* Write the table->flags. */ + mach_write_to_4(ptr, table->flags); + ptr += sizeof(ib_uint32_t); + + /* Write the number of columns in the table. 
*/ + mach_write_to_4(ptr, table->n_cols); + + DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file));); + + if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), + "while writing table meta-data."); + + return(DB_IO_ERROR); + } + + return(DB_SUCCESS); +} + +/*********************************************************************//** +Write the table meta data after quiesce. +@return DB_SUCCESS or error code */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t +row_quiesce_write_cfg( +/*==================*/ + dict_table_t* table, /*!< in: write the meta data for + this table */ + THD* thd) /*!< in/out: session */ +{ + dberr_t err; + char name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, name, sizeof(name)); + + ib_logf(IB_LOG_LEVEL_INFO, "Writing table metadata to '%s'", name); + + FILE* file = fopen(name, "w+b"); + + if (file == NULL) { + ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE, + name, errno, strerror(errno)); + + err = DB_IO_ERROR; + } else { + err = row_quiesce_write_header(table, file, thd); + + if (err == DB_SUCCESS) { + err = row_quiesce_write_table(table, file, thd); + } + + if (err == DB_SUCCESS) { + err = row_quiesce_write_indexes(table, file, thd); + } + + if (fflush(file) != 0) { + + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), "%s flush() failed", + name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), msg); + } + + if (fclose(file) != 0) { + char msg[BUFSIZ]; + + ut_snprintf(msg, sizeof(msg), "%s flose() failed", + name); + + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR, + errno, strerror(errno), msg); + } + } + + return(err); +} + +/*********************************************************************//** +Check whether a table has an FTS index defined on it. +@return true if an FTS index exists on the table */ +static +bool +row_quiesce_table_has_fts_index( +/*============================*/ + const dict_table_t* table) /*!< in: quiesce this table */ +{ + bool exists = false; + + dict_mutex_enter_for_mysql(); + + for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != 0; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (index->type & DICT_FTS) { + exists = true; + break; + } + } + + dict_mutex_exit_for_mysql(); + + return(exists); +} + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. 
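
Taken together, row_quiesce_write_header(), row_quiesce_write_table() and row_quiesce_write_indexes() above define the layout of the .cfg meta-data file that row_quiesce_write_cfg() produces for FLUSH TABLES ... FOR EXPORT: a format version, the server hostname and table name as length-prefixed strings, the autoinc value, the page size, the table flags and the column count, followed by one fixed-size record plus a name per column and per index. A standalone reader sketch for just the header portion, assuming the mach_write_to_4()/mach_write_to_8() encoding is big-endian; the helper names are illustrative and not part of the server:

        // Illustrative .cfg header reader; the field order mirrors
        // row_quiesce_write_header() above. Not InnoDB code.
        #include <cstdint>
        #include <cstdio>
        #include <string>
        #include <vector>

        static uint32_t read_u32(FILE* f) {             // counterpart of mach_write_to_4()
                unsigned char b[4] = {0, 0, 0, 0};
                if (fread(b, 1, sizeof(b), f) != sizeof(b)) return 0;
                return (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16)
                     | (uint32_t(b[2]) << 8)  |  uint32_t(b[3]);
        }

        static std::string read_str(FILE* f) {          // [length incl. NUL][bytes]
                uint32_t len = read_u32(f);
                if (len == 0) return std::string();
                std::vector<char> buf(len, '\0');
                if (fread(buf.data(), 1, len, f) != len) return std::string();
                return std::string(buf.data());         // trailing NUL dropped
        }

        int main(int argc, char** argv) {
                if (argc < 2) return 1;
                FILE* f = fopen(argv[1], "rb");
                if (f == NULL) return 1;

                uint32_t    version = read_u32(f);      // IB_EXPORT_CFG_VERSION_V1
                std::string host    = read_str(f);      // server hostname
                std::string table   = read_str(f);      // table name
                unsigned char autoinc[8];
                if (fread(autoinc, 1, sizeof(autoinc), f) != sizeof(autoinc)) return 1;
                uint32_t page_size = read_u32(f);       // UNIV_PAGE_SIZE
                uint32_t flags     = read_u32(f);       // table->flags
                uint32_t n_cols    = read_u32(f);       // column records follow, then indexes

                printf("cfg v%u host=%s table=%s page=%u flags=%#x cols=%u\n",
                       version, host.c_str(), table.c_str(), page_size, flags, n_cols);
                fclose(f);
                return 0;
        }

The column and index sections that follow the header can be decoded the same way, following the field order of row_quiesce_write_table() and row_quiesce_write_indexes() above.
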
*/ +UNIV_INTERN +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ut_a(trx->mysql_thd != 0); + ut_a(srv_n_purge_threads > 0); + ut_ad(!srv_read_only_mode); + + char table_name[MAX_FULL_NAME_LEN + 1]; + + ut_a(trx->mysql_thd != 0); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_logf(IB_LOG_LEVEL_INFO, + "Sync to disk of '%s' started.", table_name); + + if (trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_stop(); + } + + ut_a(table->id > 0); + + ulint count = 0; + + while (ibuf_contract_in_background(table->id, TRUE) != 0) { + if (!(++count % 20)) { + ib_logf(IB_LOG_LEVEL_INFO, + "Merging change buffer entries for '%s'", + table_name); + } + } + + if (!trx_is_interrupted(trx)) { + buf_LRU_flush_or_remove_pages( + table->space, BUF_REMOVE_FLUSH_WRITE, trx); + + if (trx_is_interrupted(trx)) { + + ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!"); + + } else if (row_quiesce_write_cfg(table, trx->mysql_thd) + != DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_WARN, + "There was an error writing to the " + "meta data file"); + } else { + ib_logf(IB_LOG_LEVEL_INFO, + "Table '%s' flushed to disk", table_name); + } + } else { + ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!"); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Cleanup after table quiesce. */ +UNIV_INTERN +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ +{ + ulint count = 0; + char table_name[MAX_FULL_NAME_LEN + 1]; + + ut_a(trx->mysql_thd != 0); + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + /* We need to wait for the operation to complete if the + transaction has been killed. */ + + while (table->quiesce != QUIESCE_COMPLETE) { + + /* Print a warning after every minute. */ + if (!(count % 60)) { + ib_logf(IB_LOG_LEVEL_WARN, + "Waiting for quiesce of '%s' to complete", + table_name); + } + + /* Sleep for a second. */ + os_thread_sleep(1000000); + + ++count; + } + + /* Remove the .cfg file now that the user has resumed + normal operations. Otherwise it will cause problems when + the user tries to drop the database (remove directory). */ + char cfg_name[OS_FILE_MAX_PATH]; + + srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name)); + + os_file_delete_if_exists(cfg_name); + + ib_logf(IB_LOG_LEVEL_INFO, + "Deleting the meta-data file '%s'", cfg_name); + + if (trx_purge_state() != PURGE_STATE_DISABLED) { + trx_purge_run(); + } + + dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx); + ut_a(err == DB_SUCCESS); +} + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. 
*/ +UNIV_INTERN +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ +{ + ut_a(srv_n_purge_threads > 0); + + if (srv_read_only_mode) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + return(DB_UNSUPPORTED); + + } else if (table->space == TRX_SYS_SPACE) { + + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof(table_name), table->name, FALSE); + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_TABLE_IN_SYSTEM_TABLESPACE, table_name); + + return(DB_UNSUPPORTED); + } else if (row_quiesce_table_has_fts_index(table)) { + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on tables that have an FTS index. " + "FTS auxiliary tables will not be flushed."); + + } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) { + /* If this flag is set then the table may not have any active + FTS indexes but it will still have the auxiliary tables. */ + + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN, + ER_NOT_SUPPORTED_YET, + "FLUSH TABLES on a table that had an FTS index, " + "created on a hidden column, the " + "auxiliary tables haven't been dropped as yet. " + "FTS auxiliary tables will not be flushed."); + } + + row_mysql_lock_data_dictionary(trx); + + dict_table_x_lock_indexes(table); + + switch (state) { + case QUIESCE_START: + ut_a(table->quiesce == QUIESCE_NONE); + break; + + case QUIESCE_COMPLETE: + ut_a(table->quiesce == QUIESCE_START); + break; + + case QUIESCE_NONE: + ut_a(table->quiesce == QUIESCE_COMPLETE); + break; + } + + table->quiesce = state; + + dict_table_x_unlock_indexes(table); + + row_mysql_unlock_data_dictionary(trx); + + return(DB_SUCCESS); +} + diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc index 8c703b1e06c..be786f954fb 100644 --- a/storage/innobase/row/row0row.cc +++ b/storage/innobase/row/row0row.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -50,28 +50,26 @@ Created 4/20/1996 Heikki Tuuri /*****************************************************************//** When an insert or purge to a table is performed, this function builds the entry to be inserted into or purged from an index on the table. -@return index entry which should be inserted or purged, or NULL if the -externally stored columns in the clustered index record are -unavailable and ext != NULL */ +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. 
*/ UNIV_INTERN dtuple_t* -row_build_index_entry( -/*==================*/ - const dtuple_t* row, /*!< in: row which should be - inserted or purged */ - row_ext_t* ext, /*!< in: externally stored column prefixes, - or NULL */ - dict_index_t* index, /*!< in: index on the table */ - mem_heap_t* heap) /*!< in: memory heap from which the memory for - the index entry is allocated */ +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory for the index entry + is allocated */ { dtuple_t* entry; ulint entry_len; ulint i; - ut_ad(row && index && heap); - ut_ad(dtuple_check_typed(row)); - entry_len = dict_index_get_n_fields(index); entry = dtuple_create(heap, entry_len); @@ -96,8 +94,19 @@ row_build_index_entry( = dtuple_get_nth_field(entry, i); const dfield_t* dfield2 = dtuple_get_nth_field(row, col_no); - ulint len - = dfield_get_len(dfield2); + ulint len; + +#if DATA_MISSING != 0 +# error "DATA_MISSING != 0" +#endif + if (UNIV_UNLIKELY(dfield_get_type(dfield2)->mtype + == DATA_MISSING)) { + /* The field has not been initialized in the row. + This should be from trx_undo_rec_get_partial_row(). */ + return(NULL); + } + + len = dfield_get_len(dfield2); dfield_copy(dfield, dfield2); @@ -171,8 +180,6 @@ row_build_index_entry( } } - ut_ad(dtuple_check_typed(entry)); - return(entry); } @@ -211,21 +218,23 @@ row_build( of an index, or NULL if index->table should be consulted instead */ + const dtuple_t* add_cols, + /*!< in: default values of + added columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ row_ext_t** ext, /*!< out, own: cache of externally stored column prefixes, or NULL */ mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ { + const byte* copy; dtuple_t* row; - const dict_table_t* table; - ulint n_fields; ulint n_ext_cols; ulint* ext_cols = NULL; /* remove warning */ ulint len; - ulint row_len; byte* buf; - ulint i; ulint j; mem_heap_t* tmp_heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; @@ -234,6 +243,7 @@ row_build( ut_ad(index && rec && heap); ut_ad(dict_index_is_clust(index)); ut_ad(!mutex_own(&trx_sys->mutex)); + ut_ad(!col_map || col_table); if (!offsets) { offsets = rec_get_offsets(rec, index, offsets_, @@ -260,55 +270,84 @@ row_build( buf = static_cast<byte*>( mem_heap_alloc(heap, rec_offs_size(offsets))); - rec = rec_copy(buf, rec, offsets); - /* Avoid a debug assertion in rec_offs_validate(). */ - rec_offs_make_valid(rec, index, (ulint*) offsets); + copy = rec_copy(buf, rec, offsets); + } else { + copy = rec; } - table = index->table; - row_len = dict_table_get_n_cols(table); - - row = dtuple_create(heap, row_len); - - dict_table_copy_types(row, table); - - dtuple_set_info_bits(row, rec_get_info_bits( - rec, dict_table_is_comp(table))); - - n_fields = rec_offs_n_fields(offsets); n_ext_cols = rec_offs_n_extern(offsets); if (n_ext_cols) { ext_cols = static_cast<ulint*>( mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols)); } - for (i = j = 0; i < n_fields; i++) { - dict_field_t* ind_field + /* Avoid a debug assertion in rec_offs_validate(). 
*/ + rec_offs_make_valid(copy, index, const_cast<ulint*>(offsets)); + + if (!col_table) { + ut_ad(!col_map); + ut_ad(!add_cols); + col_table = index->table; + } + + if (add_cols) { + ut_ad(col_map); + row = dtuple_copy(add_cols, heap); + /* dict_table_copy_types() would set the fields to NULL */ + for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) { + dict_col_copy_type( + dict_table_get_nth_col(col_table, i), + dfield_get_type(dtuple_get_nth_field(row, i))); + } + } else { + row = dtuple_create(heap, dict_table_get_n_cols(col_table)); + dict_table_copy_types(row, col_table); + } + + dtuple_set_info_bits(row, rec_get_info_bits( + copy, rec_offs_comp(offsets))); + + j = 0; + + for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) { + const dict_field_t* ind_field = dict_index_get_nth_field(index, i); + + if (ind_field->prefix_len) { + /* Column prefixes can only occur in key + fields, which cannot be stored externally. For + a column prefix, there should also be the full + field in the clustered index tuple. The row + tuple comprises full fields, not prefixes. */ + ut_ad(!rec_offs_nth_extern(offsets, i)); + continue; + } + const dict_col_t* col = dict_field_get_col(ind_field); ulint col_no = dict_col_get_no(col); - dfield_t* dfield - = dtuple_get_nth_field(row, col_no); - - if (ind_field->prefix_len == 0) { - const byte* field = rec_get_nth_field( - rec, offsets, i, &len); + if (col_map) { + col_no = col_map[col_no]; - dfield_set_data(dfield, field, len); + if (col_no == ULINT_UNDEFINED) { + /* dropped column */ + continue; + } } + dfield_t* dfield = dtuple_get_nth_field(row, col_no); + + const byte* field = rec_get_nth_field( + copy, offsets, i, &len); + + dfield_set_data(dfield, field, len); + if (rec_offs_nth_extern(offsets, i)) { dfield_set_ext(dfield); - if (UNIV_LIKELY_NULL(col_table)) { - ut_a(col_no - < dict_table_get_n_cols(col_table)); - col = dict_table_get_nth_col( - col_table, col_no); - } + col = dict_table_get_nth_col(col_table, col_no); if (col->ord_part) { /* We will have to fetch prefixes of @@ -319,14 +358,20 @@ row_build( } } + rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets)); + ut_ad(dtuple_check_typed(row)); if (!ext) { /* REDUNDANT and COMPACT formats store a local 768-byte prefix of each externally stored - column. No cache is needed. */ - ut_ad(dict_table_get_format(index->table) - < UNIV_FORMAT_B); + column. No cache is needed. + + During online table rebuild, + row_log_table_apply_delete_low() + may use a cache that was set up by + row_log_table_delete(). */ + } else if (j) { *ext = row_ext_create(j, ext_cols, index->table->flags, row, heap); @@ -402,28 +447,14 @@ row_rec_to_index_entry_low( /*******************************************************************//** Converts an index record to a typed data tuple. NOTE that externally stored (often big) fields are NOT copied to heap. -@return own: index entry built; see the NOTE below! */ +@return own: index entry built */ UNIV_INTERN dtuple_t* row_rec_to_index_entry( /*===================*/ - ulint type, /*!< in: ROW_COPY_DATA, or - ROW_COPY_POINTERS: the former - copies also the data fields to - heap as the latter only places - pointers to data fields on the - index page */ - const rec_t* rec, /*!< in: record in the index; - NOTE: in the case - ROW_COPY_POINTERS the data - fields in the row will point - directly into this record, - therefore, the buffer page of - this record must be at least - s-latched and the latch held - as long as the dtuple is used! 
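
The reworked row_build() above takes two new arguments so that online ALTER TABLE can build rows directly in the new table's column order: add_cols supplies default values for added columns, and col_map translates old column numbers to new ones, with ULINT_UNDEFINED marking a dropped column. A small self-contained sketch of that remapping step (plain strings stand in for dfields, SIZE_MAX for ULINT_UNDEFINED; this is an illustration, not the InnoDB code path):

        // Sketch of the col_map remapping done in row_build() above.
        #include <cstddef>
        #include <cstdint>
        #include <string>
        #include <vector>

        std::vector<std::string> build_new_row(
                const std::vector<std::string>& old_row,      // fields in old column order
                const std::vector<std::size_t>& col_map,      // old col no -> new col no
                std::vector<std::string>        new_row)      // starts as add_cols defaults
        {
                for (std::size_t old_no = 0; old_no < old_row.size(); ++old_no) {
                        std::size_t new_no = col_map[old_no];
                        if (new_no == SIZE_MAX) {
                                continue;                     // column dropped by the ALTER
                        }
                        new_row[new_no] = old_row[old_no];    // overwrite the default value
                }
                return new_row;
        }
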
*/ + const rec_t* rec, /*!< in: record in the index */ const dict_index_t* index, /*!< in: index */ - ulint* offsets,/*!< in/out: rec_get_offsets(rec) */ + const ulint* offsets,/*!< in: rec_get_offsets(rec) */ ulint* n_ext, /*!< out: number of externally stored columns */ mem_heap_t* heap) /*!< in: memory heap from which @@ -431,25 +462,21 @@ row_rec_to_index_entry( { dtuple_t* entry; byte* buf; + const rec_t* copy_rec; ut_ad(rec && heap && index); ut_ad(rec_offs_validate(rec, index, offsets)); - if (type == ROW_COPY_DATA) { - /* Take a copy of rec to heap */ - buf = static_cast<byte*>( - mem_heap_alloc(heap, rec_offs_size(offsets))); + /* Take a copy of rec to heap */ + buf = static_cast<byte*>( + mem_heap_alloc(heap, rec_offs_size(offsets))); - rec = rec_copy(buf, rec, offsets); - /* Avoid a debug assertion in rec_offs_validate(). */ - rec_offs_make_valid(rec, index, offsets); -#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG - } else { - ut_a(!rec_offs_any_null_extern(rec, offsets)); -#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ - } + copy_rec = rec_copy(buf, rec, offsets); - entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap); + rec_offs_make_valid(copy_rec, index, const_cast<ulint*>(offsets)); + entry = row_rec_to_index_entry_low( + copy_rec, index, offsets, n_ext, heap); + rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets)); dtuple_set_info_bits(entry, rec_get_info_bits(rec, rec_offs_comp(offsets))); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 96884e89511..bfda669d97a 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -57,7 +57,6 @@ Created 12/19/1997 Heikki Tuuri #include "read0read.h" #include "buf0lru.h" #include "ha_prototypes.h" -#include "srv0mon.h" #include "my_compare.h" /* enum icp_result */ @@ -673,8 +672,8 @@ sel_enqueue_prefetched_row( /*********************************************************************//** Builds a previous version of a clustered index record for a consistent read @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_build_prev_vers( /*====================*/ read_view_t* read_view, /*!< in: read view */ @@ -691,7 +690,7 @@ row_sel_build_prev_vers( afterwards */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; + dberr_t err; if (*old_vers_heap) { mem_heap_empty(*old_vers_heap); @@ -707,10 +706,9 @@ row_sel_build_prev_vers( /*********************************************************************//** Builds the last committed version of a clustered index record for a -semi-consistent read. -@return DB_SUCCESS or error code */ -static -ulint +semi-consistent read. 
*/ +static __attribute__((nonnull)) +void row_sel_build_committed_vers_for_mysql( /*===================================*/ dict_index_t* clust_index, /*!< in: clustered index */ @@ -726,18 +724,16 @@ row_sel_build_committed_vers_for_mysql( afterwards */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; - if (prebuilt->old_vers_heap) { mem_heap_empty(prebuilt->old_vers_heap); } else { - prebuilt->old_vers_heap = mem_heap_create(200); + prebuilt->old_vers_heap = mem_heap_create( + rec_offs_size(*offsets)); } - err = row_vers_build_for_semi_consistent_read( + row_vers_build_for_semi_consistent_read( rec, mtr, clust_index, offsets, offset_heap, prebuilt->old_vers_heap, old_vers); - return(err); } /*********************************************************************//** @@ -809,8 +805,8 @@ row_sel_test_other_conds( Retrieves the clustered index record corresponding to a record in a non-clustered index. Does the necessary locking. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_get_clust_rec( /*==================*/ sel_node_t* node, /*!< in: select_node */ @@ -828,7 +824,7 @@ row_sel_get_clust_rec( dict_index_t* index; rec_t* clust_rec; rec_t* old_vers; - ulint err; + dberr_t err; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; @@ -982,7 +978,7 @@ err_exit: Sets a lock on a record. @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ UNIV_INLINE -enum db_err +dberr_t sel_set_rec_lock( /*=============*/ const buf_block_t* block, /*!< in: buffer block of rec */ @@ -995,7 +991,7 @@ sel_set_rec_lock( que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; - enum db_err err; + dberr_t err; trx = thr_get_trx(thr); @@ -1084,7 +1080,7 @@ row_sel_open_pcur( (FALSE: no init) */ btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF, - &(plan->pcur), FALSE, mtr); + &(plan->pcur), false, 0, mtr); } ut_ad(plan->n_rows_prefetched == 0); @@ -1313,8 +1309,8 @@ func_exit: /*********************************************************************//** Performs a select step. @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel( /*====*/ sel_node_t* node, /*!< in: select node */ @@ -1347,7 +1343,7 @@ row_sel( &mtr must be committed before we move to the next non-clustered record */ ulint found_flag; - ulint err; + dberr_t err; mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; @@ -2083,11 +2079,9 @@ row_sel_step( table_node = static_cast<sym_node_t*>( que_node_get_next(table_node))) { - enum db_err err; - - err = static_cast<enum db_err>(lock_table( + dberr_t err = lock_table( 0, table_node->table, i_lock_mode, - thr)); + thr); if (err != DB_SUCCESS) { trx_t* trx; @@ -2120,7 +2114,7 @@ row_sel_step( } } - enum db_err err = static_cast<enum db_err>(row_sel(node, thr)); + dberr_t err = row_sel(node, thr); /* NOTE! if queries are parallelized, the following assignment may have problems; the assignment should be made only if thr is the @@ -2305,42 +2299,6 @@ row_printf_step( return(thr); } -/******************************************************************** -Creates a key in Innobase dtuple format.*/ - -void -row_create_key( -/*===========*/ - dtuple_t* tuple, /* in: tuple where to build; - NOTE: we assume that the type info - in the tuple is already according - to index! */ - dict_index_t* index, /* in: index of the key value */ - doc_id_t* doc_id) /* in: doc id to search. 
*/ -{ - dtype_t type; - dict_field_t* field; - doc_id_t temp_doc_id; - dfield_t* dfield = dtuple_get_nth_field(tuple, 0); - - ut_a(dict_index_get_n_unique(index) == 1); - - /* Permit us to access any field in the tuple (ULINT_MAX): */ - dtuple_set_n_fields(tuple, ULINT_MAX); - - field = dict_index_get_nth_field(index, 0); - dict_col_copy_type(field->col, &type); - ut_a(dtype_get_mtype(&type) == DATA_INT); - - /* Convert to storage byte order */ - mach_write_to_8((byte*) &temp_doc_id, *doc_id); - *doc_id = temp_doc_id; - - ut_a(sizeof(*doc_id) == field->fixed_len); - dfield_set_data(dfield, doc_id, field->fixed_len); - - dtuple_set_n_fields(tuple, 1); -} /****************************************************************//** Converts a key value stored in MySQL format to an Innobase dtuple. The last field of the key value may be just a prefix of a fixed length field: hence @@ -2536,6 +2494,7 @@ row_sel_convert_mysql_key_to_innobase( dfield_set_len(dfield, len - (ulint) (key_ptr - key_end)); } + ut_ad(0); } n_fields++; @@ -3008,8 +2967,8 @@ row_sel_store_mysql_rec( /*********************************************************************//** Builds a previous version of a clustered index record for a consistent read @return DB_SUCCESS or error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_build_prev_vers_for_mysql( /*==============================*/ read_view_t* read_view, /*!< in: read view */ @@ -3026,7 +2985,7 @@ row_sel_build_prev_vers_for_mysql( afterwards */ mtr_t* mtr) /*!< in: mtr */ { - ulint err; + dberr_t err; if (prebuilt->old_vers_heap) { mem_heap_empty(prebuilt->old_vers_heap); @@ -3045,8 +3004,8 @@ Retrieves the clustered index record corresponding to a record in a non-clustered index. Does the necessary locking. Used in the MySQL interface. @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */ -static -enum db_err +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_sel_get_clust_rec_for_mysql( /*============================*/ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */ @@ -3073,7 +3032,7 @@ row_sel_get_clust_rec_for_mysql( dict_index_t* clust_index; const rec_t* clust_rec; rec_t* old_vers; - enum db_err err; + dberr_t err; trx_t* trx; *out_rec = NULL; @@ -3172,17 +3131,13 @@ row_sel_get_clust_rec_for_mysql( clust_rec, clust_index, *offsets, trx->read_view)) { - ulint db_err; - /* The following call returns 'offsets' associated with 'old_vers' */ - db_err = row_sel_build_prev_vers_for_mysql( + err = row_sel_build_prev_vers_for_mysql( trx->read_view, clust_index, prebuilt, clust_rec, offsets, offset_heap, &old_vers, mtr); - err = static_cast<enum db_err>(db_err); - if (err != DB_SUCCESS || old_vers == NULL) { goto err_exit; @@ -3226,7 +3181,10 @@ row_sel_get_clust_rec_for_mysql( func_exit: *out_rec = clust_rec; - if (prebuilt->select_lock_type != LOCK_NONE) { + /* Store the current position if select_lock_type is not + LOCK_NONE or if we are scanning using InnoDB APIs */ + if (prebuilt->select_lock_type != LOCK_NONE + || prebuilt->innodb_api) { /* We may use the cursor in update or in unlock_row(): store its position */ @@ -3633,7 +3591,7 @@ row_search_idx_cond_check( return(result); case ICP_ERROR: case ICP_ABORTED_BY_USER: - return(result); + return(result); } ut_error; @@ -3649,7 +3607,7 @@ position and fetch next or fetch prev must not be tried to the cursor! 
@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ UNIV_INTERN -ulint +dberr_t row_search_for_mysql( /*=================*/ byte* buf, /*!< in/out: buffer for the fetched @@ -3678,9 +3636,9 @@ row_search_for_mysql( dict_index_t* clust_index; que_thr_t* thr; const rec_t* rec; - const rec_t* result_rec; + const rec_t* result_rec = NULL; const rec_t* clust_rec; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ibool unique_search = FALSE; ibool mtr_has_extra_clust_latch = FALSE; ibool moves_up = FALSE; @@ -3701,48 +3659,41 @@ row_search_for_mysql( ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets = offsets_; ibool table_lock_waited = FALSE; + byte* next_buf = 0; rec_offs_init(offsets_); ut_ad(index && pcur && search_tuple); - if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error:\n" - "InnoDB: MySQL is trying to use a table handle" - " but the .ibd file for\n" - "InnoDB: table %s does not exist.\n" - "InnoDB: Have you deleted the .ibd file" - " from the database directory under\n" - "InnoDB: the MySQL datadir, or have you used" - " DISCARD TABLESPACE?\n" - "InnoDB: Look from\n" - "InnoDB: " REFMAN "innodb-troubleshooting.html\n" - "InnoDB: how you can resolve the problem.\n", - prebuilt->table->name); + /* We don't support FTS queries from the HANDLER interfaces, because + we implemented FTS as reversed inverted index with auxiliary tables. + So anything related to traditional index query would not apply to + it. */ + if (index->type & DICT_FTS) { + return(DB_END_OF_INDEX); + } #ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); #endif /* UNIV_SYNC_DEBUG */ - return(DB_ERROR); - } - if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + if (dict_table_is_discarded(prebuilt->table)) { + + return(DB_TABLESPACE_DELETED); + + } else if (prebuilt->table->ibd_file_missing) { + + return(DB_TABLESPACE_NOT_FOUND); + + } else if (!prebuilt->index_usable) { -#ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); -#endif /* UNIV_SYNC_DEBUG */ return(DB_MISSING_HISTORY); - } - if (dict_index_is_corrupted(index)) { -#ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); -#endif /* UNIV_SYNC_DEBUG */ + } else if (dict_index_is_corrupted(index)) { + return(DB_CORRUPTION); - } - if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { + } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" "InnoDB: table handle. 
Magic n %lu, table name ", @@ -3846,7 +3797,6 @@ row_search_for_mysql( prebuilt->n_rows_fetched++; - srv_n_rows_read++; err = DB_SUCCESS; goto func_exit; } @@ -3925,7 +3875,8 @@ row_search_for_mysql( && dict_index_is_clust(index) && !prebuilt->templ_contains_blob && !prebuilt->used_in_HANDLER - && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8) + && !prebuilt->innodb_api) { mode = PAGE_CUR_GE; @@ -3973,8 +3924,8 @@ row_search_for_mysql( rec, offsets)) { case ICP_NO_MATCH: case ICP_OUT_OF_RANGE: - case ICP_ERROR: case ICP_ABORTED_BY_USER: + case ICP_ERROR: goto shortcut_mismatch; case ICP_MATCH: goto shortcut_match; @@ -4005,8 +3956,6 @@ row_search_for_mysql( /* ut_print_name(stderr, index->name); fputs(" shortcut\n", stderr); */ - srv_n_rows_read++; - err = DB_SUCCESS; goto release_search_latch_if_needed; @@ -4179,12 +4128,12 @@ wait_table_again: /* Try to place a gap lock on the next index record to prevent phantoms in ORDER BY ... DESC queries */ - const rec_t* next = page_rec_get_next_const(rec); + const rec_t* next_rec = page_rec_get_next_const(rec); - offsets = rec_get_offsets(next, index, offsets, + offsets = rec_get_offsets(next_rec, index, offsets, ULINT_UNDEFINED, &heap); err = sel_set_rec_lock(btr_pcur_get_block(pcur), - next, index, offsets, + next_rec, index, offsets, prebuilt->select_lock_type, LOCK_GAP, thr); @@ -4197,16 +4146,10 @@ wait_table_again: goto lock_wait_or_error; } } - } else { - if (mode == PAGE_CUR_G) { - btr_pcur_open_at_index_side( - TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE, - &mtr); - } else if (mode == PAGE_CUR_L) { - btr_pcur_open_at_index_side( - FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE, - &mtr); - } + } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side( + mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF, + pcur, false, 0, &mtr); } rec_loop: @@ -4348,6 +4291,9 @@ wrong_offs: /* Calculate the 'offsets' associated with 'rec' */ + ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX); + ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id); + offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); if (UNIV_UNLIKELY(srv_force_recovery > 0)) { @@ -4539,15 +4485,10 @@ no_gap_lock: /* The following call returns 'offsets' associated with 'old_vers' */ - err = row_sel_build_committed_vers_for_mysql( + row_sel_build_committed_vers_for_mysql( clust_index, prebuilt, rec, &offsets, &heap, &old_vers, &mtr); - if (err != DB_SUCCESS) { - - goto lock_wait_or_error; - } - /* Check whether it was a deadlock or not, if not a deadlock and the transaction had to wait then release the lock it is waiting on. */ @@ -4649,8 +4590,8 @@ no_gap_lock: case ICP_NO_MATCH: goto next_rec; case ICP_OUT_OF_RANGE: - case ICP_ERROR: case ICP_ABORTED_BY_USER: + case ICP_ERROR: err = DB_RECORD_NOT_FOUND; goto idx_cond_failed; case ICP_MATCH: @@ -4690,12 +4631,15 @@ locks_ok: delete marked record and the record following it. For now this is applicable only to clustered indexes while - doing a unique search. There is scope for further optimization + doing a unique search except for HANDLER queries because + HANDLER allows NEXT and PREV even in unique search on + clustered index. There is scope for further optimization applicable to unique secondary indexes. 
Current behaviour is to widen the scope of a lock on an already delete marked record if the same record is deleted twice by the same transaction */ if (index == clust_index && unique_search - && !prebuilt->used_in_HANDLER) { + && !prebuilt->used_in_HANDLER) { + err = DB_RECORD_NOT_FOUND; goto normal_return; @@ -4712,8 +4656,8 @@ locks_ok: } goto next_rec; case ICP_OUT_OF_RANGE: - case ICP_ERROR: case ICP_ABORTED_BY_USER: + case ICP_ERROR: err = DB_RECORD_NOT_FOUND; goto idx_cond_failed; case ICP_MATCH: @@ -4831,9 +4775,10 @@ requires_clust_rec: && !prebuilt->templ_contains_blob && !prebuilt->clust_index_was_generated && !prebuilt->used_in_HANDLER + && !prebuilt->innodb_api && prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE - && !prebuilt->result) { + && !prebuilt->in_fts_query) { /* Inside an update, for example, we do not cache rows, since we may use the cursor position to do the actual @@ -4849,29 +4794,58 @@ requires_clust_rec: /* We only convert from InnoDB row format to MySQL row format when ICP is disabled. */ - if (!prebuilt->idx_cond - && !row_sel_store_mysql_rec( - row_sel_fetch_last_buf(prebuilt), - prebuilt, result_rec, - result_rec != rec, - result_rec != rec ? clust_index : index, - offsets)) { - - /* Only fresh inserts may contain incomplete - externally stored columns. Pretend that such - records do not exist. Such records may only be - accessed at the READ UNCOMMITTED isolation - level or when rolling back a recovered - transaction. Rollback happens at a lower - level, not here. */ - goto next_rec; - } + if (!prebuilt->idx_cond) { - row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + /* We use next_buf to track the allocation of buffers + where we store and enqueue the buffers for our + pre-fetch optimisation. + + If next_buf == 0 then we store the converted record + directly into the MySQL record buffer (buf). If it is + != 0 then we allocate a pre-fetch buffer and store the + converted record there. + + If the conversion fails and the MySQL record buffer + was not written to then we reset next_buf so that + we can re-use the MySQL record buffer in the next + iteration. */ + + next_buf = next_buf + ? row_sel_fetch_last_buf(prebuilt) : buf; + + if (!row_sel_store_mysql_rec( + next_buf, prebuilt, result_rec, + result_rec != rec, + result_rec != rec ? clust_index : index, + offsets)) { + + if (next_buf == buf) { + ut_a(prebuilt->n_fetch_cached == 0); + next_buf = 0; + } + + /* Only fresh inserts may contain incomplete + externally stored columns. Pretend that such + records do not exist. Such records may only be + accessed at the READ UNCOMMITTED isolation + level or when rolling back a recovered + transaction. Rollback happens at a lower + level, not here. */ + goto next_rec; + } + + if (next_buf != buf) { + row_sel_enqueue_cache_row_for_mysql( + next_buf, prebuilt); + } + } else { + row_sel_enqueue_cache_row_for_mysql(buf, prebuilt); + } if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) { goto next_rec; } + } else { if (UNIV_UNLIKELY (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) { @@ -4892,7 +4866,7 @@ requires_clust_rec: rec_offs_size(offsets)); mach_write_to_4(buf, rec_offs_extra_size(offsets) + 4); - } else if (!prebuilt->idx_cond) { + } else if (!prebuilt->idx_cond && !prebuilt->innodb_api) { /* The record was not yet converted to MySQL format. 
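
The comment above describes the next_buf bookkeeping added to row_search_for_mysql(): the first successfully converted row is written straight into the caller's record buffer, later rows go into pre-fetch cache buffers, and a failed conversion simply skips the record, resetting next_buf only if the caller's buffer was never written. The normal_return code further below then treats a non-zero next_buf as "the MySQL buffer already holds a row". A compact stand-in sketch of that decision (the fixed-size cache and the convert callback are placeholders for prebuilt->fetch_cache[] and row_sel_store_mysql_rec(), not the real machinery):

        // Sketch of the next_buf bookkeeping shown above; a reading aid only.
        #include <array>
        #include <cstddef>
        #include <cstdint>
        #include <vector>

        using row_buf = std::vector<uint8_t>;

        struct prefetch_state {
                row_buf*                caller_buf = nullptr;  // MySQL's record buffer ("buf")
                row_buf*                next_buf   = nullptr;
                std::array<row_buf, 16> cache;                 // MYSQL_FETCH_CACHE_SIZE slots
                std::size_t             n_cached   = 0;
        };

        // Returns false if the row could not be converted (e.g. an incomplete
        // externally stored column); such records are skipped, as in the diff.
        bool store_row(prefetch_state& s, bool (*convert)(row_buf&)) {
                if (s.n_cached == s.cache.size()) {
                        return false;              // cache full; the real code stops pre-fetching
                }
                // First converted row goes straight to the caller's buffer;
                // later rows go to the next free fetch-cache slot.
                s.next_buf = s.next_buf ? &s.cache[s.n_cached] : s.caller_buf;

                if (!convert(*s.next_buf)) {
                        if (s.next_buf == s.caller_buf) {
                                s.next_buf = nullptr;   // caller's buffer still unused: reuse it
                        }
                        return false;
                }
                if (s.next_buf != s.caller_buf) {
                        ++s.n_cached;                   // enqueue the converted cache slot
                }
                return true;
        }
        // At "normal_return" the real code reports DB_SUCCESS whenever next_buf != 0,
        // i.e. whenever the caller's buffer was filled at least once.
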
*/ if (!row_sel_store_mysql_rec( buf, prebuilt, result_rec, @@ -4935,11 +4909,16 @@ idx_cond_failed: || !dict_index_is_clust(index) || direction != 0 || prebuilt->select_lock_type != LOCK_NONE - || prebuilt->used_in_HANDLER) { + || prebuilt->used_in_HANDLER + || prebuilt->innodb_api) { /* Inside an update always store the cursor position */ btr_pcur_store_position(pcur, &mtr); + + if (prebuilt->innodb_api) { + prebuilt->innodb_api_rec = result_rec; + } } goto normal_return; @@ -5032,7 +5011,7 @@ lock_table_wait: mtr_commit(&mtr); mtr_has_extra_clust_latch = FALSE; - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; /* The following is a patch for MySQL */ @@ -5101,8 +5080,23 @@ normal_return: mtr_commit(&mtr); - if (prebuilt->n_fetch_cached > 0) { - row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + if (prebuilt->idx_cond != 0) { + + /* When ICP is active we don't write to the MySQL buffer + directly, only to buffers that are enqueued in the pre-fetch + queue. We need to dequeue the first buffer and copy the contents + to the record buffer that was passed in by MySQL. */ + + if (prebuilt->n_fetch_cached > 0) { + row_sel_dequeue_cached_row_for_mysql(buf, prebuilt); + err = DB_SUCCESS; + } + + } else if (next_buf != 0) { + + /* We may or may not have enqueued some buffers to the + pre-fetch queue, but we definitely wrote to the record + buffer passed to use by MySQL. */ err = DB_SUCCESS; } @@ -5112,9 +5106,6 @@ normal_return: dict_index_name_print(stderr, index); fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */ #endif /* UNIV_SEARCH_DEBUG */ - if (err == DB_SUCCESS) { - srv_n_rows_read++; - } func_exit: trx->op_info = ""; @@ -5139,6 +5130,9 @@ func_exit: #ifdef UNIV_SYNC_DEBUG ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); #endif /* UNIV_SYNC_DEBUG */ + + DEBUG_SYNC_C("innodb_row_search_for_mysql_exit"); + return(err); } @@ -5157,7 +5151,22 @@ row_search_check_if_query_cache_permitted( dict_table_t* table; ibool ret = FALSE; - table = dict_table_open_on_name(norm_name, FALSE); + /* Disable query cache altogether for all tables if recovered XA + transactions in prepared state exist. This is because we do not + restore the table locks for those transactions and we may wrongly + set ret=TRUE above if "lock_table_get_n_locks(table) == 0". See + "Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH + QUERY CACHE ENABLED". + Read trx_sys->n_prepared_recovered_trx without mutex protection, + not possible to end up with a torn read since n_prepared_recovered_trx + is word size. */ + if (trx_sys->n_prepared_recovered_trx > 0) { + + return(FALSE); + } + + table = dict_table_open_on_name(norm_name, FALSE, FALSE, + DICT_ERR_IGNORE_NONE); if (table == NULL) { @@ -5191,7 +5200,7 @@ row_search_check_if_query_cache_permitted( } } - dict_table_close(table, FALSE); + dict_table_close(table, FALSE, FALSE); return(ret); } @@ -5229,8 +5238,6 @@ row_search_autoinc_read_column( data = rec_get_nth_field(rec, offsets, col_no, &len); - ut_a(len != UNIV_SQL_NULL); - switch (mtype) { case DATA_INT: ut_a(len <= sizeof value); @@ -5289,7 +5296,7 @@ Read the max AUTOINC value from an index. 
@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if column name can't be found in index */ UNIV_INTERN -ulint +dberr_t row_search_max_autoinc( /*===================*/ dict_index_t* index, /*!< in: index to search */ @@ -5299,7 +5306,7 @@ row_search_max_autoinc( ulint i; ulint n_cols; dict_field_t* dfield = NULL; - ulint error = DB_SUCCESS; + dberr_t error = DB_SUCCESS; n_cols = dict_index_get_n_ordering_defined_by_user(index); @@ -5321,10 +5328,9 @@ row_search_max_autoinc( mtr_start(&mtr); - /* Open at the high/right end (FALSE), and INIT - cursor (TRUE) */ + /* Open at the high/right end (false), and init cursor */ btr_pcur_open_at_index_side( - FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr); + false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr); if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) { const rec_t* rec; diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc index 78fd4ad5199..25b2b6b62ce 100644 --- a/storage/innobase/row/row0uins.cc +++ b/storage/innobase/row/row0uins.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -38,6 +38,7 @@ Created 2/25/1997 Heikki Tuuri #include "mach0data.h" #include "row0undo.h" #include "row0vers.h" +#include "row0log.h" #include "trx0trx.h" #include "trx0rec.h" #include "row0row.h" @@ -60,25 +61,64 @@ introduced where a call to log_free_check() is bypassed. */ Removes a clustered index record. The pcur in node was positioned on the record, now it is detached. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_clust_rec( /*==========================*/ undo_node_t* node) /*!< in: undo node */ { btr_cur_t* btr_cur; ibool success; - ulint err; - ulint n_tries = 0; + dberr_t err; + ulint n_tries = 0; mtr_t mtr; + dict_index_t* index = node->pcur.btr_cur.index; + bool online; + + ut_ad(dict_index_is_clust(index)); mtr_start(&mtr); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur), - &mtr); + /* This is similar to row_undo_mod_clust(). Even though we + call row_log_table_rollback() elsewhere, the DDL thread may + already have copied this row to the sort buffers or to the new + table. We must log the removal, so that the row will be + correctly purged. However, we can log the removal out of sync + with the B-tree modification. */ + + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->trx->dict_operation_lock_mode + != RW_X_LATCH); + ut_ad(node->table->id != DICT_INDEXES_ID); + mtr_s_lock(dict_index_get_lock(index), &mtr); + } + + success = btr_pcur_restore_position( + online + ? 
BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF, &node->pcur, &mtr); ut_a(success); + btr_cur = btr_pcur_get_btr_cur(&node->pcur); + + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index) + == node->trx->id); + + if (online && dict_index_is_online_ddl(index)) { + const rec_t* rec = btr_cur_get_rec(btr_cur); + mem_heap_t* heap = NULL; + const ulint* offsets = rec_get_offsets( + rec, index, NULL, ULINT_UNDEFINED, &heap); + row_log_table_delete( + rec, index, offsets, + trx_read_trx_id(row_get_trx_id_offset(index, offsets) + + rec)); + mem_heap_free(heap); + } + if (node->table->id == DICT_INDEXES_ID) { + ut_ad(!online); ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH); /* Drop the index tree associated with the row in @@ -90,14 +130,12 @@ row_undo_ins_remove_clust_rec( mtr_start(&mtr); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, - &(node->pcur), &mtr); + success = btr_pcur_restore_position( + BTR_MODIFY_LEAF, &node->pcur, &mtr); ut_a(success); } - btr_cur = btr_pcur_get_btr_cur(&(node->pcur)); - - if (btr_cur_optimistic_delete(btr_cur, &mtr)) { + if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) { err = DB_SUCCESS; goto func_exit; } @@ -111,7 +149,7 @@ retry: &(node->pcur), &mtr); ut_a(success); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, trx_is_recv(node->trx) ? RB_RECOVERY : RB_NORMAL, &mtr); @@ -142,8 +180,8 @@ func_exit: /***************************************************************//** Removes a secondary index entry if found. @return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_sec_low( /*========================*/ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, @@ -154,22 +192,31 @@ row_undo_ins_remove_sec_low( { btr_pcur_t pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err = DB_SUCCESS; mtr_t mtr; enum row_search_result search_result; + log_free_check(); + mtr_start(&mtr); - btr_cur = btr_pcur_get_btr_cur(&pcur); + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } - ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF); + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); switch (search_result) { case ROW_NOT_FOUND: - err = DB_SUCCESS; goto func_exit; case ROW_FOUND: break; @@ -181,23 +228,24 @@ row_undo_ins_remove_sec_low( ut_error; } - if (mode == BTR_MODIFY_LEAF) { - err = btr_cur_optimistic_delete(btr_cur, &mtr) + btr_cur = btr_pcur_get_btr_cur(&pcur); + + if (mode != BTR_MODIFY_TREE) { + err = btr_cur_optimistic_delete(btr_cur, 0, &mtr) ? DB_SUCCESS : DB_FAIL; } else { - ut_ad(mode == BTR_MODIFY_TREE); - /* No need to distinguish RB_RECOVERY here, because we are deleting a secondary index record: the distinction between RB_NORMAL and RB_RECOVERY only matters when deleting a record that contains externally stored columns. */ ut_ad(!dict_index_is_clust(index)); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, RB_NORMAL, &mtr); } func_exit: btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(err); @@ -207,14 +255,14 @@ func_exit: Removes a secondary index entry from the index if found. 
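
In the row0uins.cc hunks above, rolling back an insert now has to cooperate with online DDL in two places: the clustered-index removal is logged through row_log_table_delete() so that a concurrent table rebuild will purge the row from its copy, and row_undo_ins_remove_sec_low() first offers the delete to row_log_online_op_try(), only touching the secondary index B-tree when the operation was not queued to an online index build. A stand-in sketch of that second decision (the callables are placeholders for the InnoDB calls, not their real signatures):

        // Sketch of the "log it or do it" choice in row_undo_ins_remove_sec_low().
        #include <functional>

        enum class undo_sec_result { LOGGED_FOR_ONLINE_BUILD, DELETED, NOT_FOUND };

        undo_sec_result undo_sec_entry(
                const std::function<bool()>& log_online_op_try,  // true: queued to the row log
                const std::function<bool()>& find_entry,         // position a cursor on the entry
                const std::function<void()>& delete_entry)
        {
                if (log_online_op_try()) {
                        // An online CREATE INDEX will apply the delete when it
                        // catches up with its row log; no B-tree change needed here.
                        return undo_sec_result::LOGGED_FOR_ONLINE_BUILD;
                }
                if (!find_entry()) {
                        return undo_sec_result::NOT_FOUND;       // nothing left to roll back
                }
                delete_entry();
                return undo_sec_result::DELETED;
        }
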
Tries first optimistic, then pessimistic descent down the tree. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_sec( /*====================*/ dict_index_t* index, /*!< in: index */ dtuple_t* entry) /*!< in: index entry to insert */ { - ulint err; + dberr_t err; ulint n_tries = 0; /* Try first optimistic descent to the B-tree */ @@ -261,7 +309,7 @@ row_undo_ins_parse_undo_rec( table_id_t table_id; ulint type; ulint dummy; - ibool dummy_extern; + bool dummy_extern; ut_ad(node); @@ -271,12 +319,13 @@ row_undo_ins_parse_undo_rec( node->rec_type = type; node->update = NULL; - node->table = dict_table_open_on_id(table_id, dict_locked); + node->table = dict_table_open_on_id(table_id, dict_locked, FALSE); /* Skip the UNDO if we can't find the table or the .ibd file. */ if (UNIV_UNLIKELY(node->table == NULL)) { } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) { - dict_table_close(node->table, dict_locked); +close_table: + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; } else { clust_index = dict_table_get_first_index(node->table); @@ -286,10 +335,7 @@ row_undo_ins_parse_undo_rec( ptr, clust_index, &node->ref, node->heap); if (!row_undo_search_clust_to_pcur(node)) { - - dict_table_close(node->table, dict_locked); - - node->table = NULL; + goto close_table; } } else { @@ -299,10 +345,7 @@ row_undo_ins_parse_undo_rec( node->table->name); fprintf(stderr, " has no indexes, " "ignoring the table\n"); - - dict_table_close(node->table, dict_locked); - - node->table = NULL; + goto close_table; } } } @@ -310,27 +353,32 @@ row_undo_ins_parse_undo_rec( /***************************************************************//** Removes secondary index records. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_ins_remove_sec_rec( /*========================*/ undo_node_t* node) /*!< in/out: row undo node */ { - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; + dict_index_t* index = node->index; mem_heap_t* heap; heap = mem_heap_create(1024); - while (node->index != NULL) { + while (index != NULL) { dtuple_t* entry; - if (node->index->type & DICT_FTS) { - dict_table_next_uncorrupted_index(node->index); + if (index->type & DICT_FTS) { + dict_table_next_uncorrupted_index(index); continue; } - entry = row_build_index_entry(node->row, node->ext, - node->index, heap); + /* An insert undo record TRX_UNDO_INSERT_REC will + always contain all fields of the index. It does not + matter if any indexes were created afterwards; all + index entries can be reconstructed from the row. */ + entry = row_build_index_entry( + node->row, node->ext, index, heap); if (UNIV_UNLIKELY(!entry)) { /* The database must have crashed after inserting a clustered index record but before @@ -343,9 +391,7 @@ row_undo_ins_remove_sec_rec( transactions. */ ut_a(trx_is_recv(node->trx)); } else { - log_free_check(); - - err = row_undo_ins_remove_sec(node->index, entry); + err = row_undo_ins_remove_sec(index, entry); if (UNIV_UNLIKELY(err != DB_SUCCESS)) { goto func_exit; @@ -353,10 +399,11 @@ row_undo_ins_remove_sec_rec( } mem_heap_empty(heap); - dict_table_next_uncorrupted_index(node->index); + dict_table_next_uncorrupted_index(index); } func_exit: + node->index = index; mem_heap_free(heap); return(err); } @@ -369,15 +416,14 @@ if it figures out that an index record will be removed in the purge anyway, it will remove it in the rollback. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ UNIV_INTERN -ulint +dberr_t row_undo_ins( /*=========*/ undo_node_t* node) /*!< in: row undo node */ { - ulint err; - ibool dict_locked; + dberr_t err; + ibool dict_locked; - ut_ad(node); ut_ad(node->state == UNDO_NODE_INSERT); dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH; @@ -392,24 +438,46 @@ row_undo_ins( /* Iterate over all the indexes and undo the insert.*/ + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + if (dict_index_is_online_ddl(node->index)) { + /* Note that we are rolling back this transaction, so + that all inserts and updates with this DB_TRX_ID can + be skipped. */ + row_log_table_rollback(node->index, node->trx->id); + } + /* Skip the clustered index (the first index) */ - node->index = dict_table_get_next_index( - dict_table_get_first_index(node->table)); + node->index = dict_table_get_next_index(node->index); dict_table_skip_corrupt_index(node->index); err = row_undo_ins_remove_sec_rec(node); - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - goto func_exit; - } + if (err == DB_SUCCESS) { - log_free_check(); + log_free_check(); - err = row_undo_ins_remove_clust_rec(node); + if (node->table->id == DICT_INDEXES_ID) { -func_exit: - dict_table_close(node->table, dict_locked); + if (!dict_locked) { + mutex_enter(&dict_sys->mutex); + } + } + + // FIXME: We need to update the dict_index_t::space and + // page number fields too. + err = row_undo_ins_remove_clust_rec(node); + + if (node->table->id == DICT_INDEXES_ID + && !dict_locked) { + + mutex_exit(&dict_sys->mutex); + } + } + + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 42034c5b80d..c1a4ba76052 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -37,6 +37,7 @@ Created 2/27/1997 Heikki Tuuri #include "mach0data.h" #include "row0undo.h" #include "row0vers.h" +#include "row0log.h" #include "trx0trx.h" #include "trx0rec.h" #include "row0row.h" @@ -71,11 +72,20 @@ introduced where a call to log_free_check() is bypassed. */ /***********************************************************//** Undoes a modify in a clustered index record. 
@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_clust_low( /*===================*/ undo_node_t* node, /*!< in: row undo node */ + ulint** offsets,/*!< out: rec_get_offsets() on the record */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + const dtuple_t**rebuilt_old_pk, + /*!< out: row_log_table_get_pk() + before the update, or NULL if + the table is not being rebuilt online or + the PRIMARY KEY definition does not change */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr, /*!< in: mtr; must be committed before latching any further pages */ @@ -83,12 +93,12 @@ row_undo_mod_clust_low( { btr_pcur_t* pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err; #ifdef UNIV_DEBUG ibool success; #endif /* UNIV_DEBUG */ - pcur = &(node->pcur); + pcur = &node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); #ifdef UNIV_DEBUG @@ -97,31 +107,40 @@ row_undo_mod_clust_low( btr_pcur_restore_position(mode, pcur, mtr); ut_ad(success); + ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur)) + == thr_get_trx(thr)->id); + + if (mode != BTR_MODIFY_LEAF + && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) { + *rebuilt_old_pk = row_log_table_get_pk( + btr_cur_get_rec(btr_cur), + btr_cur_get_index(btr_cur), NULL, &heap); + } else { + *rebuilt_old_pk = NULL; + } - if (mode == BTR_MODIFY_LEAF) { + if (mode != BTR_MODIFY_TREE) { + ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF); - err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG - | BTR_NO_UNDO_LOG_FLAG - | BTR_KEEP_SYS_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = btr_cur_optimistic_update( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG + | BTR_KEEP_SYS_FLAG, + btr_cur, offsets, offsets_heap, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); } else { - mem_heap_t* heap = NULL; big_rec_t* dummy_big_rec; - ut_ad(mode == BTR_MODIFY_TREE); - err = btr_cur_pessimistic_update( BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG, - btr_cur, &heap, &dummy_big_rec, node->update, - node->cmpl_info, thr, mtr); + btr_cur, offsets, offsets_heap, heap, + &dummy_big_rec, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); ut_a(!dummy_big_rec); - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } } return(err); @@ -134,8 +153,8 @@ delete-marked record and there no longer exist transactions that would see the delete-marked record. In other words, we roll back the insert by purging the record. @return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_remove_clust_low( /*==========================*/ undo_node_t* node, /*!< in: row undo node */ @@ -144,7 +163,7 @@ row_undo_mod_remove_clust_low( ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ { btr_cur_t* btr_cur; - ulint err; + dberr_t err; ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); @@ -159,8 +178,14 @@ row_undo_mod_remove_clust_low( btr_cur = btr_pcur_get_btr_cur(&node->pcur); + /* We are about to remove an old, delete-marked version of the + record that may have been delete-marked by a different transaction + than the rolling-back one. 
*/ + ut_ad(rec_get_deleted_flag(btr_cur_get_rec(btr_cur), + dict_table_is_comp(node->table))); + if (mode == BTR_MODIFY_LEAF) { - err = btr_cur_optimistic_delete(btr_cur, mtr) + err = btr_cur_optimistic_delete(btr_cur, 0, mtr) ? DB_SUCCESS : DB_FAIL; } else { @@ -169,7 +194,7 @@ row_undo_mod_remove_clust_low( /* This operation is analogous to purge, we can free also inherited externally stored fields */ - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, thr_is_recv(thr) ? RB_RECOVERY_PURGE_REC : RB_NONE, mtr); @@ -186,8 +211,8 @@ row_undo_mod_remove_clust_low( Undoes a modify in a clustered index record. Sets also the node state for the next round of undo. @return DB_SUCCESS or error code: we may run out of file space */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_clust( /*===============*/ undo_node_t* node, /*!< in: row undo node */ @@ -195,21 +220,42 @@ row_undo_mod_clust( { btr_pcur_t* pcur; mtr_t mtr; - ulint err; + dberr_t err; + dict_index_t* index; + bool online; - ut_ad(node && thr); + ut_ad(thr_get_trx(thr) == node->trx); + ut_ad(node->trx->dict_operation_lock_mode); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED) + || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ log_free_check(); + pcur = &node->pcur; + index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur)); + mtr_start(&mtr); - pcur = &(node->pcur); + online = dict_index_is_online_ddl(index); + if (online) { + ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH); + mtr_s_lock(dict_index_get_lock(index), &mtr); + } - mtr_start(&mtr); + mem_heap_t* heap = mem_heap_create(1024); + mem_heap_t* offsets_heap = NULL; + ulint* offsets = NULL; + const dtuple_t* rebuilt_old_pk; /* Try optimistic processing of the record, keeping changes within the index page */ - err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF); + err = row_undo_mod_clust_low(node, &offsets, &offsets_heap, + heap, &rebuilt_old_pk, + thr, &mtr, online + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF); if (err != DB_SUCCESS) { btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -219,7 +265,40 @@ row_undo_mod_clust( mtr_start(&mtr); - err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + err = row_undo_mod_clust_low( + node, &offsets, &offsets_heap, heap, &rebuilt_old_pk, + thr, &mtr, BTR_MODIFY_TREE); + ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE); + } + + /* Online rebuild cannot be initiated while we are holding + dict_operation_lock and index->lock. (It can be aborted.) 
*/ + ut_ad(online || !dict_index_is_online_ddl(index)); + + if (err == DB_SUCCESS && online) { +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED) + || rw_lock_own(&index->lock, RW_LOCK_EX)); +#endif /* UNIV_SYNC_DEBUG */ + switch (node->rec_type) { + case TRX_UNDO_DEL_MARK_REC: + row_log_table_insert( + btr_pcur_get_rec(pcur), index, offsets); + break; + case TRX_UNDO_UPD_EXIST_REC: + row_log_table_update( + btr_pcur_get_rec(pcur), index, offsets, + rebuilt_old_pk); + break; + case TRX_UNDO_UPD_DEL_REC: + row_log_table_delete( + btr_pcur_get_rec(pcur), index, offsets, + node->trx->id); + break; + default: + ut_ad(0); + break; + } } btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -228,8 +307,11 @@ row_undo_mod_clust( mtr_start(&mtr); - err = row_undo_mod_remove_clust_low(node, thr, &mtr, - BTR_MODIFY_LEAF); + /* It is not necessary to call row_log_table, + because the record is delete-marked and would thus + be omitted from the rebuilt copy of the table. */ + err = row_undo_mod_remove_clust_low( + node, thr, &mtr, BTR_MODIFY_LEAF); if (err != DB_SUCCESS) { btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -240,6 +322,9 @@ row_undo_mod_clust( err = row_undo_mod_remove_clust_low(node, thr, &mtr, BTR_MODIFY_TREE); + + ut_ad(err == DB_SUCCESS + || err == DB_OUT_OF_FILE_SPACE); } btr_pcur_commit_specify_mtr(pcur, &mtr); @@ -249,14 +334,18 @@ row_undo_mod_clust( trx_undo_rec_release(node->trx, node->undo_no); + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + mem_heap_free(heap); return(err); } /***********************************************************//** Delete marks or removes a secondary index entry if found. @return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_mark_or_remove_sec_low( /*====================================*/ undo_node_t* node, /*!< in: row undo node */ @@ -270,7 +359,7 @@ row_undo_mod_del_mark_or_remove_sec_low( btr_cur_t* btr_cur; ibool success; ibool old_has; - ulint err; + dberr_t err = DB_SUCCESS; mtr_t mtr; mtr_t mtr_vers; enum row_search_result search_result; @@ -278,9 +367,30 @@ row_undo_mod_del_mark_or_remove_sec_low( log_free_check(); mtr_start(&mtr); - btr_cur = btr_pcur_get_btr_cur(&pcur); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, 0)) { + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } - ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF); + btr_cur = btr_pcur_get_btr_cur(&pcur); search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); @@ -296,8 +406,6 @@ row_undo_mod_del_mark_or_remove_sec_low( In normal processing, if an update ends in a deadlock before it has inserted all updated secondary index records, then the undo will not find those records. 
*/ - - err = DB_SUCCESS; goto func_exit; case ROW_FOUND: break; @@ -329,16 +437,14 @@ row_undo_mod_del_mark_or_remove_sec_low( } else { /* Remove the index record */ - if (mode == BTR_MODIFY_LEAF) { - success = btr_cur_optimistic_delete(btr_cur, &mtr); + if (mode != BTR_MODIFY_TREE) { + success = btr_cur_optimistic_delete(btr_cur, 0, &mtr); if (success) { err = DB_SUCCESS; } else { err = DB_FAIL; } } else { - ut_ad(mode == BTR_MODIFY_TREE); - /* No need to distinguish RB_RECOVERY_PURGE here, because we are deleting a secondary index record: the distinction between RB_NORMAL and @@ -346,7 +452,7 @@ row_undo_mod_del_mark_or_remove_sec_low( record that contains externally stored columns. */ ut_ad(!dict_index_is_clust(index)); - btr_cur_pessimistic_delete(&err, FALSE, btr_cur, + btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0, RB_NORMAL, &mtr); /* The delete operation may fail if we have little @@ -359,6 +465,7 @@ row_undo_mod_del_mark_or_remove_sec_low( func_exit: btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(err); @@ -373,8 +480,8 @@ not cause problems because in row0sel.cc, in queries we always retrieve the clustered index record or an earlier version of it, if the secondary index record through which we do the search is delete-marked. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_mark_or_remove_sec( /*================================*/ undo_node_t* node, /*!< in: row undo node */ @@ -382,7 +489,7 @@ row_undo_mod_del_mark_or_remove_sec( dict_index_t* index, /*!< in: index */ dtuple_t* entry) /*!< in: index entry */ { - ulint err; + dberr_t err; err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index, entry, BTR_MODIFY_LEAF); @@ -401,42 +508,67 @@ Delete unmarks a secondary index entry which must be found. It might not be delete-marked at the moment, but it does not harm to unmark it anyway. We also need to update the fields of the secondary index record if we updated its fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'. -@return DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +@retval DB_SUCCESS on success +@retval DB_FAIL if BTR_MODIFY_TREE should be tried +@retval DB_OUT_OF_FILE_SPACE when running out of tablespace +@retval DB_DUPLICATE_KEY if the value was missing + and an insert would lead to a duplicate exists */ +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_unmark_sec_and_undo_update( /*========================================*/ ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */ que_thr_t* thr, /*!< in: query thread */ dict_index_t* index, /*!< in: index */ - const dtuple_t* entry) /*!< in: index entry */ + dtuple_t* entry) /*!< in: index entry */ { - mem_heap_t* heap; btr_pcur_t pcur; - btr_cur_t* btr_cur; + btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur); upd_t* update; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; big_rec_t* dummy_big_rec; mtr_t mtr; trx_t* trx = thr_get_trx(thr); + const ulint flags + = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG; enum row_search_result search_result; - /* Ignore indexes that are being created. 
*/ - if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) { - - return(DB_SUCCESS); - } + ut_ad(trx->id); log_free_check(); mtr_start(&mtr); - ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + if (mode == BTR_MODIFY_LEAF) { + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + ut_ad(mode == BTR_MODIFY_TREE); + mtr_x_lock(dict_index_get_lock(index), &mtr); + } + + if (row_log_online_op_try(index, entry, trx->id)) { + goto func_exit_no_pcur; + } + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. */ + ut_ad(!dict_index_is_online_ddl(index)); + } search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); switch (search_result) { + mem_heap_t* heap; + mem_heap_t* offsets_heap; + ulint* offsets; case ROW_BUFFERED: case ROW_NOT_DELETED_REF: /* These are invalid outcomes, because the mode passed @@ -444,81 +576,184 @@ row_undo_mod_del_unmark_sec_and_undo_update( flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */ ut_error; case ROW_NOT_FOUND: - fputs("InnoDB: error in sec index entry del undo in\n" - "InnoDB: ", stderr); - dict_index_name_print(stderr, trx, index); - fputs("\n" - "InnoDB: tuple ", stderr); - dtuple_print(stderr, entry); - fputs("\n" - "InnoDB: record ", stderr); - rec_print(stderr, btr_pcur_get_rec(&pcur), index); - putc('\n', stderr); - trx_print(stderr, trx, 0); - fputs("\n" - "InnoDB: Submit a detailed bug report" - " to http://bugs.mysql.com\n", stderr); - ut_ad(0); + if (*index->name != TEMP_INDEX_PREFIX) { + /* During online secondary index creation, it + is possible that MySQL is waiting for a + meta-data lock upgrade before invoking + ha_innobase::commit_inplace_alter_table() + while this ROLLBACK is executing. InnoDB has + finished building the index, but it does not + yet exist in MySQL. In this case, we suppress + the printout to the error log. */ + fputs("InnoDB: error in sec index entry del undo in\n" + "InnoDB: ", stderr); + dict_index_name_print(stderr, trx, index); + fputs("\n" + "InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, btr_pcur_get_rec(&pcur), index); + putc('\n', stderr); + trx_print(stderr, trx, 0); + fputs("\n" + "InnoDB: Submit a detailed bug report" + " to http://bugs.mysql.com\n", stderr); + + ib_logf(IB_LOG_LEVEL_WARN, + "record in index %s was not found" + " on rollback, trying to insert", + index->name); + } + + if (btr_cur->up_match >= dict_index_get_n_unique(index) + || btr_cur->low_match >= dict_index_get_n_unique(index)) { + if (*index->name != TEMP_INDEX_PREFIX) { + ib_logf(IB_LOG_LEVEL_WARN, + "record in index %s was not found on" + " rollback, and a duplicate exists", + index->name); + } + err = DB_DUPLICATE_KEY; + break; + } + + /* Insert the missing record that we were trying to + delete-unmark. 
*/ + big_rec_t* big_rec; + rec_t* insert_rec; + offsets = NULL; + offsets_heap = NULL; + + err = btr_cur_optimistic_insert( + flags, btr_cur, &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + ut_ad(!big_rec); + + if (err == DB_FAIL && mode == BTR_MODIFY_TREE) { + err = btr_cur_pessimistic_insert( + flags, btr_cur, + &offsets, &offsets_heap, + entry, &insert_rec, &big_rec, + 0, thr, &mtr); + /* There are no off-page columns in + secondary indexes. */ + ut_ad(!big_rec); + } + + if (err == DB_SUCCESS) { + page_update_max_trx_id( + btr_cur_get_block(btr_cur), + btr_cur_get_page_zip(btr_cur), + trx->id, &mtr); + } + + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + break; case ROW_FOUND: - btr_cur = btr_pcur_get_btr_cur(&pcur); - err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG, - btr_cur, FALSE, thr, &mtr); + err = btr_cur_del_mark_set_sec_rec( + BTR_NO_LOCKING_FLAG, + btr_cur, FALSE, thr, &mtr); ut_a(err == DB_SUCCESS); - heap = mem_heap_create(100); - + heap = mem_heap_create( + sizeof(upd_t) + + dtuple_get_n_fields(entry) * sizeof(upd_field_t)); + offsets_heap = NULL; + offsets = rec_get_offsets( + btr_cur_get_rec(btr_cur), + index, NULL, ULINT_UNDEFINED, &offsets_heap); update = row_upd_build_sec_rec_difference_binary( - index, entry, btr_cur_get_rec(btr_cur), trx, heap); + btr_cur_get_rec(btr_cur), index, offsets, entry, heap); if (upd_get_n_fields(update) == 0) { /* Do nothing */ - } else if (mode == BTR_MODIFY_LEAF) { + } else if (mode != BTR_MODIFY_TREE) { /* Try an optimistic updating of the record, keeping changes within the page */ + /* TODO: pass offsets, not &offsets */ err = btr_cur_optimistic_update( - BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, - btr_cur, update, 0, thr, &mtr); + flags, btr_cur, &offsets, &offsets_heap, + update, 0, thr, thr_get_trx(thr)->id, &mtr); switch (err) { case DB_OVERFLOW: case DB_UNDERFLOW: case DB_ZIP_OVERFLOW: err = DB_FAIL; + default: + break; } } else { - ut_a(mode == BTR_MODIFY_TREE); err = btr_cur_pessimistic_update( - BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG, - btr_cur, &heap, &dummy_big_rec, - update, 0, thr, &mtr); + flags, btr_cur, &offsets, &offsets_heap, + heap, &dummy_big_rec, + update, 0, thr, thr_get_trx(thr)->id, &mtr); ut_a(!dummy_big_rec); } mem_heap_free(heap); + mem_heap_free(offsets_heap); } btr_pcur_close(&pcur); +func_exit_no_pcur: mtr_commit(&mtr); return(err); } /***********************************************************//** +Flags a secondary index corrupted. */ +static __attribute__((nonnull)) +void +row_undo_mod_sec_flag_corrupted( +/*============================*/ + trx_t* trx, /*!< in/out: transaction */ + dict_index_t* index) /*!< in: secondary index */ +{ + ut_ad(!dict_index_is_clust(index)); + + switch (trx->dict_operation_lock_mode) { + case RW_S_LATCH: + /* Because row_undo() is holding an S-latch + on the data dictionary during normal rollback, + we can only mark the index corrupted in the + data dictionary cache. TODO: fix this somehow.*/ + mutex_enter(&dict_sys->mutex); + dict_set_corrupted_index_cache_only(index, index->table); + mutex_exit(&dict_sys->mutex); + break; + default: + ut_ad(0); + /* fall through */ + case RW_X_LATCH: + /* This should be the rollback of a data dictionary + transaction. */ + dict_set_corrupted(index, trx, "rollback"); + } +} + +/***********************************************************//** Undoes a modify in secondary indexes when undo record type is UPD_DEL. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_upd_del_sec( /*=====================*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { mem_heap_t* heap; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); ut_ad(!node->undo_row); + heap = mem_heap_create(1024); while (node->index != NULL) { @@ -530,6 +765,13 @@ row_undo_mod_upd_del_sec( continue; } + /* During online index creation, + HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should + guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ entry = row_build_index_entry( node->row, node->ext, index, heap); @@ -566,15 +808,15 @@ row_undo_mod_upd_del_sec( /***********************************************************//** Undoes a modify in secondary indexes when undo record type is DEL_MARK. @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_del_mark_sec( /*======================*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { mem_heap_t* heap; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(!node->undo_row); @@ -589,6 +831,13 @@ row_undo_mod_del_mark_sec( continue; } + /* During online index creation, + HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should + guarantee that any active transaction has not modified + indexed columns such that col->ord_part was 0 at the + time when the undo log record was written. When we get + to roll back an undo log entry TRX_UNDO_DEL_MARK_REC, + it should always cover all affected indexes. */ entry = row_build_index_entry( node->row, node->ext, index, heap); @@ -601,8 +850,17 @@ row_undo_mod_del_mark_sec( BTR_MODIFY_TREE, thr, index, entry); } - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - + if (err == DB_DUPLICATE_KEY) { + row_undo_mod_sec_flag_corrupted( + thr_get_trx(thr), index); + err = DB_SUCCESS; + /* Do not return any error to the caller. The + duplicate will be reported by ALTER TABLE or + CREATE UNIQUE INDEX. Unfortunately we cannot + report the duplicate key value to the DDL + thread, because the altered_table object is + private to its call stack. */ + } else if (err != DB_SUCCESS) { break; } @@ -618,18 +876,18 @@ row_undo_mod_del_mark_sec( /***********************************************************//** Undoes a modify in secondary indexes when undo record type is UPD_EXIST. 
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo_mod_upd_exist_sec( /*=======================*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { mem_heap_t* heap; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; if (node->index == NULL - || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { + || ((node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) { /* No change in secondary indexes */ return(err); @@ -715,7 +973,11 @@ row_undo_mod_upd_exist_sec( BTR_MODIFY_TREE, thr, index, entry); } - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + if (err == DB_DUPLICATE_KEY) { + row_undo_mod_sec_flag_corrupted( + thr_get_trx(thr), index); + err = DB_SUCCESS; + } else if (err != DB_SUCCESS) { break; } @@ -730,12 +992,11 @@ row_undo_mod_upd_exist_sec( /***********************************************************//** Parses the row reference and other info in a modify undo log record. */ -static +static __attribute__((nonnull)) void row_undo_mod_parse_undo_rec( /*========================*/ undo_node_t* node, /*!< in: row undo node */ - que_thr_t* thr, /*!< in: query thread */ ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */ { dict_index_t* clust_index; @@ -747,16 +1008,13 @@ row_undo_mod_parse_undo_rec( ulint info_bits; ulint type; ulint cmpl_info; - ibool dummy_extern; - trx_t* trx; + bool dummy_extern; - ut_ad(node && thr); - trx = thr_get_trx(thr); ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info, &dummy_extern, &undo_no, &table_id); node->rec_type = type; - node->table = dict_table_open_on_id(table_id, dict_locked); + node->table = dict_table_open_on_id(table_id, dict_locked, FALSE); /* TODO: other fixes associated with DROP TABLE + rollback in the same table by another user */ @@ -767,7 +1025,7 @@ row_undo_mod_parse_undo_rec( } if (node->table->ibd_file_missing) { - dict_table_close(node->table, dict_locked); + dict_table_close(node->table, dict_locked, FALSE); /* We skip undo operations to missing .ibd files */ node->table = NULL; @@ -784,14 +1042,14 @@ row_undo_mod_parse_undo_rec( node->heap); trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id, - roll_ptr, info_bits, trx, + roll_ptr, info_bits, node->trx, node->heap, &(node->update)); node->new_trx_id = trx_id; node->cmpl_info = cmpl_info; if (!row_undo_search_clust_to_pcur(node)) { - dict_table_close(node->table, dict_locked); + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; } @@ -801,21 +1059,23 @@ row_undo_mod_parse_undo_rec( Undoes a modify operation on a row of a table. 
@return DB_SUCCESS or error code */ UNIV_INTERN -ulint +dberr_t row_undo_mod( /*=========*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; - ibool dict_locked; + dberr_t err; + ibool dict_locked; ut_ad(node && thr); ut_ad(node->state == UNDO_NODE_MODIFY); dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH; - row_undo_mod_parse_undo_rec(node, thr, dict_locked); + ut_ad(thr_get_trx(thr) == node->trx); + + row_undo_mod_parse_undo_rec(node, dict_locked); if (node->table == NULL) { /* It is already undone, or will be undone by another query @@ -827,8 +1087,18 @@ row_undo_mod( return(DB_SUCCESS); } - node->index = dict_table_get_next_index( - dict_table_get_first_index(node->table)); + node->index = dict_table_get_first_index(node->table); + ut_ad(dict_index_is_clust(node->index)); + + if (dict_index_is_online_ddl(node->index)) { + /* Note that we are rolling back this transaction, so + that all inserts and updates with this DB_TRX_ID can + be skipped. */ + row_log_table_rollback(node->index, node->trx->id); + } + + /* Skip the clustered index (the first index) */ + node->index = dict_table_get_next_index(node->index); /* Skip all corrupted secondary index */ dict_table_skip_corrupt_index(node->index); @@ -853,7 +1123,7 @@ row_undo_mod( err = row_undo_mod_clust(node, thr); } - dict_table_close(node->table, dict_locked); + dict_table_close(node->table, dict_locked, FALSE); node->table = NULL; diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc index a73f858599d..9977a1e8f04 100644 --- a/storage/innobase/row/row0undo.cc +++ b/storage/innobase/row/row0undo.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -216,7 +216,8 @@ row_undo_search_clust_to_pcur( } node->row = row_build(ROW_COPY_DATA, clust_index, rec, - offsets, NULL, ext, node->heap); + offsets, NULL, + NULL, NULL, ext, node->heap); if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) { node->undo_row = dtuple_copy(node->row, node->heap); row_upd_replace(node->undo_row, &node->undo_ext, @@ -244,14 +245,14 @@ Fetches an undo log record and does the undo for the recorded operation. If none left, or a partial rollback completed, returns control to the parent node, which is always a query thread node. 
@return DB_SUCCESS if operation successfully completed, else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_undo( /*=====*/ undo_node_t* node, /*!< in: row undo node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; trx_t* trx; roll_ptr_t roll_ptr; ibool locked_data_dict; @@ -332,7 +333,7 @@ row_undo_step( /*==========*/ que_thr_t* thr) /*!< in: query thread */ { - ulint err; + dberr_t err; undo_node_t* node; trx_t* trx; @@ -348,17 +349,17 @@ row_undo_step( err = row_undo(node, thr); - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { /* SQL error detected */ - fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", - (ulong) err); + fprintf(stderr, "InnoDB: Fatal error (%s) in rollback.\n", + ut_strerr(err)); if (err == DB_OUT_OF_FILE_SPACE) { fprintf(stderr, - "InnoDB: Error 13 means out of tablespace.\n" + "InnoDB: Out of tablespace.\n" "InnoDB: Consider increasing" " your tablespace.\n"); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 28faa59add8..f97c0c3c82b 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -23,14 +23,13 @@ Update of a row Created 12/27/1996 Heikki Tuuri *******************************************************/ -#include "m_string.h" /* for my_sys.h */ -#include "my_sys.h" /* DEBUG_SYNC_C */ #include "row0upd.h" #ifdef UNIV_NONINL #include "row0upd.ic" #endif +#include "ha_prototypes.h" #include "dict0dict.h" #include "trx0undo.h" #include "rem0rec.h" @@ -43,8 +42,9 @@ Created 12/27/1996 Heikki Tuuri #include "que0que.h" #include "row0ext.h" #include "row0ins.h" -#include "row0sel.h" +#include "row0log.h" #include "row0row.h" +#include "row0sel.h" #include "rem0cmp.h" #include "lock0lock.h" #include "log0log.h" @@ -178,8 +178,8 @@ NOTE that this function will temporarily commit mtr and lose the pcur position! 
@return DB_SUCCESS or an error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_check_references_constraints( /*=================================*/ upd_node_t* node, /*!< in: row update node */ @@ -197,7 +197,7 @@ row_upd_check_references_constraints( trx_t* trx; const rec_t* rec; ulint n_ext; - ulint err; + dberr_t err; ibool got_s_lock = FALSE; if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) { @@ -212,11 +212,12 @@ row_upd_check_references_constraints( heap = mem_heap_create(500); - entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets, - &n_ext, heap); + entry = row_rec_to_index_entry(rec, index, offsets, &n_ext, heap); mtr_commit(mtr); + DEBUG_SYNC_C("foreign_constraint_check_for_update"); + mtr_start(mtr); if (trx->dict_operation_lock_mode == 0) { @@ -225,6 +226,7 @@ row_upd_check_references_constraints( row_mysql_freeze_data_dictionary(trx); } +run_again: foreign = UT_LIST_GET_FIRST(table->referenced_list); while (foreign) { @@ -238,18 +240,20 @@ row_upd_check_references_constraints( || row_upd_changes_first_fields_binary( entry, index, node->update, foreign->n_fields))) { + dict_table_t* foreign_table = foreign->foreign_table; dict_table_t* ref_table = NULL; - if (foreign->foreign_table == NULL) { + if (foreign_table == NULL) { ref_table = dict_table_open_on_name( - foreign->foreign_table_name_lookup, FALSE); + foreign->foreign_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); } - if (foreign->foreign_table) { + if (foreign_table) { os_inc_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } @@ -261,18 +265,20 @@ row_upd_check_references_constraints( err = row_ins_check_foreign_constraint( FALSE, foreign, table, entry, thr); - if (foreign->foreign_table) { + if (foreign_table) { os_dec_counter(dict_sys->mutex, - foreign->foreign_table + foreign_table ->n_foreign_key_checks_running); } if (ref_table != NULL) { - dict_table_close(ref_table, FALSE); + dict_table_close(ref_table, FALSE, FALSE); } - if (err != DB_SUCCESS) { - + /* Some table foreign key dropped, try again */ + if (err == DB_DICT_CHANGED) { + goto run_again; + } else if (err != DB_SUCCESS) { goto func_exit; } } @@ -289,6 +295,8 @@ func_exit: mem_heap_free(heap); + DEBUG_SYNC_C("foreign_constraint_check_for_update_done"); + return(err); } @@ -465,6 +473,47 @@ row_upd_changes_field_size_or_external( return(FALSE); } + +/***********************************************************//** +Returns true if row update contains disowned external fields. +@return true if the update contains disowned external fields. 
*/ +UNIV_INTERN +bool +row_upd_changes_disowned_external( +/*==============================*/ + const upd_t* update) /*!< in: update vector */ +{ + const upd_field_t* upd_field; + const dfield_t* new_val; + ulint new_len; + ulint n_fields; + ulint i; + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const byte* field_ref; + + upd_field = upd_get_nth_field(update, i); + new_val = &(upd_field->new_val); + new_len = dfield_get_len(new_val); + + if (!dfield_is_ext(new_val)) { + continue; + } + + ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE); + + field_ref = static_cast<const byte*>(dfield_get_data(new_val)) + + new_len - BTR_EXTERN_FIELD_REF_SIZE; + + if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) { + return(true); + } + } + + return(false); +} #endif /* !UNIV_HOTBACKUP */ /***********************************************************//** @@ -560,7 +609,7 @@ byte* row_upd_write_sys_vals_to_log( /*==========================*/ dict_index_t* index, /*!< in: clustered index */ - trx_t* trx, /*!< in: transaction */ + trx_id_t trx_id, /*!< in: transaction id */ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened in mlog */ @@ -576,7 +625,7 @@ row_upd_write_sys_vals_to_log( trx_write_roll_ptr(log_ptr, roll_ptr); log_ptr += DATA_ROLL_PTR_LEN; - log_ptr += mach_ull_write_compressed(log_ptr, trx->id); + log_ptr += mach_ull_write_compressed(log_ptr, trx_id); return(log_ptr); } @@ -779,10 +828,10 @@ UNIV_INTERN upd_t* row_upd_build_sec_rec_difference_binary( /*====================================*/ + const rec_t* rec, /*!< in: secondary index record */ dict_index_t* index, /*!< in: index */ + const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */ const dtuple_t* entry, /*!< in: entry to insert */ - const rec_t* rec, /*!< in: secondary index record */ - trx_t* trx, /*!< in: transaction */ mem_heap_t* heap) /*!< in: memory heap from which allocated */ { upd_field_t* upd_field; @@ -792,18 +841,16 @@ row_upd_build_sec_rec_difference_binary( upd_t* update; ulint n_diff; ulint i; - ulint offsets_[REC_OFFS_SMALL_SIZE]; - const ulint* offsets; - rec_offs_init(offsets_); /* This function is used only for a secondary index */ ut_a(!dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry)); + ut_ad(!rec_offs_any_extern(offsets)); update = upd_create(dtuple_get_n_fields(entry), heap); n_diff = 0; - offsets = rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap); for (i = 0; i < dtuple_get_n_fields(entry); i++) { @@ -828,7 +875,7 @@ row_upd_build_sec_rec_difference_binary( dfield_copy(&(upd_field->new_val), dfield); - upd_field_set_field_no(upd_field, i, index, trx); + upd_field_set_field_no(upd_field, i, index, NULL); n_diff++; } @@ -846,12 +893,15 @@ the equal ordering fields. NOTE: we compare the fields as binary strings! 
@return own: update vector of differing fields, excluding roll ptr and trx id */ UNIV_INTERN -upd_t* +const upd_t* row_upd_build_difference_binary( /*============================*/ dict_index_t* index, /*!< in: clustered index */ const dtuple_t* entry, /*!< in: entry to insert */ const rec_t* rec, /*!< in: clustered index record */ + const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */ + bool no_sys, /*!< in: skip the system columns + DB_TRX_ID and DB_ROLL_PTR */ trx_t* trx, /*!< in: transaction */ mem_heap_t* heap) /*!< in: memory heap from which allocated */ { @@ -861,11 +911,9 @@ row_upd_build_difference_binary( ulint len; upd_t* update; ulint n_diff; - ulint roll_ptr_pos; ulint trx_id_pos; ulint i; ulint offsets_[REC_OFFS_NORMAL_SIZE]; - const ulint* offsets; rec_offs_init(offsets_); /* This function is used only for a clustered index */ @@ -875,11 +923,16 @@ row_upd_build_difference_binary( n_diff = 0; - roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); + ut_ad(dict_index_get_sys_col_pos(index, DATA_ROLL_PTR) + == trx_id_pos + 1); - offsets = rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap); + if (!offsets) { + offsets = rec_get_offsets(rec, index, offsets_, + ULINT_UNDEFINED, &heap); + } else { + ut_ad(rec_offs_validate(rec, index, offsets)); + } for (i = 0; i < dtuple_get_n_fields(entry); i++) { @@ -890,9 +943,9 @@ row_upd_build_difference_binary( /* NOTE: we compare the fields as binary strings! (No collation) */ - if (i == trx_id_pos || i == roll_ptr_pos) { + if (no_sys && (i == trx_id_pos || i == trx_id_pos + 1)) { - goto skip_compare; + continue; } if (!dfield_is_ext(dfield) @@ -907,8 +960,6 @@ row_upd_build_difference_binary( n_diff++; } -skip_compare: - ; } update->n_fields = n_diff; @@ -1386,9 +1437,9 @@ row_upd_changes_some_index_ord_field_binary( /***********************************************************//** Checks if an FTS Doc ID column is affected by an UPDATE. -@return TRUE if the Doc ID column is changed */ +@return whether the Doc ID column is changed */ UNIV_INTERN -ulint +bool row_upd_changes_doc_id( /*===================*/ dict_table_t* table, /*!< in: table */ @@ -1431,61 +1482,6 @@ row_upd_changes_fts_column( } /***********************************************************//** -Checks if an update vector changes the table's FTS-indexed columns. -NOTE: must not be called for tables which do not have an FTS-index. -Also, the vector returned must be explicitly freed as it's allocated -using the ut_malloc() allocator. -@return vector of FTS indexes that were affected by the update */ -UNIV_INTERN -ib_vector_t* -row_upd_changes_fts_columns( -/*========================*/ - dict_table_t* table, /*!< in: table */ - upd_t* update) /*!< in: update vector for the row */ -{ - ulint i; - ulint offset; - fts_t* fts = table->fts; - ib_vector_t* updated_fts_indexes = NULL; - - for (i = 0; i < upd_get_n_fields(update); ++i) { - upd_field_t* upd_field = upd_get_nth_field(update, i); - - offset = row_upd_changes_fts_column(table, upd_field); - - if (offset != ULINT_UNDEFINED) { - - dict_index_t* index; - - /* TODO: Investigate if we can check whether the - existing set of affected indexes matches the new - affected set. If matched then we don't need to - do the extra malloc()/free(). */ - - /* This vector is created from the ut_malloc() - allocator because we only want to keep one instance - around not matter how many times this row is - updated. 
The old entry should be deleted when - we update the FTS row info with this new vector. */ - if (updated_fts_indexes == NULL) { - ib_alloc_t* ut_alloc; - - ut_alloc = ib_ut_allocator_create(); - - updated_fts_indexes = ib_vector_create( - ut_alloc, sizeof(dict_index_t*), 2); - } - - index = static_cast<dict_index_t*>( - ib_vector_getp(fts->indexes, offset)); - ib_vector_push(updated_fts_indexes, &index); - } - } - - return(updated_fts_indexes); -} - -/***********************************************************//** Checks if an update vector changes some of the first ordering fields of an index record. This is only used in foreign key checks and we can assume that index does not contain column prefixes. @@ -1633,7 +1629,7 @@ row_upd_store_row( } node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets, - NULL, ext, node->heap); + NULL, NULL, NULL, ext, node->heap); if (node->is_delete) { node->upd_row = NULL; node->upd_ext = NULL; @@ -1652,8 +1648,8 @@ row_upd_store_row( Updates a secondary index entry of a row. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_sec_index_entry( /*====================*/ upd_node_t* node, /*!< in: row update node */ @@ -1667,11 +1663,13 @@ row_upd_sec_index_entry( dict_index_t* index; btr_cur_t* btr_cur; ibool referenced; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; trx_t* trx = thr_get_trx(thr); - ulint mode = BTR_MODIFY_LEAF; + ulint mode; enum row_search_result search_result; + ut_ad(trx->id); + index = node->index; referenced = row_upd_index_is_referenced(index, trx); @@ -1682,19 +1680,74 @@ row_upd_sec_index_entry( entry = row_build_index_entry(node->row, node->ext, index, heap); ut_a(entry); + log_free_check(); + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!trx->ddl) { + DEBUG_SYNC_C_IF_THD(trx->mysql_thd, + "before_row_upd_sec_index_entry"); + } +#endif /* UNIV_DEBUG */ + mtr_start(&mtr); + if (*index->name == TEMP_INDEX_PREFIX) { + /* The index->online_status may change if the + index->name starts with TEMP_INDEX_PREFIX (meaning + that the index is or was being created online). It is + protected by index->lock. */ + + mtr_s_lock(dict_index_get_lock(index), &mtr); + + switch (dict_index_get_online_status(index)) { + case ONLINE_INDEX_COMPLETE: + /* This is a normal index. Do not log anything. + Perform the update on the index tree directly. */ + break; + case ONLINE_INDEX_CREATION: + /* Log a DELETE and optionally INSERT. */ + row_log_online_op(index, entry, 0); + + if (!node->is_delete) { + mem_heap_empty(heap); + entry = row_build_index_entry( + node->upd_row, node->upd_ext, + index, heap); + ut_a(entry); + row_log_online_op(index, entry, trx->id); + } + /* fall through */ + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + mtr_commit(&mtr); + goto func_exit; + } + + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + mode = referenced + ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + : BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED + | BTR_DELETE_MARK; + } else { + /* For secondary indexes, + index->online_status==ONLINE_INDEX_CREATION unless + index->name starts with TEMP_INDEX_PREFIX. 
*/ + ut_ad(!dict_index_is_online_ddl(index)); + + /* We can only buffer delete-mark operations if there + are no foreign key constraints referring to the index. */ + mode = referenced + ? BTR_MODIFY_LEAF + : BTR_MODIFY_LEAF | BTR_DELETE_MARK; + } + /* Set the query thread, so that ibuf_insert_low() will be able to invoke thd_get_trx(). */ btr_pcur_get_btr_cur(&pcur)->thr = thr; - /* We can only try to use the insert/delete buffer to buffer - delete-mark operations if the index we're modifying has no foreign - key constraints referring to it. */ - if (!referenced) { - mode |= BTR_DELETE_MARK; - } - search_result = row_search_index_entry(index, entry, mode, &pcur, &mtr); @@ -1711,6 +1764,20 @@ row_upd_sec_index_entry( break; case ROW_NOT_FOUND: + if (*index->name == TEMP_INDEX_PREFIX) { + /* When online CREATE INDEX copied the update + that we already made to the clustered index, + and completed the secondary index creation + before we got here, the old secondary index + record would not exist. The CREATE INDEX + should be waiting for a MySQL meta-data lock + upgrade at least until this UPDATE + returns. After that point, the + TEMP_INDEX_PREFIX would be dropped from the + index name in commit_inplace_alter_table(). */ + break; + } + fputs("InnoDB: error in sec index entry update in\n" "InnoDB: ", stderr); dict_index_name_print(stderr, trx, index); @@ -1730,11 +1797,9 @@ row_upd_sec_index_entry( case ROW_FOUND: /* Delete mark the old index record; it can already be delete marked if we return after a lock wait in - row_ins_index_entry below */ - + row_ins_sec_index_entry() below */ if (!rec_get_deleted_flag( - rec, dict_table_is_comp(index->table))) { - + rec, dict_table_is_comp(index->table))) { err = btr_cur_del_mark_set_sec_rec( 0, btr_cur, TRUE, thr, &mtr); @@ -1764,13 +1829,15 @@ row_upd_sec_index_entry( goto func_exit; } + mem_heap_empty(heap); + /* Build a new index entry */ entry = row_build_index_entry(node->upd_row, node->upd_ext, index, heap); ut_a(entry); /* Insert new index entry */ - err = row_ins_index_entry(index, entry, 0, TRUE, thr); + err = row_ins_sec_index_entry(index, entry, thr); func_exit: mem_heap_free(heap); @@ -1783,8 +1850,8 @@ Updates the secondary index record if it is changed in the row update or deletes it if this is a delete. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_sec_step( /*=============*/ upd_node_t* node, /*!< in: row update node */ @@ -1897,8 +1964,8 @@ fields of the clustered index record change. This should be quite rare in database applications. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_clust_rec_by_insert( /*========================*/ upd_node_t* node, /*!< in/out: row update node */ @@ -1914,7 +1981,7 @@ row_upd_clust_rec_by_insert( trx_t* trx; dict_table_t* table; dtuple_t* entry; - ulint err; + dberr_t err; ibool change_ownership = FALSE; rec_t* rec; ulint* offsets = NULL; @@ -1939,7 +2006,7 @@ row_upd_clust_rec_by_insert( default: ut_error; case UPD_NODE_INSERT_BLOB: - /* A lock wait occurred in row_ins_index_entry() in + /* A lock wait occurred in row_ins_clust_index_entry() in the previous invocation of this function. Mark the off-page columns in the entry inherited. 
*/ @@ -1948,7 +2015,7 @@ row_upd_clust_rec_by_insert( ut_a(change_ownership); /* fall through */ case UPD_NODE_INSERT_CLUSTERED: - /* A lock wait occurred in row_ins_index_entry() in + /* A lock wait occurred in row_ins_clust_index_entry() in the previous invocation of this function. */ break; case UPD_NODE_UPDATE_CLUSTERED: @@ -1961,8 +2028,8 @@ row_upd_clust_rec_by_insert( ut_ad(page_rec_is_user_rec(rec)); err = btr_cur_del_mark_set_clust_rec( - BTR_NO_LOCKING_FLAG, btr_cur_get_block(btr_cur), - rec, index, offsets, TRUE, thr, mtr); + btr_cur_get_block(btr_cur), rec, index, offsets, + thr, mtr); if (err != DB_SUCCESS) { err_exit: mtr_commit(mtr); @@ -1999,9 +2066,9 @@ err_exit: mtr_commit(mtr); - err = row_ins_index_entry(index, entry, - node->upd_ext ? node->upd_ext->n_ext : 0, - TRUE, thr); + err = row_ins_clust_index_entry( + index, entry, thr, + node->upd_ext ? node->upd_ext->n_ext : 0); node->state = change_ownership ? UPD_NODE_INSERT_BLOB : UPD_NODE_INSERT_CLUSTERED; @@ -2027,11 +2094,17 @@ err_exit: offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap); ut_ad(page_rec_is_user_rec(rec)); + ut_ad(rec_get_deleted_flag(rec, rec_offs_comp(offsets))); btr_cur_disown_inherited_fields( btr_cur_get_page_zip(btr_cur), rec, index, offsets, node->update, mtr); + /* It is not necessary to call row_log_table for + this, because during online table rebuild, purge will + not free any BLOBs in the table, whether or not they + are owned by the clustered index record. */ + mtr_commit(mtr); } @@ -2045,20 +2118,24 @@ Updates a clustered index record of a row when the ordering fields do not change. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_clust_rec( /*==============*/ upd_node_t* node, /*!< in: row update node */ dict_index_t* index, /*!< in: clustered index */ + ulint* offsets,/*!< in: rec_get_offsets() on node->pcur */ + mem_heap_t** offsets_heap, + /*!< in/out: memory heap, can be emptied */ que_thr_t* thr, /*!< in: query thread */ mtr_t* mtr) /*!< in: mtr; gets committed here */ { - mem_heap_t* heap = NULL; - big_rec_t* big_rec = NULL; + mem_heap_t* heap = NULL; + big_rec_t* big_rec = NULL; btr_pcur_t* pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err; + const dtuple_t* rebuilt_old_pk = NULL; ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -2066,33 +2143,48 @@ row_upd_clust_rec( pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); - ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), + ut_ad(btr_cur_get_index(btr_cur) == index); + ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur), dict_table_is_comp(index->table))); + ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets)); + + if (dict_index_is_online_ddl(index)) { + rebuilt_old_pk = row_log_table_get_pk( + btr_cur_get_rec(btr_cur), index, offsets, &heap); + } /* Try optimistic updating of the record, keeping changes within the page; we do not check locks because we assume the x-lock on the record to update */ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { - err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = btr_cur_update_in_place( + BTR_NO_LOCKING_FLAG, btr_cur, + offsets, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); } else { - err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, - btr_cur, node->update, - node->cmpl_info, thr, mtr); + err = btr_cur_optimistic_update( + 
BTR_NO_LOCKING_FLAG, btr_cur, + &offsets, offsets_heap, node->update, + node->cmpl_info, thr, thr_get_trx(thr)->id, mtr); + } + + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_update(btr_cur_get_rec(btr_cur), + index, offsets, rebuilt_old_pk); } mtr_commit(mtr); if (UNIV_LIKELY(err == DB_SUCCESS)) { - return(DB_SUCCESS); + goto func_exit; } if (buf_LRU_buf_pool_running_out()) { - return(DB_LOCK_TABLE_FULL); + err = DB_LOCK_TABLE_FULL; + goto func_exit; } /* We may have to modify the tree structure: do a pessimistic descent down the index tree */ @@ -2110,14 +2202,16 @@ row_upd_clust_rec( ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur), dict_table_is_comp(index->table))); + if (!heap) { + heap = mem_heap_create(1024); + } + err = btr_cur_pessimistic_update( BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur, - &heap, &big_rec, node->update, node->cmpl_info, thr, mtr); + &offsets, offsets_heap, heap, &big_rec, + node->update, node->cmpl_info, + thr, thr_get_trx(thr)->id, mtr); if (big_rec) { - ulint offsets_[REC_OFFS_NORMAL_SIZE]; - rec_t* rec; - rec_offs_init(offsets_); - ut_a(err == DB_SUCCESS); /* Write out the externally stored columns while still x-latching @@ -2140,12 +2234,10 @@ row_upd_clust_rec( portion of the file, in case the file was somehow truncated in the crash. */ - rec = btr_cur_get_rec(btr_cur); DEBUG_SYNC_C("before_row_upd_extern"); err = btr_store_big_rec_extern_fields( - index, btr_cur_get_block(btr_cur), rec, - rec_get_offsets(rec, index, offsets_, - ULINT_UNDEFINED, &heap), + index, btr_cur_get_block(btr_cur), + btr_cur_get_rec(btr_cur), offsets, big_rec, mtr, BTR_STORE_UPDATE); DEBUG_SYNC_C("after_row_upd_extern"); /* If writing big_rec fails (for example, because of @@ -2164,9 +2256,14 @@ row_upd_clust_rec( ut_a(err == DB_SUCCESS); } - mtr_commit(mtr); + if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + row_log_table_update(btr_cur_get_rec(btr_cur), + index, offsets, rebuilt_old_pk); + } - if (UNIV_LIKELY_NULL(heap)) { + mtr_commit(mtr); +func_exit: + if (heap) { mem_heap_free(heap); } @@ -2180,8 +2277,8 @@ row_upd_clust_rec( /***********************************************************//** Delete marks a clustered index record. @return DB_SUCCESS if operation successfully completed, else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_del_mark_clust_rec( /*=======================*/ upd_node_t* node, /*!< in: row update node */ @@ -2196,7 +2293,7 @@ row_upd_del_mark_clust_rec( { btr_pcur_t* pcur; btr_cur_t* btr_cur; - ulint err; + dberr_t err; ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -2214,8 +2311,8 @@ row_upd_del_mark_clust_rec( locks, because we assume that we have an x-lock on the record */ err = btr_cur_del_mark_set_clust_rec( - BTR_NO_LOCKING_FLAG, btr_cur_get_block(btr_cur), - btr_cur_get_rec(btr_cur), index, offsets, TRUE, thr, mtr); + btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), + index, offsets, thr, mtr); if (err == DB_SUCCESS && referenced) { /* NOTE that the following call loses the position of pcur ! */ @@ -2232,8 +2329,8 @@ row_upd_del_mark_clust_rec( Updates the clustered index record. 
@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT in case of a lock wait, else error code */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd_clust_step( /*===============*/ upd_node_t* node, /*!< in: row update node */ @@ -2242,11 +2339,10 @@ row_upd_clust_step( dict_index_t* index; btr_pcur_t* pcur; ibool success; - ulint err; - mtr_t* mtr; - mtr_t mtr_buf; + dberr_t err; + mtr_t mtr; rec_t* rec; - mem_heap_t* heap = NULL; + mem_heap_t* heap = NULL; ulint offsets_[REC_OFFS_NORMAL_SIZE]; ulint* offsets; ibool referenced; @@ -2259,9 +2355,8 @@ row_upd_clust_step( pcur = node->pcur; /* We have to restore the cursor to its position */ - mtr = &mtr_buf; - mtr_start(mtr); + mtr_start(&mtr); /* If the restoration does not succeed, then the same transaction has deleted the record on which the cursor was, @@ -2273,12 +2368,32 @@ row_upd_clust_step( ut_a(pcur->rel_pos == BTR_PCUR_ON); - success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); + ulint mode; + +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD( + thr_get_trx(thr)->mysql_thd, + "innodb_row_upd_clust_step_enter"); + } +#endif /* UNIV_DEBUG */ + + if (dict_index_is_online_ddl(index)) { + ut_ad(node->table->id != DICT_INDEXES_ID); + mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; + mtr_s_lock(dict_index_get_lock(index), &mtr); + } else { + mode = BTR_MODIFY_LEAF; + } + + success = btr_pcur_restore_position(mode, pcur, &mtr); if (!success) { err = DB_RECORD_NOT_FOUND; - mtr_commit(mtr); + mtr_commit(&mtr); return(err); } @@ -2289,18 +2404,20 @@ row_upd_clust_step( if (node->is_delete && node->table->id == DICT_INDEXES_ID) { - dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); + ut_ad(!dict_index_is_online_ddl(index)); - mtr_commit(mtr); + dict_drop_index_tree(btr_pcur_get_rec(pcur), &mtr); - mtr_start(mtr); + mtr_commit(&mtr); + + mtr_start(&mtr); success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, - mtr); + &mtr); if (!success) { err = DB_ERROR; - mtr_commit(mtr); + mtr_commit(&mtr); return(err); } @@ -2315,7 +2432,7 @@ row_upd_clust_step( 0, btr_pcur_get_block(pcur), rec, index, offsets, thr); if (err != DB_SUCCESS) { - mtr_commit(mtr); + mtr_commit(&mtr); goto exit_func; } } @@ -2324,17 +2441,14 @@ row_upd_clust_step( if (node->is_delete) { err = row_upd_del_mark_clust_rec( - node, index, offsets, thr, referenced, mtr); + node, index, offsets, thr, referenced, &mtr); if (err == DB_SUCCESS) { node->state = UPD_NODE_UPDATE_ALL_SEC; node->index = dict_table_get_next_index(index); } -exit_func: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - return(err); + + goto exit_func; } /* If the update is made for MySQL, we already have the update vector @@ -2348,13 +2462,11 @@ exit_func: row_upd_eval_new_vals(node->update); } - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { - return(row_upd_clust_rec(node, index, thr, mtr)); + err = row_upd_clust_rec( + node, index, offsets, &heap, thr, &mtr); + goto exit_func; } row_upd_store_row(node); @@ -2374,20 +2486,21 @@ exit_func: externally! 
*/ err = row_upd_clust_rec_by_insert( - node, index, thr, referenced, mtr); + node, index, thr, referenced, &mtr); if (err != DB_SUCCESS) { - return(err); + goto exit_func; } node->state = UPD_NODE_UPDATE_ALL_SEC; } else { - err = row_upd_clust_rec(node, index, thr, mtr); + err = row_upd_clust_rec( + node, index, offsets, &heap, thr, &mtr); if (err != DB_SUCCESS) { - return(err); + goto exit_func; } node->state = UPD_NODE_UPDATE_SOME_SEC; @@ -2395,6 +2508,10 @@ exit_func: node->index = dict_table_get_next_index(index); +exit_func: + if (heap) { + mem_heap_free(heap); + } return(err); } @@ -2404,14 +2521,14 @@ to this node, we assume that we have a persistent cursor which was on a record, and the position of the cursor is stored in the cursor. @return DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ -static -ulint +static __attribute__((nonnull, warn_unused_result)) +dberr_t row_upd( /*====*/ upd_node_t* node, /*!< in: row update node */ que_thr_t* thr) /*!< in: query thread */ { - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; ut_ad(node && thr); @@ -2449,6 +2566,17 @@ row_upd( return(DB_SUCCESS); } +#ifdef UNIV_DEBUG + /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). + Once it is fixed, remove the 'ifdef', 'if' and this comment. */ + if (!thr_get_trx(thr)->ddl) { + DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd, + "after_row_upd_clust"); + } +#endif /* UNIV_DEBUG */ + + DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;); + do { /* Skip corrupted index */ dict_table_skip_corrupt_index(node->index); @@ -2458,7 +2586,6 @@ row_upd( } if (node->index->type != DICT_FTS) { - log_free_check(); err = row_upd_sec_step(node, thr); if (err != DB_SUCCESS) { @@ -2500,7 +2627,7 @@ row_upd_step( upd_node_t* node; sel_node_t* sel_node; que_node_t* parent; - ulint err = DB_SUCCESS; + dberr_t err = DB_SUCCESS; trx_t* trx; ut_ad(thr); @@ -2579,7 +2706,7 @@ row_upd_step( err = row_upd(node, thr); error_handling: - trx->error_state = static_cast<enum db_err>(err); + trx->error_state = err; if (err != DB_SUCCESS) { return(NULL); diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index 0aad8675ff8..2c3191928fd 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -114,7 +114,6 @@ row_vers_impl_x_locked_low( on rec. */ for (version = clust_rec;; version = prev_version) { - ulint err; row_ext_t* ext; const dtuple_t* row; dtuple_t* entry; @@ -128,24 +127,22 @@ row_vers_impl_x_locked_low( heap = mem_heap_create(1024); - err = trx_undo_prev_version_build( + trx_undo_prev_version_build( clust_rec, mtr, version, clust_index, clust_offsets, heap, &prev_version); - /* Free version and clust_offsets. */ + /* Free version and clust_offsets. */ mem_heap_free(old_heap); if (prev_version == NULL) { - /* clust_rec must be a fresh insert, because + /* clust_rec should be a fresh insert, because no previous version was found or the transaction has committed. The caller has to recheck as the synopsis of this function states, whether trx_id is active or not. 
*/ - ut_a(err == DB_SUCCESS || err == DB_MISSING_HISTORY); - break; } @@ -155,15 +152,16 @@ row_vers_impl_x_locked_low( vers_del = rec_get_deleted_flag(prev_version, comp); - prev_trx_id = row_get_rec_trx_id( - prev_version, clust_index, clust_offsets); + prev_trx_id = row_get_rec_trx_id(prev_version, clust_index, + clust_offsets); /* The stack of versions is locked by mtr. Thus, it is safe to fetch the prefixes for externally stored columns. */ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, - clust_offsets, NULL, &ext, heap); + clust_offsets, + NULL, NULL, NULL, &ext, heap); entry = row_build_index_entry(row, ext, index, heap); @@ -183,8 +181,6 @@ row_vers_impl_x_locked_low( There is no guarantee that the transaction is still active. */ - ut_ad(err == DB_SUCCESS); - /* We check if entry and rec are identified in the alphabetical ordering */ @@ -355,7 +351,6 @@ row_vers_old_has_index_entry( mem_heap_t* heap2; const dtuple_t* row; const dtuple_t* entry; - ulint err; ulint comp; ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) @@ -383,7 +378,8 @@ row_vers_old_has_index_entry( Thus, it is safe to fetch the prefixes for externally stored columns. */ row = row_build(ROW_COPY_POINTERS, clust_index, - rec, clust_offsets, NULL, &ext, heap); + rec, clust_offsets, + NULL, NULL, NULL, &ext, heap); entry = row_build_index_entry(row, ext, index, heap); /* If entry == NULL, the record contains unset BLOB @@ -420,12 +416,12 @@ row_vers_old_has_index_entry( for (;;) { heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, - clust_index, clust_offsets, - heap, &prev_version); + trx_undo_prev_version_build(rec, mtr, version, + clust_index, clust_offsets, + heap, &prev_version); mem_heap_free(heap2); /* free version and clust_offsets */ - if (err != DB_SUCCESS || !prev_version) { + if (!prev_version) { /* Versions end here */ mem_heap_free(heap); @@ -444,7 +440,7 @@ row_vers_old_has_index_entry( externally stored columns. */ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version, clust_offsets, - NULL, &ext, heap); + NULL, NULL, NULL, &ext, heap); entry = row_build_index_entry(row, ext, index, heap); /* If entry == NULL, the record contains unset @@ -477,7 +473,7 @@ read should see. We assume that the trx id stored in rec is such that the consistent read should not see rec in its present version. 
@return DB_SUCCESS or DB_MISSING_HISTORY */ UNIV_INTERN -ulint +dberr_t row_vers_build_for_consistent_read( /*===============================*/ const rec_t* rec, /*!< in: record in a clustered index; the @@ -495,8 +491,9 @@ row_vers_build_for_consistent_read( *old_vers is allocated; memory for possible intermediate versions is allocated and freed locally within the function */ - rec_t** old_vers)/*!< out, own: old version, or NULL if the - record does not exist in the view, that is, + rec_t** old_vers)/*!< out, own: old version, or NULL + if the history is missing or the record + does not exist in the view, that is, it was freshly inserted afterwards */ { const rec_t* version; @@ -504,7 +501,7 @@ row_vers_build_for_consistent_read( trx_id_t trx_id; mem_heap_t* heap = NULL; byte* buf; - ulint err; + dberr_t err; ut_ad(dict_index_is_clust(index)); ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX) @@ -558,27 +555,21 @@ row_vers_build_for_consistent_read( rec_offs_make_valid(*old_vers, index, *offsets); err = DB_SUCCESS; - break; } } err = trx_undo_prev_version_build(rec, mtr, version, index, *offsets, heap, - &prev_version); + &prev_version) + ? DB_SUCCESS : DB_MISSING_HISTORY; if (heap2) { mem_heap_free(heap2); /* free version */ } - if (err != DB_SUCCESS) { - break; - } - if (prev_version == NULL) { /* It was a freshly inserted version */ *old_vers = NULL; - err = DB_SUCCESS; - break; } @@ -602,8 +593,6 @@ row_vers_build_for_consistent_read( *old_vers = rec_copy(buf, prev_version, *offsets); rec_offs_make_valid(*old_vers, index, *offsets); - err = DB_SUCCESS; - break; } @@ -617,10 +606,9 @@ row_vers_build_for_consistent_read( /*****************************************************************//** Constructs the last committed version of a clustered index record, -which should be seen by a semi-consistent read. -@return DB_SUCCESS or DB_MISSING_HISTORY */ +which should be seen by a semi-consistent read. */ UNIV_INTERN -ulint +void row_vers_build_for_semi_consistent_read( /*====================================*/ const rec_t* rec, /*!< in: record in a clustered index; the @@ -644,7 +632,6 @@ row_vers_build_for_semi_consistent_read( const rec_t* version; mem_heap_t* heap = NULL; byte* buf; - ulint err; trx_id_t rec_trx_id = 0; ut_ad(dict_index_is_clust(index)); @@ -683,7 +670,7 @@ row_vers_build_for_semi_consistent_read( mutex_exit(&trx_sys->mutex); if (!version_trx) { - +committed_version_trx: /* We found a version that belongs to a committed transaction: return it. 
*/ @@ -693,7 +680,6 @@ row_vers_build_for_semi_consistent_read( if (rec == version) { *old_vers = rec; - err = DB_SUCCESS; break; } @@ -721,30 +707,30 @@ row_vers_build_for_semi_consistent_read( *old_vers = rec_copy(buf, version, *offsets); rec_offs_make_valid(*old_vers, index, *offsets); - err = DB_SUCCESS; - break; } + DEBUG_SYNC_C("after_row_vers_check_trx_active"); + heap2 = heap; heap = mem_heap_create(1024); - err = trx_undo_prev_version_build(rec, mtr, version, index, - *offsets, heap, - &prev_version); - if (heap2) { - mem_heap_free(heap2); /* free version */ + if (!trx_undo_prev_version_build(rec, mtr, version, index, + *offsets, heap, + &prev_version)) { + mem_heap_free(heap); + heap = heap2; + heap2 = NULL; + goto committed_version_trx; } - if (UNIV_UNLIKELY(err != DB_SUCCESS)) { - break; + if (heap2) { + mem_heap_free(heap2); /* free version */ } if (prev_version == NULL) { /* It was a freshly inserted version */ *old_vers = NULL; - err = DB_SUCCESS; - break; } @@ -759,6 +745,4 @@ row_vers_build_for_semi_consistent_read( if (heap) { mem_heap_free(heap); } - - return(err); } |
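Note: the hunks above replace plain ulint error returns with the typed dberr_t enum and change the callers of trx_undo_prev_version_build() so that the builder only reports whether a previous row version could be reconstructed, with the callers in row0vers.cc mapping that result onto DB_SUCCESS or DB_MISSING_HISTORY themselves. The sketch below is a minimal, self-contained illustration of that calling pattern; it is not InnoDB source, and apart from the dberr_t, DB_SUCCESS and DB_MISSING_HISTORY names every identifier in it is a hypothetical stand-in.

#include <cstdio>

/* Hypothetical, much-reduced stand-in for InnoDB's dberr_t. */
enum dberr_t {
	DB_SUCCESS,
	DB_MISSING_HISTORY
};

/* Stand-in for trx_undo_prev_version_build(): true when an older
   version of the record could be rebuilt from the undo log. */
static bool build_prev_version(int version, int* prev_version)
{
	if (version <= 0) {
		return(false);	/* no more history available */
	}

	*prev_version = version - 1;
	return(true);
}

/* Walks the version chain the way the patched callers do: the boolean
   result of the builder is translated into a dberr_t by the caller. */
static dberr_t walk_versions(int newest, int wanted)
{
	dberr_t	err	= DB_SUCCESS;
	int	version	= newest;

	while (version != wanted) {
		int	prev_version;

		err = build_prev_version(version, &prev_version)
			? DB_SUCCESS : DB_MISSING_HISTORY;

		if (err != DB_SUCCESS) {
			break;	/* history was purged before reaching it */
		}

		version = prev_version;
	}

	return(err);
}

int main()
{
	printf("%d %d\n",
	       walk_versions(3, 1) == DB_SUCCESS,
	       walk_versions(3, 5) == DB_MISSING_HISTORY);
	return(0);
}

The same ordering discipline appears in the row0upd.cc hunks: under online DDL the old primary key is captured with row_log_table_get_pk() before the clustered index record is modified, and row_log_table_update() is invoked only after the update succeeded, so the online-rebuild log always records a consistent before/after pair.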