Diffstat (limited to 'storage/innobase/btr/btr0cur.cc')
 storage/innobase/btr/btr0cur.cc | 679 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 456 insertions(+), 223 deletions(-)
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 99794144a04..c5a50f24b83 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -210,6 +210,7 @@ btr_rec_free_externally_stored_fields(
 /** Latches the leaf page or pages requested.
 @param[in]	block		leaf page where the search converged
 @param[in]	page_id		page id of the leaf
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	latch_mode	BTR_SEARCH_LEAF, ...
 @param[in]	cursor		cursor
 @param[in]	mtr		mini-transaction
@@ -218,7 +219,7 @@ btr_latch_leaves_t
 btr_cur_latch_leaves(
 	buf_block_t*		block,
 	const page_id_t		page_id,
-	const page_size_t&	page_size,
+	ulint			zip_size,
 	ulint			latch_mode,
 	btr_cur_t*		cursor,
 	mtr_t*			mtr)
@@ -249,7 +250,7 @@ btr_cur_latch_leaves(
 		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
-		get_block = btr_block_get(page_id, page_size, mode,
+		get_block = btr_block_get(page_id, zip_size, mode,
					  cursor->index, mtr);
 		latch_leaves.blocks[1] = get_block;
 #ifdef UNIV_BTR_DEBUG
@@ -282,7 +283,7 @@ btr_cur_latch_leaves(
 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
 			get_block = btr_block_get(
 				page_id_t(page_id.space(), left_page_no),
-				page_size, RW_X_LATCH, cursor->index, mtr);
+				zip_size, RW_X_LATCH, cursor->index, mtr);
 			latch_leaves.blocks[0] = get_block;

 			if (spatial) {
@@ -298,7 +299,7 @@ btr_cur_latch_leaves(
 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
 		get_block = btr_block_get(
-			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
+			page_id, zip_size, RW_X_LATCH, cursor->index, mtr);
 		latch_leaves.blocks[1] = get_block;

 #ifdef UNIV_BTR_DEBUG
@@ -329,7 +330,7 @@ btr_cur_latch_leaves(
 			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
 			get_block = btr_block_get(
 				page_id_t(page_id.space(), right_page_no),
-				page_size, RW_X_LATCH, cursor->index, mtr);
+				zip_size, RW_X_LATCH, cursor->index, mtr);
 			latch_leaves.blocks[2] = get_block;
 #ifdef UNIV_BTR_DEBUG
 			ut_a(page_is_comp(get_block->frame)
@@ -357,7 +358,7 @@ btr_cur_latch_leaves(
 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
 			get_block = btr_block_get(
 				page_id_t(page_id.space(), left_page_no),
-				page_size, mode, cursor->index, mtr);
+				zip_size, mode, cursor->index, mtr);
 			latch_leaves.blocks[0] = get_block;
 			cursor->left_block = get_block;
 #ifdef UNIV_BTR_DEBUG
@@ -369,7 +370,7 @@ btr_cur_latch_leaves(
 		}

 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
-		get_block = btr_block_get(page_id, page_size, mode,
+		get_block = btr_block_get(page_id, zip_size, mode,
					  cursor->index, mtr);
 		latch_leaves.blocks[1] = get_block;
 #ifdef UNIV_BTR_DEBUG
@@ -422,8 +423,12 @@ unreadable:
 	}

 	btr_cur_t cur;
+	/* Relax the assertion in rec_init_offsets(). */
+	ut_ad(!index->in_instant_init);
+	ut_d(index->in_instant_init = true);
 	dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
						 &cur, 0, mtr);
+	ut_d(index->in_instant_init = false);
 	if (err != DB_SUCCESS) {
 		index->table->corrupted = true;
 		return err;
@@ -457,8 +462,8 @@ unreadable:
 		return DB_CORRUPTION;
 	}

-	if (info_bits != REC_INFO_MIN_REC_FLAG
-	    || (comp && rec_get_status(rec) != REC_STATUS_COLUMNS_ADDED)) {
+	if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
+	    || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
 incompatible:
 		ib::error() << "Table " << index->table->name
 			<< " contains unrecognizable instant ALTER metadata";
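[Note] Throughout this patch, const page_size_t& parameters become a plain ulint zip_size with one convention: 0 for an uncompressed tablespace, and the physical (on-disk) page size for a ROW_FORMAT=COMPRESSED one. A minimal standalone sketch of that convention, with illustrative helper names that are not InnoDB's:

#include <cassert>

typedef unsigned long ulint;

/* 0 = not compressed; otherwise the physical size of a
ROW_FORMAT=COMPRESSED page (a power of two, e.g. 8192). */
static ulint physical_size(ulint zip_size, ulint srv_page_size)
{
	return zip_size ? zip_size : srv_page_size;
}

int main()
{
	const ulint srv_page_size = 16384;
	assert(physical_size(0, srv_page_size) == 16384);   /* uncompressed */
	assert(physical_size(8192, srv_page_size) == 8192); /* KEY_BLOCK_SIZE=8 */
	return 0;
}

A single integer is cheaper to pass and compare than a page_size_t object; presumably that is the motivation for the sweeping signature change below.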
@@ -476,6 +481,72 @@ incompatible:
 	concurrent operations on the table, including table
 	eviction from the cache. */

+	if (info_bits & REC_INFO_DELETED_FLAG) {
+		/* This metadata record includes a BLOB that identifies
+		any dropped or reordered columns. */
+		ulint trx_id_offset = index->trx_id_offset;
+		if (!trx_id_offset) {
+			/* The PRIMARY KEY contains variable-length columns.
+			For the metadata record, variable-length columns are
+			always written with zero length. The DB_TRX_ID will
+			start right after any fixed-length columns. */
+			for (uint i = index->n_uniq; i--; ) {
+				trx_id_offset += index->fields[i].fixed_len;
+			}
+		}
+
+		const byte* ptr = rec + trx_id_offset
+			+ (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+		if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) {
+			goto incompatible;
+		}
+
+		uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
+		if (!len
+		    || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
+		    != FIL_PAGE_DATA
+		    || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+		    != space->id) {
+			goto incompatible;
+		}
+
+		buf_block_t* block = buf_page_get(
+			page_id_t(space->id,
				  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+			0, RW_S_LATCH, mtr);
+		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+		if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB
+		    || mach_read_from_4(&block->frame[FIL_PAGE_DATA
						      + BTR_BLOB_HDR_NEXT_PAGE_NO])
+		    != FIL_NULL
+		    || mach_read_from_4(&block->frame[FIL_PAGE_DATA
						      + BTR_BLOB_HDR_PART_LEN])
+		    != len) {
+			goto incompatible;
+		}
+
+		/* The unused part of the BLOB page should be zero-filled. */
+		for (const byte* b = block->frame
+		     + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
+		     * const end = block->frame + srv_page_size
+		     - BTR_EXTERN_LEN;
+		     b < end; ) {
+			if (*b++) {
+				goto incompatible;
+			}
+		}
+
+		if (index->table->deserialise_columns(
			    &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE],
			    len)) {
+			goto incompatible;
+		}
+
+		/* Proceed to initialize the default values of
+		any instantly added columns. */
+	}
+
 	mem_heap_t* heap = NULL;
 	ulint* offsets = rec_get_offsets(rec, index, NULL, true,
					 ULINT_UNDEFINED, &heap);
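[Note] The added block parses a 20-byte externally stored field reference by hand. A self-contained sketch of that layout, assuming the standard BTR_EXTERN_* byte offsets (space id at 0, page number at 4, offset at 8, 8-byte length at 12) and big-endian storage as in mach_read_from_4():

#include <cassert>
#include <cstdint>

/* Big-endian read, mirroring InnoDB's mach_read_from_4(). */
static uint32_t read4(const unsigned char* p)
{
	return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16)
	     | (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

struct ext_ref {
	uint32_t space_id, page_no, offset;
	uint64_t len; /* metadata check above requires the high 4 bytes = 0 */
};

static ext_ref parse_ext_ref(const unsigned char* ptr)
{
	ext_ref r;
	r.space_id = read4(ptr);      /* BTR_EXTERN_SPACE_ID */
	r.page_no  = read4(ptr + 4);  /* BTR_EXTERN_PAGE_NO */
	r.offset   = read4(ptr + 8);  /* BTR_EXTERN_OFFSET */
	r.len      = (uint64_t(read4(ptr + 12)) << 32) | read4(ptr + 16);
	return r;
}

int main()
{
	unsigned char ref[20] = {};
	ref[3] = 5;    /* space_id = 5 */
	ref[7] = 42;   /* page_no = 42 */
	ref[11] = 38;  /* offset = FIL_PAGE_DATA on an uncompressed page */
	ref[19] = 100; /* low length word = 100 bytes */
	ext_ref r = parse_ext_ref(ref);
	assert(r.space_id == 5 && r.page_no == 42 && r.len == 100);
	return 0;
}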
@@ -489,7 +560,8 @@ inconsistent:
 	record, it is also OK to perform READ UNCOMMITTED and
 	then ignore any extra fields, provided that
 	trx_sys.is_registered(DB_TRX_ID). */
-	if (rec_offs_n_fields(offsets) > index->n_fields
+	if (rec_offs_n_fields(offsets)
+	    > ulint(index->n_fields) + !!index->table->instant
 	    && !trx_sys.is_registered(current_trx(),
				      row_get_rec_trx_id(rec, index,
							 offsets))) {
@@ -497,10 +569,11 @@ inconsistent:
 	}

 	for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
-		ulint len;
-		const byte* data = rec_get_nth_field(rec, offsets, i, &len);
 		dict_col_t* col = index->fields[i].col;
-		ut_ad(!col->is_instant());
+		const unsigned o = i + !!index->table->instant;
+		ulint len;
+		const byte* data = rec_get_nth_field(rec, offsets, o, &len);
+		ut_ad(!col->is_added());
 		ut_ad(!col->def_val.data);
 		col->def_val.len = len;
 		switch (len) {
@@ -511,7 +584,7 @@ inconsistent:
 			continue;
 		}
 		ut_ad(len != UNIV_SQL_DEFAULT);
-		if (!rec_offs_nth_extern(offsets, i)) {
+		if (!rec_offs_nth_extern(offsets, o)) {
 			col->def_val.data = mem_heap_dup(
 				index->table->heap, data, len);
 		} else if (len < BTR_EXTERN_FIELD_REF_SIZE
@@ -523,7 +596,7 @@ inconsistent:
 		} else {
 			col->def_val.data = btr_copy_externally_stored_field(
 				&col->def_val.len, data,
-				dict_table_page_size(index->table),
+				cur.page_cur.block->zip_size(),
 				len, index->table->heap);
 		}
 	}
@@ -592,30 +665,49 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
 	const uint16_t n = page_get_instant(page);

-	if (n < index->n_uniq + DATA_ROLL_PTR || n > index->n_fields) {
+	if (n < index->n_uniq + DATA_ROLL_PTR) {
 		/* The PRIMARY KEY (or hidden DB_ROW_ID) and
 		DB_TRX_ID,DB_ROLL_PTR columns must always be present
-		as 'core' fields. All fields, including those for
-		instantly added columns, must be present in the data
-		dictionary. */
+		as 'core' fields. */
 		return true;
 	}

-	if (memcmp(page_get_infimum_rec(page), "infimum", 8)
-	    || memcmp(page_get_supremum_rec(page), "supremum", 8)) {
-		/* In a later format, these fields in a FIL_PAGE_TYPE_INSTANT
-		root page could be repurposed for something else. */
+	if (n > REC_MAX_N_FIELDS) {
 		return true;
 	}

 	index->n_core_fields = n;
-	ut_ad(!index->is_dummy);
-	ut_d(index->is_dummy = true);
-	index->n_core_null_bytes = n == index->n_fields
-		? UT_BITS_IN_BYTES(unsigned(index->n_nullable))
-		: UT_BITS_IN_BYTES(index->get_n_nullable(n));
-	ut_d(index->is_dummy = false);
-	return false;
+
+	const rec_t* infimum = page_get_infimum_rec(page);
+	const rec_t* supremum = page_get_supremum_rec(page);
+
+	if (!memcmp(infimum, "infimum", 8)
+	    && !memcmp(supremum, "supremum", 8)) {
+		if (n > index->n_fields) {
+			/* All fields, including those for instantly
+			added columns, must be present in the
+			data dictionary. */
+			return true;
+		}
+
+		ut_ad(!index->is_dummy);
+		ut_d(index->is_dummy = true);
+		index->n_core_null_bytes = UT_BITS_IN_BYTES(
			index->get_n_nullable(n));
+		ut_d(index->is_dummy = false);
+		return false;
+	}
+
+	if (memcmp(infimum, field_ref_zero, 8)
+	    || memcmp(supremum, field_ref_zero, 7)) {
+		/* The infimum and supremum records must either contain
+		the original strings, or they must be filled with zero
+		bytes, except for the bytes that we have repurposed. */
+		return true;
+	}
+
+	index->n_core_null_bytes = supremum[7];
+	return index->n_core_null_bytes > 128;
 }

 /** Optimistically latches the leaf page or pages requested.
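[Note] btr_cur_instant_root_init() now caches n_core_null_bytes either from get_n_nullable(n) or directly from the repurposed supremum byte (values above 128 are rejected). The UT_BITS_IN_BYTES count is plain ceiling division, one null-flag bit per nullable core field; a quick sketch:

#include <cassert>

/* Bytes needed to store n one-bit null flags (the macro's intent). */
static unsigned bits_in_bytes(unsigned n) { return (n + 7) / 8; }

int main()
{
	assert(bits_in_bytes(0) == 0);
	assert(bits_in_bytes(1) == 1);
	assert(bits_in_bytes(8) == 1);
	assert(bits_in_bytes(9) == 2);
	/* The repurposed supremum byte can encode at most 128,
	i.e. up to 1024 nullable core fields. */
	assert(bits_in_bytes(1017) == 128);
	return 0;
}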
@@ -673,8 +765,7 @@ btr_cur_optimistic_latch_leaves(
 			cursor->left_block = btr_block_get(
 				page_id_t(cursor->index->table->space_id,
					  left_page_no),
-				page_size_t(cursor->index->table->space
-					    ->flags),
+				cursor->index->table->space->zip_size(),
 				mode, cursor->index, mtr);
 		} else {
 			cursor->left_block = NULL;
@@ -775,7 +866,7 @@ btr_cur_latch_for_root_leaf(
 @param[in]	lock_intention	lock intention for the tree operation
 @param[in]	rec		record (current node_ptr)
 @param[in]	rec_size	size of the record or max size of node_ptr
-@param[in]	page_size	page size
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	mtr		mtr
 @return true if tree modification is needed */
 static
@@ -786,7 +877,7 @@ btr_cur_will_modify_tree(
 	btr_intention_t	lock_intention,
 	const rec_t*	rec,
 	ulint		rec_size,
-	const page_size_t&	page_size,
+	ulint		zip_size,
 	mtr_t*		mtr)
 {
 	ut_ad(!page_is_leaf(page));
@@ -874,9 +965,8 @@ btr_cur_will_modify_tree(
		This is based on the worst case, and we could invoke
		page_zip_available() on the block->page.zip. */
 		/* needs 2 records' space also for worst compress rate. */
-		if (page_size.is_compressed()
-		    && page_zip_empty_size(index->n_fields,
					   page_size.physical())
+		if (zip_size
+		    && page_zip_empty_size(index->n_fields, zip_size)
 		    <= rec_size * 2 + page_get_data_size(page)
 		    + page_dir_calc_reserved_space(n_recs + 2)) {
 			return(true);
@@ -1315,7 +1405,7 @@ btr_cur_search_to_nth_level_func(
	Free blocks and read IO bandwidth should be prior
	for them, when the history list is glowing huge. */
 	if (lock_intention == BTR_INTENTION_DELETE
-	    && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
+	    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
 	    && buf_get_n_pending_read_ios()) {
 		mtr_x_lock(dict_index_get_lock(index), mtr);
 	} else if (dict_index_is_spatial(index)
@@ -1372,7 +1462,7 @@ btr_cur_search_to_nth_level_func(
 	page_cursor = btr_cur_get_page_cur(cursor);

-	const page_size_t	page_size(index->table->space->flags);
+	const ulint		zip_size = index->table->space->zip_size();

 	/* Start with the root page. */
 	page_id_t		page_id(index->table->space_id, index->page);
@@ -1455,7 +1545,7 @@ search_loop:
 retry_page_get:
 	ut_ad(n_blocks < BTR_MAX_LEVELS);
 	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-	block = buf_page_get_gen(page_id, page_size, rw_latch, guess,
+	block = buf_page_get_gen(page_id, zip_size, rw_latch, guess,
				 buf_mode, file, line, mtr, &err);
 	tree_blocks[n_blocks] = block;

@@ -1491,7 +1581,7 @@ retry_page_get:
 			ut_ad(!dict_index_is_spatial(index));

 			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
-					page_id, page_size, cursor->thr)) {
+					page_id, zip_size, cursor->thr)) {

 				cursor->flag = BTR_CUR_INSERT_TO_IBUF;

@@ -1504,7 +1594,7 @@ retry_page_get:
 			ut_ad(!dict_index_is_spatial(index));

 			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
-					index, page_id, page_size,
+					index, page_id, zip_size,
					cursor->thr)) {

 				cursor->flag = BTR_CUR_DEL_MARK_IBUF;
@@ -1524,7 +1614,7 @@ retry_page_get:
 				/* The record cannot be purged yet. */
 				cursor->flag = BTR_CUR_DELETE_REF;
 			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
-					       index, page_id, page_size,
+					       index, page_id, zip_size,
					       cursor->thr)) {

 				/* The purge was buffered. */
@@ -1571,7 +1661,7 @@ retry_page_get:
					= mtr_set_savepoint(mtr);
 				get_block = buf_page_get_gen(
					page_id_t(page_id.space(),
						  left_page_no),
-					page_size, rw_latch, NULL, buf_mode,
+					zip_size, rw_latch, NULL, buf_mode,
					file, line, mtr, &err);
 				prev_tree_blocks[prev_n_blocks] = get_block;
 				prev_n_blocks++;
@@ -1601,7 +1691,7 @@ retry_page_get:
				tree_blocks[n_blocks]);

 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
+		block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL,
					 buf_mode, file, line, mtr, &err);
 		tree_blocks[n_blocks] = block;

@@ -1699,7 +1789,7 @@ retry_page_get:
 		if (rw_latch == RW_NO_LATCH) {
 			latch_leaves = btr_cur_latch_leaves(
-				block, page_id, page_size, latch_mode,
+				block, page_id, zip_size, latch_mode,
				cursor, mtr);
 		}

@@ -2063,7 +2153,7 @@ need_opposite_intention:
		    && latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
			    index, page, lock_intention, node_ptr,
-			    node_ptr_max_size, page_size, mtr)
+			    node_ptr_max_size, zip_size, mtr)
		    && !rtree_parent_modified) {
 			ut_ad(upper_rw_latch == RW_X_LATCH);
 			ut_ad(n_releases <= n_blocks);
@@ -2261,12 +2351,12 @@ need_opposite_intention:
 			if (latch_mode == BTR_CONT_MODIFY_TREE) {
 				child_block = btr_block_get(
-					page_id, page_size, RW_X_LATCH,
+					page_id, zip_size, RW_X_LATCH,
					index, mtr);
 			} else {
 				ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
 				child_block = btr_block_get(
-					page_id, page_size, RW_SX_LATCH,
+					page_id, zip_size, RW_SX_LATCH,
					index, mtr);
 			}

@@ -2321,9 +2411,10 @@ need_opposite_intention:
 			ut_ad(index->is_instant());
 			/* This may be a search tuple for
			btr_pcur_restore_position(). */
-			ut_ad(tuple->info_bits == REC_INFO_METADATA
-			      || tuple->info_bits == REC_INFO_MIN_REC_FLAG);
-		} else if (rec_is_metadata(btr_cur_get_rec(cursor), index)) {
+			ut_ad(tuple->is_metadata()
+			      || (tuple->is_metadata(tuple->info_bits
						     ^ REC_STATUS_INSTANT)));
+		} else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
 			/* Only user records belong in the adaptive
			hash index. */
 		} else {
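[Note] The assertions above tell metadata tuples apart by their info bits. A sketch of the intended encoding, assuming the usual rec.h values (REC_INFO_MIN_REC_FLAG = 0x10, REC_INFO_DELETED_FLAG = 0x20, REC_STATUS_INSTANT = 4); the generic-ALTER flavour additionally carries the delete-mark bit because it owns a column-mapping BLOB:

#include <cassert>

enum {
	REC_STATUS_INSTANT    = 4,    /* record status of a metadata record */
	REC_INFO_MIN_REC_FLAG = 0x10,
	REC_INFO_DELETED_FLAG = 0x20
};

/* Both metadata flavours match once the delete-mark bit is masked off. */
static bool is_metadata(unsigned info_bits)
{
	return (info_bits & ~unsigned(REC_INFO_DELETED_FLAG))
	    == (REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT);
}

int main()
{
	/* instant ADD COLUMN metadata */
	assert(is_metadata(REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT));
	/* instant ALTER metadata (carries the BLOB, hence delete-marked) */
	assert(is_metadata(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG
			   | REC_STATUS_INSTANT));
	assert(!is_metadata(0));                     /* ordinary user record */
	assert(!is_metadata(REC_INFO_DELETED_FLAG)); /* delete-marked record */
	return 0;
}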
@@ -2450,7 +2541,7 @@ btr_cur_open_at_index_side_func(
	Free blocks and read IO bandwidth should be prior
	for them, when the history list is glowing huge. */
 	if (lock_intention == BTR_INTENTION_DELETE
-	    && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
+	    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
 	    && buf_get_n_pending_read_ios()) {
 		mtr_x_lock(dict_index_get_lock(index), mtr);
 	} else {
@@ -2483,7 +2574,7 @@ btr_cur_open_at_index_side_func(
 	cursor->index = index;

 	page_id_t		page_id(index->table->space_id, index->page);
-	const page_size_t	page_size(index->table->space->flags);
+	const ulint		zip_size = index->table->space->zip_size();

 	if (root_leaf_rw_latch == RW_X_LATCH) {
 		node_ptr_max_size = btr_node_ptr_max_size(index);
@@ -2506,7 +2597,7 @@ btr_cur_open_at_index_side_func(
 		}

 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
+		block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL,
					 BUF_GET, file, line, mtr, &err);
 		ut_ad((block != NULL) == (err == DB_SUCCESS));
 		tree_blocks[n_blocks] = block;
@@ -2562,12 +2653,12 @@ btr_cur_open_at_index_side_func(
 		if (height == level) {
 			if (srv_read_only_mode) {
 				btr_cur_latch_leaves(
-					block, page_id, page_size,
+					block, page_id, zip_size,
					latch_mode, cursor, mtr);
 			} else if (height == 0) {
 				if (rw_latch == RW_NO_LATCH) {
 					btr_cur_latch_leaves(
-						block, page_id, page_size,
+						block, page_id, zip_size,
						latch_mode, cursor, mtr);
 				}
 				/* In versions <= 3.23.52 we had
@@ -2698,7 +2789,7 @@ btr_cur_open_at_index_side_func(
 		if (latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
			    cursor->index, page, lock_intention, node_ptr,
-			    node_ptr_max_size, page_size, mtr)) {
+			    node_ptr_max_size, zip_size, mtr)) {
 			ut_ad(upper_rw_latch == RW_X_LATCH);
 			ut_ad(n_releases <= n_blocks);
@@ -2795,7 +2886,7 @@ btr_cur_open_at_rnd_pos_func(
	Free blocks and read IO bandwidth should be prior
	for them, when the history list is glowing huge. */
 	if (lock_intention == BTR_INTENTION_DELETE
-	    && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
+	    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
 	    && buf_get_n_pending_read_ios()) {
 		mtr_x_lock(dict_index_get_lock(index), mtr);
 	} else {
@@ -2840,7 +2931,7 @@ btr_cur_open_at_rnd_pos_func(
 	cursor->index = index;

 	page_id_t		page_id(index->table->space_id, index->page);
-	const page_size_t	page_size(index->table->space->flags);
+	const ulint		zip_size = index->table->space->zip_size();
 	dberr_t			err = DB_SUCCESS;

 	if (root_leaf_rw_latch == RW_X_LATCH) {
@@ -2864,7 +2955,7 @@ btr_cur_open_at_rnd_pos_func(
 		}

 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
+		block = buf_page_get_gen(page_id, zip_size, rw_latch, NULL,
					 BUF_GET, file, line, mtr, &err);
 		tree_blocks[n_blocks] = block;

@@ -2917,7 +3008,7 @@ btr_cur_open_at_rnd_pos_func(
 			if (rw_latch == RW_NO_LATCH || srv_read_only_mode) {
 				btr_cur_latch_leaves(
-					block, page_id, page_size,
+					block, page_id, zip_size,
					latch_mode, cursor, mtr);
 			}

@@ -2993,7 +3084,7 @@ btr_cur_open_at_rnd_pos_func(
 		if (latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
			    cursor->index, page, lock_intention, node_ptr,
-			    node_ptr_max_size, page_size, mtr)) {
+			    node_ptr_max_size, zip_size, mtr)) {
 			ut_ad(upper_rw_latch == RW_X_LATCH);
 			ut_ad(n_releases <= n_blocks);
@@ -3170,8 +3261,11 @@ btr_cur_ins_lock_and_undo(
 		roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
 		if (!(flags & BTR_KEEP_SYS_FLAG)) {
 upd_sys:
-			row_upd_index_entry_sys_field(entry, index,
-						      DATA_ROLL_PTR, roll_ptr);
+			dfield_t* r = dtuple_get_nth_field(
				entry, index->db_roll_ptr());
+			ut_ad(r->len == DATA_ROLL_PTR_LEN);
+			trx_write_roll_ptr(static_cast<byte*>(r->data),
					   roll_ptr);
 		}
 	} else {
 		err = trx_undo_report_row_operation(thr, index, entry,
@@ -3203,12 +3297,12 @@ btr_cur_prefetch_siblings(
 	if (left_page_no != FIL_NULL) {
 		buf_read_page_background(
			page_id_t(block->page.id.space(), left_page_no),
-			block->page.size, false);
+			block->zip_size(), false);
 	}
 	if (right_page_no != FIL_NULL) {
 		buf_read_page_background(
			page_id_t(block->page.id.space(), right_page_no),
-			block->page.size, false);
+			block->zip_size(), false);
 	}
 	if (left_page_no != FIL_NULL || right_page_no != FIL_NULL) {
@@ -3275,23 +3369,28 @@ btr_cur_optimistic_insert(
	      || (flags & BTR_CREATE_FLAG));
 	ut_ad(dtuple_check_typed(entry));

-	const page_size_t& page_size = block->page.size;
-
 #ifdef UNIV_DEBUG_VALGRIND
-	if (page_size.is_compressed()) {
-		UNIV_MEM_ASSERT_RW(page, page_size.logical());
-		UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
+	if (block->page.zip.data) {
+		UNIV_MEM_ASSERT_RW(page, srv_page_size);
+		UNIV_MEM_ASSERT_RW(block->page.zip.data,
				   block->zip_size());
 	}
 #endif /* UNIV_DEBUG_VALGRIND */

 	leaf = page_is_leaf(page);

+	if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
+		ut_ad(leaf);
+		goto convert_big_rec;
+	}
+
 	/* Calculate the record size when entry is converted to a record */
 	rec_size = rec_get_converted_size(index, entry, n_ext);

 	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
-				   dtuple_get_n_fields(entry), page_size)) {
-
+				   dtuple_get_n_fields(entry),
+				   block->zip_size())) {
+convert_big_rec:
 		/* The record is so big that we have to store some fields
 		externally on separate database pages */
 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
@@ -3304,7 +3403,7 @@ btr_cur_optimistic_insert(
 		rec_size = rec_get_converted_size(index, entry, n_ext);
 	}

-	if (page_size.is_compressed()
-	    && page_zip_is_too_big(index, entry)) {
+	if (block->page.zip.data && page_zip_is_too_big(index, entry)) {
 		if (big_rec_vec != NULL) {
 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
 		}
@@ -3315,7 +3414,7 @@ btr_cur_optimistic_insert(
 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail);

-	if (leaf && page_size.is_compressed()
+	if (block->page.zip.data && leaf
 	    && (page_get_data_size(page) + rec_size
		>= dict_index_zip_pad_optimal_page_size(index))) {
 		/* If compression padding tells us that insertion will
@@ -3358,7 +3457,7 @@ fail_err:
	we have to split the page to reserve enough free space for
	future updates of records. */

-	if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
+	if (leaf && !block->page.zip.data && dict_index_is_clust(index)
 	    && page_get_n_recs(page) >= 2
 	    && dict_index_get_space_reserve() + rec_size > max_size
 	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
@@ -3421,7 +3520,7 @@ fail_err:
 	}

 	if (*rec) {
-	} else if (page_size.is_compressed()) {
+	} else if (block->page.zip.data) {
 		ut_ad(!index->table->is_temporary());
 		/* Reset the IBUF_BITMAP_FREE bits, because
		page_cur_tuple_insert() will have attempted page
@@ -3462,7 +3561,7 @@ fail_err:
 	} else if (index->disable_ahi) {
 # endif
 	} else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
-		ut_ad(entry->info_bits == REC_INFO_METADATA);
+		ut_ad(entry->is_metadata());
 		ut_ad(index->is_instant());
 		ut_ad(flags == BTR_NO_LOCKING_FLAG);
 	} else {
@@ -3497,7 +3596,7 @@ fail_err:
	committed mini-transaction, because in crash recovery,
	the free bits could momentarily be set too high. */

-	if (page_size.is_compressed()) {
+	if (block->page.zip.data) {
 		/* Update the bits in the same mini-transaction. */
 		ibuf_update_free_bits_zip(block, mtr);
 	} else {
@@ -3595,9 +3694,14 @@ btr_cur_pessimistic_insert(
 	}

 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
-				   dict_table_is_comp(index->table),
+				   index->table->not_redundant(),
				   dtuple_get_n_fields(entry),
-				   dict_table_page_size(index->table))) {
+				   btr_cur_get_block(cursor)->zip_size())
+	    || UNIV_UNLIKELY(entry->is_alter_metadata()
			     && !dfield_is_ext(
				     dtuple_get_nth_field(
					     entry,
					     index->first_user_field())))) {
 		/* The record is so big that we have to store some fields
 		externally on separate database pages */

@@ -3670,10 +3774,10 @@ btr_cur_pessimistic_insert(
 	if (index->disable_ahi); else
 # endif
 	if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
-		ut_ad(entry->info_bits == REC_INFO_METADATA);
+		ut_ad(entry->is_metadata());
 		ut_ad(index->is_instant());
-		ut_ad((flags & ulint(~BTR_KEEP_IBUF_BITMAP))
-		      == BTR_NO_LOCKING_FLAG);
+		ut_ad(flags & BTR_NO_LOCKING_FLAG);
+		ut_ad(!(flags & BTR_CREATE_FLAG));
 	} else {
 		btr_search_update_hash_on_insert(
			cursor, btr_get_search_latch(index));
@@ -3755,6 +3859,50 @@ btr_cur_upd_lock_and_undo(
				cmpl_info, rec, offsets, roll_ptr));
 }

+/** Copy DB_TRX_ID,DB_ROLL_PTR to the redo log.
+@param[in]	index		clustered index
+@param[in]	trx_id_t	DB_TRX_ID
+@param[in]	roll_ptr	DB_ROLL_PTR
+@param[in,out]	log_ptr		redo log buffer
+@return current end of the redo log buffer */
+static byte*
+btr_cur_log_sys(
+	const dict_index_t*	index,
+	trx_id_t		trx_id,
+	roll_ptr_t		roll_ptr,
+	byte*			log_ptr)
+{
+	log_ptr += mach_write_compressed(log_ptr, index->db_trx_id());
+	/* Yes, we are writing DB_ROLL_PTR,DB_TRX_ID in reverse order,
+	after emitting the position of DB_TRX_ID in the index.
+	This is how row_upd_write_sys_vals_to_log()
+	originally worked, and it is part of the redo log format. */
+	trx_write_roll_ptr(log_ptr, roll_ptr);
+	log_ptr += DATA_ROLL_PTR_LEN;
+	log_ptr += mach_u64_write_compressed(log_ptr, trx_id);
+
+	return log_ptr;
+}
+
+/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
+@param[in,out]	entry		clustered index entry
+@param[in]	index		clustered index
+@param[in]	trx_id		DB_TRX_ID
+@param[in]	roll_ptr	DB_ROLL_PTR */
+static void btr_cur_write_sys(
+	dtuple_t*		entry,
+	const dict_index_t*	index,
+	trx_id_t		trx_id,
+	roll_ptr_t		roll_ptr)
+{
+	dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+	ut_ad(t->len == DATA_TRX_ID_LEN);
+	trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
+	dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
+	ut_ad(r->len == DATA_ROLL_PTR_LEN);
+	trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+}
+
 /***********************************************************//**
 Writes a redo log record of updating a record in-place. */
 void
@@ -3794,8 +3942,7 @@ btr_cur_update_in_place_log(
 	log_ptr++;

 	if (dict_index_is_clust(index)) {
-		log_ptr = row_upd_write_sys_vals_to_log(
-			index, trx_id, roll_ptr, log_ptr, mtr);
+		log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr);
 	} else {
 		/* Dummy system fields for a secondary index */
 		/* TRX_ID Position */
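[Note] btr_cur_log_sys() deliberately keeps the historical field order of the redo format. A standalone sketch that pins down only that order — position of DB_TRX_ID first, then the 7-byte DB_ROLL_PTR, then DB_TRX_ID last; the variable-length mach_*_compressed() encodings are simplified to fixed-width writes here:

#include <cassert>
#include <cstdint>

typedef unsigned char byte;

static byte* write_sys_vals(byte* p, uint32_t trx_id_pos,
			     uint64_t roll_ptr, uint64_t trx_id)
{
	*p++ = byte(trx_id_pos);       /* assume the position fits one byte */
	for (int i = 6; i >= 0; i--) { /* DATA_ROLL_PTR_LEN == 7, big-endian */
		*p++ = byte(roll_ptr >> (8 * i));
	}
	for (int i = 5; i >= 0; i--) { /* DATA_TRX_ID_LEN == 6 */
		*p++ = byte(trx_id >> (8 * i));
	}
	return p;
}

int main()
{
	byte buf[16];
	byte* end = write_sys_vals(buf, 3, 0x02030405060708ULL, 42);
	assert(end - buf == 1 + 7 + 6);
	assert(buf[0] == 3);   /* DB_TRX_ID position is emitted first */
	assert(end[-1] == 42); /* DB_TRX_ID itself is written last */
	return 0;
}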
@@ -4148,6 +4295,72 @@ func_exit:
 	return(err);
 }

+/** Trim a metadata record during the rollback of instant ALTER TABLE.
+@param[in]	entry	metadata tuple
+@param[in]	index	primary key
+@param[in]	update	update vector for the rollback */
+ATTRIBUTE_COLD
+static void btr_cur_trim_alter_metadata(dtuple_t* entry,
					 const dict_index_t* index,
					 const upd_t* update)
+{
+	ut_ad(index->is_instant());
+	ut_ad(update->is_alter_metadata());
+	ut_ad(entry->is_alter_metadata());
+
+	ut_ad(update->fields[0].field_no == index->first_user_field());
+	ut_ad(update->fields[0].new_val.ext);
+	ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
+	ut_ad(entry->n_fields - 1 == index->n_fields);
+
+	const byte* ptr = static_cast<const byte*>(
		update->fields[0].new_val.data);
+	ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
	      == index->table->space->id);
+
+	ulint n_fields = update->fields[1].field_no;
+	ut_ad(n_fields <= index->n_fields);
+	if (n_fields != index->n_uniq) {
+		ut_ad(n_fields
		      >= index->n_core_fields);
+		entry->n_fields = n_fields;
+		return;
+	}
+
+	/* This is based on dict_table_t::deserialise_columns()
+	and btr_cur_instant_init_low(). */
+	mtr_t mtr;
+	mtr.start();
+	buf_block_t* block = buf_page_get(
		page_id_t(index->table->space->id,
			  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
		0, RW_S_LATCH, &mtr);
+	buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB);
+	ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
					     + BTR_BLOB_HDR_NEXT_PAGE_NO])
	      == FIL_NULL);
+	ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
					     + BTR_BLOB_HDR_PART_LEN])
	      == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
+	n_fields = mach_read_from_4(
		&block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
		+ index->first_user_field();
+	/* Rollback should not increase the number of fields. */
+	ut_ad(n_fields <= index->n_fields);
+	ut_ad(n_fields + 1 <= entry->n_fields);
+	/* dict_index_t::clear_instant_alter() cannot be invoked while
+	rollback of an instant ALTER TABLE transaction is in progress
+	for an is_alter_metadata() record. */
+	ut_ad(n_fields >= index->n_core_fields);
+
+	mtr.commit();
+	entry->n_fields = n_fields + 1;
+}
+
 /** Trim an update tuple due to instant ADD COLUMN, if needed.
 For normal records, the trailing instantly added fields that match
 the initial default values are omitted.
@@ -4169,13 +4382,12 @@ btr_cur_trim(
 	const que_thr_t*	thr)
 {
 	if (!index->is_instant()) {
-	} else if (UNIV_UNLIKELY(update->info_bits == REC_INFO_METADATA)) {
+	} else if (UNIV_UNLIKELY(update->is_metadata())) {
 		/* We are either updating a metadata record
-		(instantly adding columns to a table where instant ADD was
+		(instant ALTER TABLE on a table where instant ALTER was
 		already executed) or rolling back such an operation. */
 		ut_ad(!upd_get_nth_field(update, 0)->orig_len);
-		ut_ad(upd_get_nth_field(update, 0)->field_no
-		      > index->n_core_fields);
+		ut_ad(entry->is_metadata());

 		if (thr->graph->trx->in_rollback) {
 			/* This rollback can occur either as part of
@@ -4192,6 +4404,13 @@ btr_cur_trim(
			first instantly added column logged by
			innobase_add_instant_try(). */
 			ut_ad(update->n_fields > 2);
+
+			if (update->is_alter_metadata()) {
+				btr_cur_trim_alter_metadata(
					entry, index, update);
+				return;
+			}
+			ut_ad(!entry->is_alter_metadata());
+
 			ulint	n_fields = upd_get_nth_field(update, 0)
				->field_no;
 			ut_ad(n_fields + 1 >= entry->n_fields);
@@ -4277,9 +4496,7 @@ btr_cur_optimistic_update(
	      || trx_is_recv(thr_get_trx(thr)));
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

-	const bool is_metadata = update->info_bits == REC_INFO_METADATA;
-
-	if (UNIV_LIKELY(!is_metadata)
+	if (UNIV_LIKELY(!update->is_metadata())
 	    && !row_upd_changes_field_size_or_external(index, *offsets,
						       update)) {

@@ -4305,6 +4522,10 @@ any_extern:
 		return(DB_OVERFLOW);
 	}

+	if (rec_is_metadata(rec, *index) && index->table->instant) {
+		goto any_extern;
+	}
+
 	for (i = 0; i < upd_get_n_fields(update); i++) {
 		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {

@@ -4349,7 +4570,7 @@ any_extern:
 	if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
				   dict_index_get_n_fields(index),
-				   dict_table_page_size(index->table))) {
+				   block->zip_size())) {
 		goto any_extern;
 	}
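[Note] btr_cur_trim() never rewrites the tuple; during rollback the update vector's first field number already says how many fields survive, so the tuple is simply truncated. An illustrative reduction of that idea, with hypothetical types that are not InnoDB's:

#include <cassert>
#include <cstddef>

struct sketch_tuple  { size_t n_fields; };
struct sketch_update { size_t first_field_no; };

/* Compare the n_fields truncation in btr_cur_trim() above. */
static void trim_for_rollback(sketch_tuple& entry,
			       const sketch_update& update)
{
	if (update.first_field_no < entry.n_fields) {
		entry.n_fields = update.first_field_no;
	}
}

int main()
{
	sketch_tuple entry = { 7 };   /* 5 core fields + 2 instantly added */
	sketch_update update = { 5 }; /* first instantly added column */
	trim_for_rollback(entry, update);
	assert(entry.n_fields == 5);  /* added columns trimmed away */
	return 0;
}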
@@ -4363,10 +4584,10 @@ any_extern:
 	}

 	/* We limit max record size to 16k even for 64k page size. */
-	if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
-	    (!dict_table_is_comp(index->table)
-	     && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
-		err = DB_OVERFLOW;
+	if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
+	    (!dict_table_is_comp(index->table)
+	     && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
+		err = DB_OVERFLOW;

 		goto func_exit;
 	}
@@ -4439,8 +4660,8 @@ any_extern:
 		lock_rec_store_on_page_infimum(block, rec);
 	}

-	if (UNIV_UNLIKELY(is_metadata)) {
-		ut_ad(new_entry->info_bits == REC_INFO_METADATA);
+	if (UNIV_UNLIKELY(update->is_metadata())) {
+		ut_ad(new_entry->is_metadata());
 		ut_ad(index->is_instant());
 		/* This can be innobase_add_instant_try() performing a
		subsequent instant ADD COLUMN, or its rollback by
@@ -4455,10 +4676,7 @@ any_extern:
 	page_cur_move_to_prev(page_cursor);

 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
-		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
-		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx_id);
+		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
 	}

 	/* There are no externally stored columns in new_entry */
@@ -4466,7 +4684,7 @@ any_extern:
		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
 	ut_a(rec); /* <- We calculated above the insert would fit */

-	if (UNIV_UNLIKELY(is_metadata)) {
+	if (UNIV_UNLIKELY(update->is_metadata())) {
 		/* We must empty the PAGE_FREE list, because if this
		was a rollback, the shortened metadata record
		would have too many fields, and we would be unable to
@@ -4531,7 +4749,8 @@ btr_cur_pess_upd_restore_supremum(
 	const page_id_t	page_id(block->page.id.space(), prev_page_no);

 	ut_ad(prev_page_no != FIL_NULL);
-	prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
+	prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(),
						mtr);
 #ifdef UNIV_BTR_DEBUG
 	ut_a(btr_page_get_next(prev_block->frame, mtr)
	     == page_get_page_no(page));
@@ -4657,8 +4876,25 @@ btr_cur_pessimistic_update(
 	rec = btr_cur_get_rec(cursor);
 	ut_ad(rec_offs_validate(rec, index, *offsets));

-	dtuple_t*	new_entry = row_rec_to_index_entry(
-		rec, index, *offsets, &n_ext, entry_heap);
+	dtuple_t*	new_entry;
+
+	const bool is_metadata = rec_is_metadata(rec, *index);
+
+	if (UNIV_UNLIKELY(is_metadata)) {
+		ut_ad(update->is_metadata());
+		ut_ad(flags & BTR_NO_LOCKING_FLAG);
+		ut_ad(index->is_instant());
+		new_entry = row_metadata_to_tuple(
			rec, index, *offsets,
			&n_ext, entry_heap,
			update->info_bits, !thr_get_trx(thr)->in_rollback);
+		ut_ad(new_entry->n_fields
		      == ulint(index->n_fields)
		      + update->is_alter_metadata());
+	} else {
+		new_entry = row_rec_to_index_entry(rec, index, *offsets,
						   &n_ext, entry_heap);
+	}

 	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr. If the
@@ -4670,9 +4906,6 @@ btr_cur_pessimistic_update(
			entry_heap);
 	btr_cur_trim(new_entry, index, update, thr);

-	const bool is_metadata = new_entry->info_bits
-		& REC_INFO_MIN_REC_FLAG;
-
 	/* We have to set appropriate extern storage bits in the new
	record to be inserted: we have to remember which fields were such */

@@ -4700,11 +4933,14 @@ btr_cur_pessimistic_update(
 	}

 	if (page_zip_rec_needs_ext(
-			rec_get_converted_size(index, new_entry, n_ext),
-			page_is_comp(page),
-			dict_index_get_n_fields(index),
-			block->page.size)) {
-
+		    rec_get_converted_size(index, new_entry, n_ext),
+		    page_is_comp(page),
+		    dict_index_get_n_fields(index),
+		    block->zip_size())
+	    || (UNIV_UNLIKELY(update->is_alter_metadata())
		&& !dfield_is_ext(dtuple_get_nth_field(
					  new_entry,
					  index->first_user_field())))) {
 		big_rec_vec = dtuple_convert_big_rec(
			index, update, new_entry, &n_ext);
 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
@@ -4753,10 +4989,7 @@ btr_cur_pessimistic_update(
 	}

 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
-		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
-		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx_id);
+		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
 	}

 	if (!page_zip) {
@@ -4765,10 +4998,10 @@ btr_cur_pessimistic_update(
 	}

 	if (UNIV_UNLIKELY(is_metadata)) {
-		ut_ad(new_entry->info_bits == REC_INFO_METADATA);
+		ut_ad(new_entry->is_metadata());
 		ut_ad(index->is_instant());
 		/* This can be innobase_add_instant_try() performing a
-		subsequent instant ADD COLUMN, or its rollback by
+		subsequent instant ALTER TABLE, or its rollback by
		row_undo_mod_clust_low(). */
 		ut_ad(flags & BTR_NO_LOCKING_FLAG);
 	} else {
@@ -4817,7 +5050,8 @@ btr_cur_pessimistic_update(
			btr_cur_get_block(cursor), rec, block);
 	}

-	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
+	    || rec_is_alter_metadata(rec, *index)) {
 		/* The new inserted record owns its possible externally
		stored fields */
 		btr_cur_unmark_extern_fields(
@@ -5027,8 +5261,7 @@ btr_cur_del_mark_set_clust_rec_log(
 	*log_ptr++ = 0;
 	*log_ptr++ = 1;

-	log_ptr = row_upd_write_sys_vals_to_log(
-		index, trx_id, roll_ptr, log_ptr, mtr);
+	log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr);
 	mach_write_to_2(log_ptr, page_offset(rec));
 	log_ptr += 2;
@@ -5460,42 +5693,41 @@ btr_cur_optimistic_delete_func(
 	if (UNIV_UNLIKELY(block->page.id.page_no() == cursor->index->page
			  && page_get_n_recs(block->frame)
			  == 1 + (cursor->index->is_instant()
-				  && !rec_is_metadata(rec, cursor->index)))) {
+				  && !rec_is_metadata(rec, *cursor->index)))) {
 		/* The whole index (and table) becomes logically empty.
		Empty the whole page. That is, if we are deleting the
		only user record, also delete the metadata record
-		if one exists (it exists if and only if is_instant()).
+		if one exists for instant ADD COLUMN (not generic ALTER TABLE).
		If we are deleting the metadata record and the
		table becomes empty, clean up the whole page. */
 		dict_index_t*	index = cursor->index;
+		const rec_t*	first_rec = page_rec_get_next_const(
			page_get_infimum_rec(block->frame));
 		ut_ad(!index->is_instant()
-		      || rec_is_metadata(
			      page_rec_get_next_const(
				      page_get_infimum_rec(block->frame)),
			      index));
-		if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
				  & REC_INFO_MIN_REC_FLAG)) {
-			/* This should be rolling back instant ADD COLUMN.
-			If this is a recovered transaction, then
-			index->is_instant() will hold until the
-			insert into SYS_COLUMNS is rolled back. */
-			ut_ad(index->table->supports_instant());
-			ut_ad(index->is_primary());
-		} else {
-			lock_update_delete(block, rec);
-		}
-		btr_page_empty(block, buf_block_get_page_zip(block),
			       index, 0, mtr);
-		page_cur_set_after_last(block, btr_cur_get_page_cur(cursor));
-
-		if (index->is_primary()) {
-			/* Concurrent access is prevented by
-			root_block->lock X-latch, so this should be
-			safe. */
-			index->remove_instant();
+		      || rec_is_metadata(first_rec, *index));
+		const bool is_metadata = rec_is_metadata(rec, *index);
+		/* We can remove the metadata when rolling back an
+		instant ALTER TABLE operation, or when deleting the
+		last user record on the page such that only metadata for
+		instant ADD COLUMN (not generic ALTER TABLE) remains. */
+		const bool empty_table = is_metadata
			|| !index->is_instant()
			|| (first_rec != rec
			    && rec_is_add_metadata(first_rec, *index));
+		if (UNIV_LIKELY(empty_table)) {
+			if (UNIV_LIKELY(!is_metadata)) {
+				lock_update_delete(block, rec);
+			}
+			btr_page_empty(block, buf_block_get_page_zip(block),
				       index, 0, mtr);
+			if (index->is_instant()) {
+				/* MDEV-17383: free metadata BLOBs! */
+				index->clear_instant_alter();
+			}
+			page_cur_set_after_last(block,
						btr_cur_get_page_cur(cursor));
+			return true;
 		}
-
-		return true;
 	}

 	offsets = rec_get_offsets(rec, cursor->index, offsets, true,
@@ -5677,10 +5909,10 @@ btr_cur_pessimistic_delete(
 	}

 	if (page_is_leaf(page)) {
-		const bool is_metadata = rec_get_info_bits(
			rec, page_rec_is_comp(rec)) & REC_INFO_MIN_REC_FLAG;
+		const bool is_metadata = rec_is_metadata(
			rec, page_rec_is_comp(rec));
 		if (UNIV_UNLIKELY(is_metadata)) {
-			/* This should be rolling back instant ADD COLUMN.
+			/* This should be rolling back instant ALTER TABLE.
			If this is a recovered transaction, then
			index->is_instant() will hold until the
			insert into SYS_COLUMNS is rolled back. */
@@ -5696,30 +5928,34 @@ btr_cur_pessimistic_delete(
			goto discard_page;
 		}
 	} else if (page_get_n_recs(page) == 1
-		   + (index->is_instant()
-		      && !rec_is_metadata(rec, index))) {
+		   + (index->is_instant() && !is_metadata)) {
 		/* The whole index (and table) becomes logically empty.
		Empty the whole page. That is, if we are deleting the
		only user record, also delete the metadata record
-		if one exists (it exists if and only if is_instant()).
-		If we are deleting the metadata record and the
+		if one exists for instant ADD COLUMN
+		(not generic ALTER TABLE).
+		If we are deleting the metadata record
+		(in the rollback of instant ALTER TABLE) and the
		table becomes empty, clean up the whole page. */
+
+		const rec_t* first_rec = page_rec_get_next_const(
			page_get_infimum_rec(page));
 		ut_ad(!index->is_instant()
-		      || rec_is_metadata(
			      page_rec_get_next_const(
				      page_get_infimum_rec(page)),
			      index));
-		btr_page_empty(block, page_zip, index, 0, mtr);
-		page_cur_set_after_last(block,
					btr_cur_get_page_cur(cursor));
-		if (index->is_primary()) {
-			/* Concurrent access is prevented by
-			index->lock and root_block->lock
-			X-latch, so this should be safe. */
-			index->remove_instant();
+		      || rec_is_metadata(first_rec, *index));
+		if (is_metadata || !index->is_instant()
		    || (first_rec != rec
			&& rec_is_add_metadata(first_rec, *index))) {
+			btr_page_empty(block, page_zip, index, 0, mtr);
+			if (index->is_instant()) {
+				/* MDEV-17383: free metadata BLOBs! */
+				index->clear_instant_alter();
+			}
+			page_cur_set_after_last(
				block,
				btr_cur_get_page_cur(cursor));
+			ret = TRUE;
+			goto return_after_reservations;
 		}
-		ret = TRUE;
-		goto return_after_reservations;
 	}

 	if (UNIV_LIKELY(!is_metadata)) {
@@ -5827,7 +6063,7 @@ discard_page:
		|| btr_cur_will_modify_tree(
			index, page, BTR_INTENTION_DELETE, rec,
			btr_node_ptr_max_size(index),
-			block->page.size, mtr);
+			block->zip_size(), mtr);
 	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index,
			    offsets, mtr);
 #ifdef UNIV_ZIP_DEBUG
@@ -5994,7 +6230,7 @@ btr_estimate_n_rows_in_range_on_level(
 	const fil_space_t*	space = index->table->space;
 	page_id_t		page_id(space->id, slot1->page_no);
-	const page_size_t	page_size(space->flags);
+	const ulint		zip_size = space->zip_size();

 	level = slot1->page_level;

@@ -6011,7 +6247,7 @@ btr_estimate_n_rows_in_range_on_level(
		attempting to read a page that is no longer part of
		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
		silence a debug assertion about this. */
-		block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
+		block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
					 NULL, BUF_GET_POSSIBLY_FREED,
					 __FILE__, __LINE__, &mtr, &err);

@@ -7232,7 +7468,7 @@ struct btr_blob_log_check_t {
			mtr_x_lock(dict_index_get_lock(index), m_mtr);
			m_pcur->btr_cur.page_cur.block = btr_block_get(
				page_id_t(index->table->space_id, page_no),
-				page_size_t(index->table->space->flags),
+				index->table->space->zip_size(),
				RW_X_LATCH, index, m_mtr);
			m_pcur->btr_cur.page_cur.rec
				= m_pcur->btr_cur.page_cur.block->frame
@@ -7320,9 +7556,6 @@ btr_store_big_rec_extern_fields(
 	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
 	ut_a(dict_index_is_clust(index));

-	ut_a(dict_table_page_size(index->table)
-	     .equals_to(rec_block->page.size));
-
 	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
				      &rec, op);
 	page_zip = buf_block_get_page_zip(rec_block);
@@ -7366,15 +7599,13 @@ btr_store_big_rec_extern_fields(
 	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

-	const page_size_t	page_size(dict_table_page_size(index->table));
-
 	/* Space available in compressed page to carry blob data */
-	const ulint	payload_size_zip = page_size.physical()
+	const ulint	payload_size_zip = rec_block->physical_size()
		- FIL_PAGE_DATA;

 	/* Space available in uncompressed page to carry blob data */
-	const ulint	payload_size = page_size.physical()
-		- FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
+	const ulint	payload_size = payload_size_zip
+		- (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);

 	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */
@@ -7430,7 +7661,7 @@ btr_store_big_rec_extern_fields(
 		mtr.set_flush_observer(btr_mtr->get_flush_observer());

 		buf_page_get(rec_block->page.id,
-			     rec_block->page.size, RW_X_LATCH, &mtr);
+			     rec_block->zip_size(), RW_X_LATCH, &mtr);

 		if (prev_page_no == FIL_NULL) {
 			hint_page_no = 1 + rec_page_no;
@@ -7478,7 +7709,7 @@ btr_store_big_rec_extern_fields(
 				prev_block = buf_page_get(
					page_id_t(space_id, prev_page_no),
-					rec_block->page.size,
+					rec_block->zip_size(),
					RW_X_LATCH, &mtr);

 				buf_block_dbg_add_level(prev_block,
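[Note] The payload_size / payload_size_zip computation above decides how many BLOB bytes fit on one chain page. A sketch using the usual header/trailer sizes, assumed here to be FIL_PAGE_DATA = 38, FIL_PAGE_DATA_END = 8 and BTR_BLOB_HDR_SIZE = 8 (4-byte part length + 4-byte next-page number):

#include <cassert>

enum {
	FIL_PAGE_DATA     = 38,
	FIL_PAGE_DATA_END = 8,
	BTR_BLOB_HDR_SIZE = 8
};

static unsigned payload_per_blob_page(unsigned physical_size, bool compressed)
{
	const unsigned zip_payload = physical_size - FIL_PAGE_DATA;
	return compressed
		? zip_payload /* the compressed stream uses the whole rest */
		: zip_payload - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
}

int main()
{
	/* Matches payload_size / payload_size_zip above. */
	assert(payload_per_blob_page(16384, false) == 16384 - 38 - 8 - 8);
	assert(payload_per_blob_page(8192, true) == 8192 - 38);
	return 0;
}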
@@ -7567,16 +7798,20 @@ btr_store_big_rec_extern_fields(
						 + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
						 + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);
-
-				/* Zero out the unused part of the page. */
-				memset(page + page_zip_get_size(page_zip)
-				       - c_stream.avail_out,
-				       0, c_stream.avail_out);
 				mlog_log_string(page
						+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
						page_zip_get_size(page_zip)
-						- FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+						- FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+						- c_stream.avail_out,
						&mtr);
+				/* Zero out the unused part of the page. */
+				if (c_stream.avail_out) {
+					mlog_memset(block,
						    page_zip_get_size(page_zip)
						    - c_stream.avail_out,
						    c_stream.avail_out,
						    0, &mtr);
+				}

 				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
@@ -7837,10 +8072,9 @@ btr_free_externally_stored_field(
 	ut_ad(space_id == index->table->space->id);
 	ut_ad(space_id == index->table->space_id);

-	const page_size_t	ext_page_size(dict_table_page_size(index->table));
-	const page_size_t&	rec_page_size(rec == NULL
					      ? univ_page_size
					      : ext_page_size);
+	const ulint	ext_zip_size = index->table->space->zip_size();
+	const ulint	rec_zip_size = rec ? ext_zip_size : 0;
+
 	if (rec == NULL) {
 		/* This is a call from row_purge_upd_exist_or_extern(). */
 		ut_ad(!page_zip);
@@ -7867,7 +8101,7 @@ btr_free_externally_stored_field(
 #ifdef UNIV_DEBUG
 		rec_block =
 #endif /* UNIV_DEBUG */
-		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);
+		buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr);

 		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);

 		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
@@ -7893,13 +8127,13 @@ btr_free_externally_stored_field(
 		}

 		ext_block = buf_page_get(
-			page_id_t(space_id, page_no), ext_page_size,
+			page_id_t(space_id, page_no), ext_zip_size,
			RW_X_LATCH, &mtr);

 		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
 		page = buf_block_get_frame(ext_block);

-		if (ext_page_size.is_compressed()) {
+		if (ext_zip_size) {
 			/* Note that page_zip will be NULL in
			row_purge_upd_exist_or_extern(). */
 			switch (fil_page_get_type(page)) {
@@ -8068,7 +8302,7 @@ btr_copy_blob_prefix(
 		mtr_start(&mtr);

 		block = buf_page_get(page_id_t(space_id, page_no),
-				     univ_page_size, RW_S_LATCH, &mtr);
+				     0, RW_S_LATCH, &mtr);
 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
 		page = buf_block_get_frame(block);

@@ -8106,7 +8340,7 @@ by a lock or a page latch.
 @param[out]	buf		the externally stored part of the
 field, or a prefix of it
 @param[in]	len		length of buf, in bytes
-@param[in]	page_size	compressed BLOB page size
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
 @param[in]	space_id	space id of the BLOB pages
 @param[in]	offset		offset on the first BLOB page
 @return number of bytes written to buf */
@@ -8115,7 +8349,7 @@ ulint
 btr_copy_zblob_prefix(
 	byte*			buf,
 	ulint			len,
-	const page_size_t&	page_size,
+	ulint			zip_size,
 	ulint			space_id,
 	ulint			page_no,
 	ulint			offset)
@@ -8135,7 +8369,8 @@ btr_copy_zblob_prefix(
 	heap = mem_heap_create(40000);
 	page_zip_set_alloc(&d_stream, heap);

-	ut_ad(page_size.is_compressed());
+	ut_ad(zip_size);
+	ut_ad(ut_is_2pow(zip_size));
 	ut_ad(space_id);

 	err = inflateInit(&d_stream);
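[Note] btr_copy_zblob_prefix() drives zlib with one compressed BLOB page per iteration and stops once len output bytes have been produced. A minimal, self-contained analogue of that inflate loop (links with -lz; error handling reduced to asserts):

#include <zlib.h>
#include <cassert>
#include <cstring>

static size_t inflate_prefix(const unsigned char* in, size_t in_len,
			      unsigned char* out, size_t len)
{
	z_stream s;
	memset(&s, 0, sizeof s); /* zalloc/zfree = Z_NULL: default allocator */
	int err = inflateInit(&s);
	assert(err == Z_OK);
	s.next_in = const_cast<unsigned char*>(in);
	s.avail_in = uInt(in_len);
	s.next_out = out;
	s.avail_out = uInt(len);
	err = inflate(&s, Z_NO_FLUSH); /* as in the patch above */
	assert(err == Z_OK || err == Z_STREAM_END);
	size_t produced = len - s.avail_out;
	inflateEnd(&s);
	return produced;
}

int main()
{
	const unsigned char src[] = "hello, compressed BLOB prefix";
	unsigned char zbuf[128];
	uLongf zlen = sizeof zbuf;
	assert(compress(zbuf, &zlen, src, sizeof src) == Z_OK);

	unsigned char prefix[5];
	size_t n = inflate_prefix(zbuf, zlen, prefix, sizeof prefix);
	assert(n == 5 && !memcmp(prefix, "hello", 5));
	return 0;
}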
@@ -8150,7 +8385,7 @@ btr_copy_zblob_prefix(
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
 		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
-					 page_size);
+					 zip_size);

 		if (UNIV_UNLIKELY(!bpage)) {
 			ib::error() << "Cannot load compressed BLOB "
@@ -8182,8 +8417,7 @@ btr_copy_zblob_prefix(
 		}

 		d_stream.next_in = bpage->zip.data + offset;
-		d_stream.avail_in = static_cast<uInt>(page_size.physical()
						      - offset);
+		d_stream.avail_in = uInt(zip_size - offset);

 		err = inflate(&d_stream, Z_NO_FLUSH);
 		switch (err) {
@@ -8253,7 +8487,7 @@ by a lock or a page latch.
 @param[out]	buf		the externally stored part of the
 field, or a prefix of it
 @param[in]	len		length of buf, in bytes
-@param[in]	page_size	BLOB page size
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	space_id	space id of the first BLOB page
 @param[in]	page_no		page number of the first BLOB page
 @param[in]	offset		offset on the first BLOB page
@@ -8263,7 +8497,7 @@ ulint
 btr_copy_externally_stored_field_prefix_low(
 	byte*			buf,
 	ulint			len,
-	const page_size_t&	page_size,
+	ulint			zip_size,
 	ulint			space_id,
 	ulint			page_no,
 	ulint			offset)
@@ -8272,11 +8506,10 @@ btr_copy_externally_stored_field_prefix_low(
 		return(0);
 	}

-	if (page_size.is_compressed()) {
-		return(btr_copy_zblob_prefix(buf, len, page_size,
+	if (zip_size) {
+		return(btr_copy_zblob_prefix(buf, len, zip_size,
					     space_id, page_no, offset));
 	} else {
-		ut_ad(page_size.equals_to(univ_page_size));
 		return(btr_copy_blob_prefix(buf, len, space_id,
					    page_no, offset));
 	}
@@ -8286,7 +8519,7 @@ btr_copy_externally_stored_field_prefix_low(
 The clustered index record must be protected by a lock or a page latch.
 @param[out]	buf		the field, or a prefix of it
 @param[in]	len		length of buf, in bytes
-@param[in]	page_size	BLOB page size
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	data		'internally' stored part of the field
 containing also the reference to the external part; must be protected by
 a lock or a page latch
@@ -8297,7 +8530,7 @@ ulint
 btr_copy_externally_stored_field_prefix(
 	byte*			buf,
 	ulint			len,
-	const page_size_t&	page_size,
+	ulint			zip_size,
 	const byte*		data,
 	ulint			local_len)
 {
@@ -8336,7 +8569,7 @@ btr_copy_externally_stored_field_prefix(
 	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     len - local_len,
-							     page_size,
+							     zip_size,
							     space_id,
							     page_no, offset));
 }
@@ -8347,7 +8580,7 @@ btr_copy_externally_stored_field_prefix(
 The clustered index record must be protected by a lock or a page latch.
 @param[in]	data		'internally' stored part of the field
 containing also the reference to the external part; must be protected by
 a lock or a page latch
-@param[in]	page_size	BLOB page size
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	local_len	length of data
 @param[in,out]	heap		mem heap
 @return the whole field copied to heap */
@@ -8355,7 +8588,7 @@ byte*
 btr_copy_externally_stored_field(
 	ulint*			len,
 	const byte*		data,
-	const page_size_t&	page_size,
+	ulint			zip_size,
 	ulint			local_len,
 	mem_heap_t*		heap)
 {
@@ -8386,7 +8619,7 @@ btr_copy_externally_stored_field(
 	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
-							      page_size,
+							      zip_size,
							      space_id,
							      page_no, offset);

@@ -8397,7 +8630,7 @@ btr_copy_externally_stored_field(
 @param[in]	rec		record in a clustered index; must be
 protected by a lock or a page latch
 @param[in]	offset		array returned by rec_get_offsets()
-@param[in]	page_size	BLOB page size
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @param[in]	no		field number
 @param[out]	len		length of the field
 @param[in,out]	heap		mem heap
@@ -8406,7 +8639,7 @@ byte*
 btr_rec_copy_externally_stored_field(
 	const rec_t*	rec,
 	const ulint*	offsets,
-	const page_size_t& page_size,
+	ulint		zip_size,
 	ulint		no,
 	ulint*		len,
 	mem_heap_t*	heap)
@@ -8440,5 +8673,5 @@ btr_rec_copy_externally_stored_field(
 	}

 	return(btr_copy_externally_stored_field(len, data,
-						page_size, local_len, heap));
+						zip_size, local_len, heap));
 }