summaryrefslogtreecommitdiff
path: root/storage/innobase/include/btr0btr.h
blob: a56598d3620cda7b28acb9094588ca88e624560f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
/*****************************************************************************

Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2014, 2023, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file include/btr0btr.h
The B-tree

Created 6/2/1994 Heikki Tuuri
*******************************************************/

#pragma once

#include "dict0dict.h"
#include "data0data.h"
#include "rem0types.h"
#include "page0cur.h"
#include "btr0types.h"
#include "gis0type.h"

#define BTR_MAX_NODE_LEVEL	50	/*!< Maximum B-tree page level
					(not really a hard limit).
					Used in debug assertions
					in btr_page_set_level and
					btr_page_get_level */

/** Maximum record size which can be stored on a page, without using the
special big record storage structure */
#define	BTR_PAGE_MAX_REC_SIZE	(srv_page_size / 2 - 200)

/** @brief Maximum depth of a B-tree in InnoDB.

Note that this isn't a maximum as such; none of the tree operations
avoid producing trees bigger than this. It is instead a "max depth
that other code must work with", useful for e.g.  fixed-size arrays
that must store some information about each level in a tree. In other
words: if a B-tree with bigger depth than this is encountered, it is
not acceptable for it to lead to mysterious memory corruption, but it
is acceptable for the program to die with a clear assert failure. */
#define BTR_MAX_LEVELS		100

#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode)		\
	btr_latch_mode((latch_mode) & ~(BTR_INSERT	\
				| BTR_DELETE_MARK		\
				| BTR_RTREE_UNDO_INS		\
				| BTR_RTREE_DELETE_MARK		\
				| BTR_DELETE			\
				| BTR_IGNORE_SEC_UNIQUE		\
				| BTR_ALREADY_S_LATCHED		\
				| BTR_LATCH_FOR_INSERT		\
				| BTR_LATCH_FOR_DELETE))

#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode)			\
	btr_latch_mode((latch_mode)					\
		       & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE))

/**************************************************************//**
Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
@return error code, or DB_SUCCESS */
dberr_t
btr_root_adjust_on_import(
/*======================*/
	const dict_index_t*	index)	/*!< in: index tree */
	MY_ATTRIBUTE((warn_unused_result));

/** Report a decryption failure. */
ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);

/** Get an index page and declare its latching order level.
@param[in]	index	index tree
@param[in]	page	page number
@param[in]	mode	latch mode
@param[in]	merge	whether change buffer merge should be attempted
@param[in,out]	mtr	mini-transaction
@param[out]	err	error code
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
                           uint32_t page, ulint mode, bool merge,
                           mtr_t *mtr, dberr_t *err= nullptr);

/**************************************************************//**
Gets the index id field of a page.
@return index id */
UNIV_INLINE
index_id_t
btr_page_get_index_id(
/*==================*/
	const page_t*	page)	/*!< in: index page */
	MY_ATTRIBUTE((warn_unused_result));
/** Read the B-tree or R-tree PAGE_LEVEL.
@param page B-tree or R-tree page
@return number of child page links to reach the leaf level
@retval 0 for leaf pages */
inline uint16_t btr_page_get_level(const page_t *page)
{
  uint16_t level= mach_read_from_2(my_assume_aligned<2>
                                   (PAGE_HEADER + PAGE_LEVEL + page));
  ut_ad(level <= BTR_MAX_NODE_LEVEL);
  return level;
} MY_ATTRIBUTE((warn_unused_result))

/** Read FIL_PAGE_NEXT.
@param page  buffer pool page
@return previous page number */
inline uint32_t btr_page_get_next(const page_t* page)
{
  return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
}

/** Read FIL_PAGE_PREV.
@param page  buffer pool page
@return previous page number */
inline uint32_t btr_page_get_prev(const page_t* page)
{
  return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
}

/**************************************************************//**
Gets the child node file address in a node pointer.
NOTE: the offsets array must contain all offsets for the record since
we read the last field according to offsets and assume that it contains
the child page number. In other words offsets must have been retrieved
with rec_get_offsets(n_fields=ULINT_UNDEFINED).
@return child node address */
UNIV_INLINE
uint32_t
btr_node_ptr_get_child_page_no(
/*===========================*/
	const rec_t*	rec,	/*!< in: node pointer record */
	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
	MY_ATTRIBUTE((warn_unused_result));

/** Create the root node for a new index tree.
@param[in]	type			type of the index
@param[in,out]	space			tablespace where created
@param[in]	index_id		index id
@param[in]	index			index, or NULL to create a system table
@param[in,out]	mtr			mini-transaction
@param[out]	err			error code
@return	page number of the created root
@retval	FIL_NULL	if did not succeed */
uint32_t
btr_create(
	ulint			type,
	fil_space_t*		space,
	index_id_t		index_id,
	dict_index_t*		index,
	mtr_t*			mtr,
	dberr_t*		err)
	MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));

/** Free a persistent index tree if it exists.
@param[in,out]	space		tablespce
@param[in]	page		root page number
@param[in]	index_id	PAGE_INDEX_ID contents
@param[in,out]	mtr		mini-transaction */
void btr_free_if_exists(fil_space_t *space, uint32_t page,
                        index_id_t index_id, mtr_t *mtr);

/** Drop a temporary table
@param table   temporary table */
void btr_drop_temporary_table(const dict_table_t &table);

/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
@param[in,out]	index	clustered index
@return	the last used AUTO_INCREMENT value
@retval	0 on error or if no AUTO_INCREMENT value was used yet */
ib_uint64_t
btr_read_autoinc(dict_index_t* index)
	MY_ATTRIBUTE((nonnull, warn_unused_result));

/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
or fall back to MAX(auto_increment_column).
@param[in]	table	table containing an AUTO_INCREMENT column
@param[in]	col_no	index of the AUTO_INCREMENT column
@return	the AUTO_INCREMENT value
@retval	0 on error or if no AUTO_INCREMENT value was used yet */
ib_uint64_t
btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
	MY_ATTRIBUTE((nonnull, warn_unused_result));

/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
@param[in,out]	index	clustered index
@param[in]	autoinc	the AUTO_INCREMENT value
@param[in]	reset	whether to reset the AUTO_INCREMENT
			to a possibly smaller value than currently
			exists in the page */
void
btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
	MY_ATTRIBUTE((nonnull));

/** Write instant ALTER TABLE metadata to a root page.
@param[in,out]	root	clustered index root page
@param[in]	index	clustered index with instant ALTER TABLE
@param[in,out]	mtr	mini-transaction */
void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);

ATTRIBUTE_COLD __attribute__((nonnull))
/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
@param[in]      index   clustered index with instant ALTER TABLE
@param[in]      all     whether to reset FIL_PAGE_TYPE as well
@param[in,out]  mtr     mini-transaction */
void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);

/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
NOTE that the operation of this function must always succeed,
we cannot reverse it: therefore enough free disk space must be
guaranteed to be available before this function is called.
@return inserted record */
rec_t*
btr_root_raise_and_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
				on the root page; when the function returns,
				the cursor is positioned on the predecessor
				of the inserted record */
	rec_offs**	offsets,/*!< out: offsets on inserted record */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
				that can be emptied, or NULL */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr,	/*!< in: mtr */
	dberr_t*	err)	/*!< out: error code */
	MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Reorganizes an index page.

IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index. This has to
be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.

@param cursor  page cursor
@param mtr     mini-transaction
@return error code
@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
  MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Decide if the page should be split at the convergence point of inserts
converging to the left.
@param[in]	cursor	insert position
@return the first record to be moved to the right half page
@retval	NULL if no split is recommended */
rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor);
/** Decide if the page should be split at the convergence point of inserts
converging to the right.
@param[in]	cursor		insert position
@param[out]	split_rec	if split recommended, the first record
				on the right half page, or
				NULL if the to-be-inserted record
				should be first
@return whether split is recommended */
bool
btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec);

/*************************************************************//**
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
released within this function! NOTE that the operation of this
function must always succeed, we cannot reverse it: therefore enough
free disk space (2 pages) must be guaranteed to be available before
this function is called.

@return inserted record */
rec_t*
btr_page_split_and_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
				function returns, the cursor is positioned
				on the predecessor of the inserted record */
	rec_offs**	offsets,/*!< out: offsets on inserted record */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
				that can be emptied, or NULL */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr,	/*!< in: mtr */
	dberr_t*	err)	/*!< out: error code */
	MY_ATTRIBUTE((nonnull, warn_unused_result));
/*******************************************************//**
Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
dberr_t
btr_insert_on_non_leaf_level(
	ulint		flags,	/*!< in: undo logging and locking flags */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: level, must be > 0 */
	dtuple_t*	tuple,	/*!< in: the record to be inserted */
	mtr_t*		mtr)	/*!< in: mtr */
	MY_ATTRIBUTE((nonnull, warn_unused_result));

/** Set a child page pointer record as the predefined minimum record.
@tparam has_prev  whether the page is supposed to have a left sibling
@param[in,out]  rec     leftmost record on a leftmost non-leaf page
@param[in,out]  block   buffer pool block
@param[in,out]  mtr     mini-transaction */
template<bool has_prev= false>
inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
                                 mtr_t *mtr)
{
  ut_ad(block.page.frame == page_align(rec));
  ut_ad(!page_is_leaf(block.page.frame));
  ut_ad(has_prev == page_has_prev(block.page.frame));

  rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;

  if (block.page.zip.data)
    /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
    page. We are not modifying the compressed page frame at all. */
    *rec|= REC_INFO_MIN_REC_FLAG;
  else
    mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
}

/** Seek to the parent page of a B-tree page.
@param[in,out]	mtr	mini-transaction
@param[in,out]	cursor	cursor pointing to the x-latched parent page
@return whether the cursor was successfully positioned */
bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
	MY_ATTRIBUTE((nonnull,warn_unused_result));
#ifdef UNIV_DEBUG
/************************************************************//**
Checks that the node pointer to a page is appropriate.
@return TRUE */
ibool
btr_check_node_ptr(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	mtr_t*		mtr)	/*!< in: mtr */
	MY_ATTRIBUTE((warn_unused_result));
#endif /* UNIV_DEBUG */
/*************************************************************//**
Tries to merge the page first to the left immediate brother if such a
brother exists, and the node pointers to the current page and to the
brother reside on the same page. If the left brother does not satisfy these
conditions, looks at the right brother. If the page is the only one on that
level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to
the brothers, if they exist.
@return error code
@retval DB_FAIL if the tree could not be merged */
dberr_t
btr_compress(
/*=========*/
	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
				or lift; the page must not be empty:
				when deleting records, use btr_discard_page()
				if the page would become empty */
	bool		adjust,	/*!< in: whether the cursor position should be
				adjusted even when compression occurs */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
	MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
dberr_t
btr_discard_page(
/*=============*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
				the root page */
	mtr_t*		mtr);	/*!< in: mtr */

/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated */
buf_block_t*
btr_page_alloc(
/*===========*/
	dict_index_t*	index,		/*!< in: index tree */
	uint32_t	hint_page_no,	/*!< in: hint of a good page */
	byte		file_direction,	/*!< in: direction where a possible
					page split is made */
	ulint		level,		/*!< in: level where the page is placed
					in the tree */
	mtr_t*		mtr,		/*!< in/out: mini-transaction
					for the allocation */
	mtr_t*		init_mtr,	/*!< in/out: mini-transaction
					for x-latching and initializing
					the page */
	dberr_t*	err)		/*!< out: error code */
	MY_ATTRIBUTE((warn_unused_result));
/** Empty an index page (possibly the root page). @see btr_page_create().
@param[in,out]	block		page to be emptied
@param[in,out]	page_zip	compressed page frame, or NULL
@param[in]	index		index of the page
@param[in]	level		B-tree level of the page (0=leaf)
@param[in,out]	mtr		mini-transaction */
void
btr_page_empty(
	buf_block_t*	block,
	page_zip_des_t*	page_zip,
	dict_index_t*	index,
	ulint		level,
	mtr_t*		mtr)
	MY_ATTRIBUTE((nonnull(1, 3, 5)));
/**************************************************************//**
Creates a new index page (not the root, and also not
used in page reorganization).  @see btr_page_empty(). */
void
btr_page_create(
/*============*/
	buf_block_t*	block,	/*!< in/out: page to be created */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr);	/*!< in: mtr */

/** Free an index page.
@param[in,out]	index	index tree
@param[in,out]	block	block to be freed
@param[in,out]	mtr	mini-transaction
@param[in]	blob	whether this is freeing a BLOB page
@param[in]	latched	whether index->table->space->x_lock() was called */
MY_ATTRIBUTE((nonnull))
dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
                      bool blob= false, bool space_latched= false);

/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
@return root page, x- or s-latched */
buf_block_t*
btr_root_block_get(
/*===============*/
	dict_index_t*		index,	/*!< in: index tree */
	rw_lock_type_t		mode,	/*!< in: either RW_S_LATCH
					or RW_X_LATCH */
	mtr_t*			mtr,	/*!< in: mtr */
	dberr_t*		err);	/*!< out: error code */
/*************************************************************//**
Reorganizes an index page.

IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index. This has to
be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.

@return error code
@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
dberr_t btr_page_reorganize_block(
	ulint		z_level,/*!< in: compression level to be used
				if dealing with compressed page */
	buf_block_t*	block,	/*!< in/out: B-tree page */
	dict_index_t*	index,	/*!< in: the index tree of the page */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
	__attribute__((nonnull, warn_unused_result));

#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
void
btr_print_size(
/*===========*/
	dict_index_t*	index)	/*!< in: index tree */
	MY_ATTRIBUTE((nonnull));
/**************************************************************//**
Prints directories and other info of all nodes in the index. */
void
btr_print_index(
/*============*/
	dict_index_t*	index,	/*!< in: index */
	ulint		width)	/*!< in: print this many entries from start
				and end */
	MY_ATTRIBUTE((nonnull));
#endif /* UNIV_BTR_PRINT */
/************************************************************//**
Checks the size and number of fields in a record based on the definition of
the index.
@return TRUE if ok */
ibool
btr_index_rec_validate(
/*===================*/
	const rec_t*		rec,		/*!< in: index record */
	const dict_index_t*	index,		/*!< in: index */
	ibool			dump_on_error)	/*!< in: TRUE if the function
						should print hex dump of record
						and page on error */
	MY_ATTRIBUTE((warn_unused_result));
/**************************************************************//**
Checks the consistency of an index tree.
@return	DB_SUCCESS if ok, error code if not */
dberr_t
btr_validate_index(
/*===============*/
	dict_index_t*	index,	/*!< in: index */
	const trx_t*	trx)	/*!< in: transaction or 0 */
	MY_ATTRIBUTE((warn_unused_result));

/** Remove a page from the level list of pages.
@param[in]	block		page to remove
@param[in]	index		index tree
@param[in,out]	mtr		mini-transaction */
dberr_t btr_level_list_remove(const buf_block_t& block,
                              const dict_index_t& index, mtr_t* mtr)
  MY_ATTRIBUTE((warn_unused_result));

/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
buf_block_t*
btr_lift_page_up(
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level;
				must not be empty: use
				btr_discard_only_page_on_level if the last
				record from the page should be removed */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	dberr_t*	err)	/*!< out: error code */
	__attribute__((nonnull));

#define BTR_N_LEAF_PAGES	1
#define BTR_TOTAL_SIZE		2

#include "btr0btr.inl"

/****************************************************************
Global variable controlling if scrubbing should be performed */
extern my_bool srv_immediate_scrub_data_uncompressed;