From b1646d0433c98662c50af029a121d681ddfb7a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Mon, 6 Mar 2023 17:17:32 +0200 Subject: MDEV-30567 rec_get_offsets() is not optimal rec_init_offsets_comp_ordinary(), rec_init_offsets(), rec_get_offsets_reverse(), rec_get_nth_field_offs_old(): Simplify some bitwise arithmetics to avoid conditional jumps, and add branch prediction hints with the assumption that most variable-length columns are short. Tested by: Matthias Leich --- storage/innobase/rem/rem0rec.cc | 208 +++++++++++++++++----------------------- 1 file changed, 90 insertions(+), 118 deletions(-) diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index 902f3f2d5ca..d54dc57655e 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -217,14 +217,12 @@ rec_get_n_extern_new( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - if (len & 0x40) { - n_extern++; - } - lens--; + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + if (len & 0x40) { + n_extern++; } + lens--; } } } while (++i < n); @@ -244,6 +242,10 @@ enum rec_leaf_format { REC_LEAF_INSTANT }; +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 11 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 to 10 need this */ +#endif /** Determine the offset to each field in a leaf-page record in ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED. This is a special case of rec_init_offsets() and rec_get_offsets_func(). @@ -361,8 +363,7 @@ start: do { if (mblob) { if (i == index->first_user_field()) { - offs = static_cast(offs - + FIELD_REF_SIZE); + offs += FIELD_REF_SIZE; len = combine(offs, STORED_OFFPAGE); any |= REC_OFFS_EXTERNAL; field--; @@ -433,27 +434,23 @@ start: stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if ((len & 0x80) && DATA_BIG_COL(col)) { + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { /* 1exxxxxxx xxxxxxxx */ - len = static_cast(len << 8 - | *lens--); - offs = static_cast(offs - + get_value(len)); - if (UNIV_UNLIKELY(len & 0x4000)) { - ut_ad(index->is_primary()); - any |= REC_OFFS_EXTERNAL; - len = combine(offs, STORED_OFFPAGE); - } else { - len = offs; - } - + len <<= 8; + len |= *lens--; + static_assert(STORED_OFFPAGE == 0x4000, ""); + static_assert(REC_OFFS_EXTERNAL == 0x4000, ""); + const rec_offs ext = len & REC_OFFS_EXTERNAL; + offs += get_value(len); + len = offs | ext; + any |= ext; + ut_ad(!ext || index->is_primary()); continue; } - len = offs = static_cast(offs + len); + len = offs += len; } else { - len = offs = static_cast(offs - + field->fixed_len); + len = offs += field->fixed_len; } } while (field++, rec_offs_base(offsets)[++i] = len, i < rec_offs_n_fields(offsets)); @@ -478,7 +475,7 @@ rec_offs_make_valid( { const bool is_alter_metadata = leaf && rec_is_alter_metadata(rec, *index); - ut_ad(is_alter_metadata + ut_ad((leaf && rec_is_metadata(rec, *index)) || index->is_dummy || index->is_ibuf() || (leaf ? 
rec_offs_n_fields(offsets) @@ -572,7 +569,8 @@ rec_offs_validate( } /* index->n_def == 0 for dummy indexes if !comp */ ut_ad(!comp || index->n_def); - ut_ad(!index->n_def || i <= max_n_fields); + ut_ad(!index->n_def || i <= max_n_fields + || rec_is_metadata(rec, *index)); } while (i--) { ulint curr = get_value(rec_offs_base(offsets)[1 + i]); @@ -610,7 +608,7 @@ rec_init_offsets( ulint i = 0; rec_offs offs; - /* This assertion was relaxed for the btr_cur_open_at_index_side() + /* This assertion was relaxed for the btr_cur_t::open_leaf() call in btr_cur_instant_init_low(). We cannot invoke index->is_instant(), because the same assertion would fail there until btr_cur_instant_init_low() has invoked @@ -678,8 +676,7 @@ rec_init_offsets( do { rec_offs len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs = static_cast( - offs + REC_NODE_PTR_SIZE); + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -719,29 +716,25 @@ rec_init_offsets( encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - len = static_cast( - len << 8 | *lens--); - - /* B-tree node pointers - must not contain externally - stored columns. Thus - the "e" flag must be 0. */ - ut_a(!(len & 0x4000)); - offs = static_cast( - offs + get_value(len)); - len = offs; - - goto resolved; - } + if (UNIV_UNLIKELY(len & 0x80) + && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + len <<= 8; + len |= *lens--; + + /* B-tree node pointers + must not contain externally + stored columns. Thus + the "e" flag must be 0. */ + ut_a(!(len & 0x4000)); + offs += len & 0x3fff; + len = offs; + goto resolved; } - len = offs = static_cast(offs + len); + len = offs += len; } else { - len = offs = static_cast( - offs + field->fixed_len); + len = offs += field->fixed_len; } resolved: rec_offs_base(offsets)[i + 1] = len; @@ -758,35 +751,30 @@ resolved: rec_offs any; if (rec_get_1byte_offs_flag(rec)) { - offs = static_cast(offs + n_fields); + offs += static_cast(n_fields); any = offs; /* Determine offsets to fields */ do { offs = rec_1_get_field_end_info(rec, i); if (offs & REC_1BYTE_SQL_NULL_MASK) { - offs &= static_cast( - ~REC_1BYTE_SQL_NULL_MASK); - set_type(offs, SQL_NULL); + offs ^= REC_1BYTE_SQL_NULL_MASK + | SQL_NULL; } rec_offs_base(offsets)[1 + i] = offs; } while (++i < n); } else { - offs = static_cast(offs + 2 * n_fields); + offs += static_cast(2 * n_fields); any = offs; /* Determine offsets to fields */ do { offs = rec_2_get_field_end_info(rec, i); - if (offs & REC_2BYTE_SQL_NULL_MASK) { - offs &= static_cast( - ~REC_2BYTE_SQL_NULL_MASK); - set_type(offs, SQL_NULL); - } - if (offs & REC_2BYTE_EXTERN_MASK) { - offs &= static_cast( - ~REC_2BYTE_EXTERN_MASK); - set_type(offs, STORED_OFFPAGE); - any |= REC_OFFS_EXTERNAL; - } + static_assert(REC_2BYTE_SQL_NULL_MASK + == SQL_NULL, ""); + static_assert(REC_2BYTE_EXTERN_MASK + == STORED_OFFPAGE, ""); + static_assert(REC_OFFS_EXTERNAL + == STORED_OFFPAGE, ""); + any |= (offs & REC_OFFS_EXTERNAL); rec_offs_base(offsets)[1 + i] = offs; } while (++i < n); } @@ -838,7 +826,7 @@ rec_get_offsets_func( bool alter_metadata = false; ut_ad(index->n_core_fields >= n_core); - /* This assertion was relaxed for the btr_cur_open_at_index_side() + /* This assertion was relaxed for the btr_cur_t::open_leaf() call in btr_cur_instant_init_low(). 
We cannot invoke index->is_instant(), because the same assertion would fail there until btr_cur_instant_init_low() has invoked @@ -863,19 +851,19 @@ rec_get_offsets_func( ut_ad(!n_core); n = dict_index_get_n_unique_in_tree_nonleaf(index) + 1; break; + default: + ut_ad("corrupted record header" == 0); + /* fall through */ case REC_STATUS_INFIMUM: case REC_STATUS_SUPREMUM: /* infimum or supremum record */ ut_ad(rec_get_heap_no_new(rec) == ulint(rec_get_status(rec) - == REC_STATUS_INFIMUM - ? PAGE_HEAP_NO_INFIMUM - : PAGE_HEAP_NO_SUPREMUM)); + == REC_STATUS_INFIMUM + ? PAGE_HEAP_NO_INFIMUM + : PAGE_HEAP_NO_SUPREMUM)); n = 1; break; - default: - ut_error; - return(NULL); } } else { n = rec_get_n_fields_old(rec); @@ -897,9 +885,7 @@ rec_get_offsets_func( ut_ad(!is_user_rec || !n_core || index->is_dummy || dict_index_is_ibuf(index) || n == n_fields /* btr_pcur_restore_position() */ - || (n + (index->id == DICT_INDEXES_ID) - >= n_core && n <= index->n_fields - + unsigned(rec_is_alter_metadata(rec, false)))); + || (n + (index->id == DICT_INDEXES_ID) >= n_core)); if (is_user_rec && n_core && n < index->n_fields) { ut_ad(!index->is_dummy); @@ -1000,8 +986,7 @@ rec_get_offsets_reverse( do { rec_offs len; if (UNIV_UNLIKELY(i == n_node_ptr_field)) { - len = offs = static_cast( - offs + REC_NODE_PTR_SIZE); + len = offs += REC_NODE_PTR_SIZE; goto resolved; } @@ -1038,30 +1023,23 @@ rec_get_offsets_reverse( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the field is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxxx xxxxxxxx */ - len = static_cast( - len << 8 | *lens++); - - offs = static_cast( - offs + get_value(len)); - if (UNIV_UNLIKELY(len & 0x4000)) { - any_ext = REC_OFFS_EXTERNAL; - len = combine(offs, - STORED_OFFPAGE); - } else { - len = offs; - } - - goto resolved; - } + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxxx xxxxxxxx */ + len &= 0x7f; + len <<= 8; + len |= *lens++; + static_assert(STORED_OFFPAGE == 0x4000, ""); + static_assert(REC_OFFS_EXTERNAL == 0x4000, ""); + rec_offs ext = len & REC_OFFS_EXTERNAL; + offs += get_value(len); + len = offs | ext; + any_ext |= ext; + goto resolved; } - len = offs = static_cast(offs + len); + len = offs += len; } else { - len = offs = static_cast(offs - + field->fixed_len); + len = offs += field->fixed_len; } resolved: rec_offs_base(offsets)[i + 1] = len; @@ -1101,7 +1079,7 @@ rec_get_nth_field_offs_old( return(os); } - next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK; + next_os &= ~REC_1BYTE_SQL_NULL_MASK; } else { os = rec_2_get_field_start_offs(rec, n); @@ -1113,8 +1091,7 @@ rec_get_nth_field_offs_old( return(os); } - next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK - | REC_2BYTE_EXTERN_MASK); + next_os &= ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK); } *len = next_os - os; @@ -1267,7 +1244,8 @@ rec_get_converted_size_comp_prefix_low( } else if (dfield_is_ext(dfield)) { ut_ad(DATA_BIG_COL(field->col)); extra_size += 2; - } else if (len < 128 || !DATA_BIG_COL(field->col)) { + } else if (UNIV_LIKELY(len < 128) + || !DATA_BIG_COL(field->col)) { extra_size++; } else { /* For variable-length columns, we look up the @@ -1618,14 +1596,7 @@ start: /* set the null flag if necessary */ if (dfield_is_null(field)) { -#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */ -#endif *nulls |= static_cast(null_mask); -#if defined __GNUC__ && !defined 
__clang__ && __GNUC__ < 6 -# pragma GCC diagnostic pop -#endif null_mask <<= 1; continue; } @@ -1734,6 +1705,9 @@ rec_convert_dtuple_to_rec_new( REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT); return buf; } +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 11 +# pragma GCC diagnostic pop /* ignored "-Wconversion" */ +#endif /*********************************************************//** Builds a physical record out of a data tuple and @@ -2096,14 +2070,12 @@ rec_copy_prefix_to_buf( stored in one byte for 0..127. The length will be encoded in two bytes when it is 128 or more, or when the column is stored externally. */ - if (DATA_BIG_COL(col)) { - if (len & 0x80) { - /* 1exxxxxx */ - len &= 0x3f; - len <<= 8; - len |= *lens--; - UNIV_PREFETCH_R(lens); - } + if (UNIV_UNLIKELY(len & 0x80) && DATA_BIG_COL(col)) { + /* 1exxxxxx */ + len &= 0x3f; + len <<= 8; + len |= *lens--; + UNIV_PREFETCH_R(lens); } prefix_len += len; } -- cgit v1.2.1 From 062ba0bd4a2da1fc720c7da8feb3f179a9be1583 Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Fri, 3 Mar 2023 19:05:44 +0530 Subject: MDEV-30183 Assertion `!memcmp(rec_trx_id, old_pk_trx_id->data, 6 + 7)' failed in row_log_table_apply_update - This failure caused by commit 358921ce32203a9a8dd277a5ba7ac177c9e79e53 row_ins_duplicate_online() should consider if the record is an exact match of the tuple when number of matching fields equals with number of unique fields + DB_TRX_ID + DB_ROLL_PTR --- .../suite/innodb/r/online_table_rebuild.result | 20 ++++++++++++++++++ .../suite/innodb/t/online_table_rebuild.test | 24 ++++++++++++++++++++++ storage/innobase/row/row0ins.cc | 2 +- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/mysql-test/suite/innodb/r/online_table_rebuild.result b/mysql-test/suite/innodb/r/online_table_rebuild.result index d4bddbc5305..46d9780decb 100644 --- a/mysql-test/suite/innodb/r/online_table_rebuild.result +++ b/mysql-test/suite/innodb/r/online_table_rebuild.result @@ -43,5 +43,25 @@ t1 CREATE TABLE `t1` ( PRIMARY KEY (`f1`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci DROP TABLE t1; +# +# MDEV-30183 Assertion `!memcmp(rec_trx_id, old_pk_trx_id->data, +# 6 + 7)' failed in row_log_table_apply_update +# +set @old_sql_mode = @@sql_mode; +set @@sql_mode=""; +CREATE TABLE t1(col_int int, col_varchar varchar(500))ENGINE=InnoDB; +INSERT INTO t1(col_int) values(2560); +set debug_sync="row_log_table_apply1_before SIGNAL con1_begin WAIT_FOR con1_commit"; +ALTER TABLE t1 ADD PRIMARY KEY ( col_varchar); +connection con1; +SET DEBUG_SYNC="now WAIT_FOR con1_begin"; +UPDATE t1 SET col_int = 2178; +INSERT INTO t1(col_int) VALUES(3016); +UPDATE t1 set col_int=2802; +SET DEBUG_SYNC="now SIGNAL con1_commit"; +connection default; +ERROR 23000: Duplicate entry '' for key 'PRIMARY' +DROP TABLE t1; +SET @@sql_mode = @old_sql_mode; disconnect con1; SET DEBUG_SYNC=reset; diff --git a/mysql-test/suite/innodb/t/online_table_rebuild.test b/mysql-test/suite/innodb/t/online_table_rebuild.test index 1d34738703c..02e9639eae2 100644 --- a/mysql-test/suite/innodb/t/online_table_rebuild.test +++ b/mysql-test/suite/innodb/t/online_table_rebuild.test @@ -59,5 +59,29 @@ connection default; reap; SHOW CREATE TABLE t1; DROP TABLE t1; + +--echo # +--echo # MDEV-30183 Assertion `!memcmp(rec_trx_id, old_pk_trx_id->data, +--echo # 6 + 7)' failed in row_log_table_apply_update +--echo # +set @old_sql_mode = @@sql_mode; +set @@sql_mode=""; +CREATE TABLE t1(col_int int, col_varchar varchar(500))ENGINE=InnoDB; +INSERT INTO 
t1(col_int) values(2560); +set debug_sync="row_log_table_apply1_before SIGNAL con1_begin WAIT_FOR con1_commit"; +send ALTER TABLE t1 ADD PRIMARY KEY ( col_varchar); + +connection con1; +SET DEBUG_SYNC="now WAIT_FOR con1_begin"; +UPDATE t1 SET col_int = 2178; +INSERT INTO t1(col_int) VALUES(3016); +UPDATE t1 set col_int=2802; +SET DEBUG_SYNC="now SIGNAL con1_commit"; + +connection default; +--error ER_DUP_ENTRY +reap; +DROP TABLE t1; +SET @@sql_mode = @old_sql_mode; disconnect con1; SET DEBUG_SYNC=reset; diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 3b21b0315cd..e327717ce65 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -2253,7 +2253,7 @@ row_ins_duplicate_online(ulint n_uniq, const dtuple_t *entry, ulint trx_id_len; - if (fields == n_uniq + if (fields == n_uniq + 2 && memcmp(rec_get_nth_field(rec, offsets, n_uniq, &trx_id_len), reset_trx_id, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) { ut_ad(trx_id_len == DATA_TRX_ID_LEN); -- cgit v1.2.1 From 2458badf9be6f47574e478177031395247c86f8f Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Tue, 7 Mar 2023 13:00:59 +0530 Subject: MDEV-30798 deadlock between CHECK TABLE and bulk insert - Deadlock happens when bulk insert acquires the space latch before acquiring the index root page and check table does the opposite. Workaround is to avoid validating the index for check table when bulk insert is in progress for the table. --- storage/innobase/handler/ha_innodb.cc | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 07454d24900..f8807e5fdc7 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -15184,16 +15184,26 @@ ha_innobase::check( } if ((check_opt->flags & T_QUICK) || index->is_corrupted()) { - } else if (btr_validate_index(index, m_prebuilt->trx) - != DB_SUCCESS) { - is_ok = false; - push_warning_printf(thd, - Sql_condition::WARN_LEVEL_WARN, - ER_NOT_KEYFILE, - "InnoDB: The B-tree of" - " index %s is corrupted.", - index->name()); - continue; + } else if (trx_id_t bulk_trx_id = + m_prebuilt->table->bulk_trx_id) { + if (!m_prebuilt->trx->read_view.changes_visible( + bulk_trx_id)) { + is_ok = true; + goto func_exit; + } + + if (btr_validate_index(index, m_prebuilt->trx) + != DB_SUCCESS) { + is_ok = false; + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + continue; + } } /* Instead of invoking change_active_index(), set up @@ -15316,6 +15326,7 @@ ha_innobase::check( } # endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ #endif /* BTR_CUR_HASH_ADAPT */ +func_exit: m_prebuilt->trx->op_info = ""; DBUG_RETURN(is_ok ? 
HA_ADMIN_OK : HA_ADMIN_CORRUPT); -- cgit v1.2.1 From 1e58b8afc086da755cf9209ed17fc36351da5563 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Tue, 7 Mar 2023 11:13:20 +0100 Subject: move alloca() definition from all *.h files to one new header file --- debian/libmariadb-dev.install | 1 + include/CMakeLists.txt | 3 ++ include/my_alloca.h | 45 ++++++++++++++++++++++ include/my_global.h | 8 +--- include/my_sys.h | 14 +------ include/mysql/service_encryption.h | 11 ++---- .../cracklib_password_check.c | 2 +- .../handlersocket/hstcpsvr_worker.cpp | 5 +-- plugin/handler_socket/libhsclient/allocator.hpp | 1 + 9 files changed, 58 insertions(+), 32 deletions(-) create mode 100644 include/my_alloca.h diff --git a/debian/libmariadb-dev.install b/debian/libmariadb-dev.install index 1e52e2acfdc..a0737fee00c 100644 --- a/debian/libmariadb-dev.install +++ b/debian/libmariadb-dev.install @@ -14,6 +14,7 @@ usr/include/mariadb/mariadb_version.h usr/include/mariadb/my_config.h usr/include/mariadb/my_global.h usr/include/mariadb/my_sys.h +usr/include/mariadb/my_alloca.h usr/include/mariadb/mysql.h usr/include/mariadb/mysql/ usr/include/mariadb/mysql/client_plugin.h diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 1024821e569..a82d1143649 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -37,6 +37,7 @@ SET(HEADERS ma_dyncol.h my_list.h my_alloc.h + my_alloca.h typelib.h my_dbug.h m_string.h @@ -111,7 +112,9 @@ ${footer} ENDMACRO() INSTALL_COMPAT_HEADER(my_global.h "") +INSTALL_COMPAT_HEADER(my_alloca.h "") INSTALL_COMPAT_HEADER(my_config.h "") +INSTALL_COMPAT_HEADER(my_alloca.h "") INSTALL_COMPAT_HEADER(my_sys.h "") INSTALL_COMPAT_HEADER(mysql_version.h " #include diff --git a/include/my_alloca.h b/include/my_alloca.h new file mode 100644 index 00000000000..761c2adb890 --- /dev/null +++ b/include/my_alloca.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2023, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef MY_ALLOCA_INCLUDED +#define MY_ALLOCA_INCLUDED + +#ifdef _WIN32 +#include /*for alloca*/ +/* + MSVC may define "alloca" when compiling in /Ze mode + (with extensions from Microsoft), but otherwise only + the _alloca function is defined: +*/ +#ifndef alloca +#define alloca _alloca +#endif +#else +#ifdef HAVE_ALLOCA_H +#include +#endif +#endif + +#if defined(HAVE_ALLOCA) +/* + If the GCC/LLVM compiler from the MinGW is used, + alloca may not be defined when using the MSVC CRT: +*/ +#if defined(__GNUC__) && !defined(HAVE_ALLOCA_H) && !defined(alloca) +#define alloca __builtin_alloca +#endif /* GNUC */ +#endif + +#endif /* MY_ALLOCA_INCLUDED */ diff --git a/include/my_global.h b/include/my_global.h index 586b3f7c59c..d4097820639 100644 --- a/include/my_global.h +++ b/include/my_global.h @@ -318,13 +318,6 @@ C_MODE_END #ifdef HAVE_UNISTD_H #include #endif -#if defined(__cplusplus) && defined(NO_CPLUSPLUS_ALLOCA) -#undef HAVE_ALLOCA -#undef HAVE_ALLOCA_H -#endif -#ifdef HAVE_ALLOCA_H -#include -#endif #include /* Recommended by debian */ /* We need the following to go around a problem with openssl on solaris */ @@ -481,6 +474,7 @@ typedef unsigned short ushort; #endif #include +#include /* Wen using the embedded library, users might run into link problems, diff --git a/include/my_sys.h b/include/my_sys.h index 0b851841f58..2a7cb32fe68 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -28,9 +28,7 @@ C_MODE_START #include /* for CHARSET_INFO */ #include #include -#ifdef _WIN32 -#include /*for alloca*/ -#endif +#include #include #include @@ -185,16 +183,6 @@ my_bool my_test_if_atomic_write(File handle, int pagesize); extern my_bool my_may_have_atomic_write; #if defined(HAVE_ALLOCA) && !defined(HAVE_valgrind) -#if defined(_AIX) && !defined(__GNUC__) && !defined(_AIX43) -#pragma alloca -#endif /* _AIX */ -#if defined(__MWERKS__) -#undef alloca -#define alloca _alloca -#endif /* __MWERKS__ */ -#if defined(__GNUC__) && !defined(HAVE_ALLOCA_H) && ! defined(alloca) -#define alloca __builtin_alloca -#endif /* GNUC */ #define my_alloca(SZ) alloca((size_t) (SZ)) #define my_afree(PTR) ((void)0) #define MAX_ALLOCA_SZ 4096 diff --git a/include/mysql/service_encryption.h b/include/mysql/service_encryption.h index 69d205a27e8..280b9c69e35 100644 --- a/include/mysql/service_encryption.h +++ b/include/mysql/service_encryption.h @@ -24,22 +24,19 @@ *provider* (encryption plugin). 
*/ -#ifdef __cplusplus -extern "C" { -#endif - #ifndef MYSQL_ABI_CHECK +#include #ifdef _WIN32 -#include #ifndef __cplusplus #define inline __inline #endif #else #include -#ifdef HAVE_ALLOCA_H -#include #endif #endif + +#ifdef __cplusplus +extern "C" { #endif /* returned from encryption_key_get_latest_version() */ diff --git a/plugin/cracklib_password_check/cracklib_password_check.c b/plugin/cracklib_password_check/cracklib_password_check.c index 470e6e5280f..5a7c7f3f234 100644 --- a/plugin/cracklib_password_check/cracklib_password_check.c +++ b/plugin/cracklib_password_check/cracklib_password_check.c @@ -13,7 +13,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ -#include +#include #include #include #include diff --git a/plugin/handler_socket/handlersocket/hstcpsvr_worker.cpp b/plugin/handler_socket/handlersocket/hstcpsvr_worker.cpp index 9863602af7a..f6bbe9004c2 100644 --- a/plugin/handler_socket/handlersocket/hstcpsvr_worker.cpp +++ b/plugin/handler_socket/handlersocket/hstcpsvr_worker.cpp @@ -6,7 +6,7 @@ * See COPYRIGHT.txt for details. */ -#include +#include #include #include #include @@ -17,9 +17,6 @@ #if __linux__ #include #endif -#ifdef HAVE_ALLOCA_H -#include -#endif #include "hstcpsvr_worker.hpp" #include "string_buffer.hpp" diff --git a/plugin/handler_socket/libhsclient/allocator.hpp b/plugin/handler_socket/libhsclient/allocator.hpp index dd3a28ba7bd..9df6a1ab752 100644 --- a/plugin/handler_socket/libhsclient/allocator.hpp +++ b/plugin/handler_socket/libhsclient/allocator.hpp @@ -11,6 +11,7 @@ #include #include +#include #if 0 extern "C" { -- cgit v1.2.1 From 20d2c9038aa6e1582d9f76ba866b250438887208 Mon Sep 17 00:00:00 2001 From: Robin Newhouse Date: Wed, 8 Feb 2023 19:49:47 +0000 Subject: Fix mini-benchmark The mini-benchmark.sh script failed to run in the latest Fedora distributions in GitLab CI. It requires `lscpu` resolved by installing util-linux. Additionally, executing the benchmark inside a Docker container had failed because of increased Docker security in recent updates. In particular the `renice` and `taskset` operations are not permitted. Neither are the required `perf` operations. https://docs.docker.com/engine/security/seccomp/ Allow these operations to fail gracefully, and test then skip `perf`, allowing the remaining benchmark activities to proceed. Other minor changes to the CI are included such as allowing sanitizer jobs to fail and using "needs" in the mini-benchmark pipeline. All new code of the whole pull request, including one or several files that are either new files or modified ones, are contributed under the BSD-new license. I am contributing on behalf of my employer Amazon Web Services, Inc. 
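The essence of the change is a graceful-degradation pattern: probe whether a privileged operation is allowed, and fall back with a log message instead of failing the whole run. A simplified sketch follows (not the literal mini-benchmark.sh code; the perf probe and the final sysbench invocation are illustrative assumptions):

  # Assumption: the server is already running; mini-benchmark.sh obtains
  # the PID differently.
  MARIADB_SERVER_PID="$(pgrep -nx mariadbd)"

  # Only rely on perf if the kernel/seccomp policy actually allows it.
  if perf stat -- true >/dev/null 2>&1
  then
    PERF="perf stat -p $MARIADB_SERVER_PID --"
  else
    echo "perf does not have permission to run on this system. Skipping."
    PERF=""
  fi

  # Privileged tuning must not abort the benchmark inside a locked-down
  # container: log the failure and continue instead of exiting on error.
  renice --priority -20 --pid "$MARIADB_SERVER_PID" ||
    echo "renice failed. Not setting priority."
  taskset -cp 0 "$MARIADB_SERVER_PID" ||
    echo "taskset failed. Not setting cpu affinity."

  # The benchmark command is then prefixed with $PERF when it is non-empty.
  $PERF sysbench oltp_read_write --time=60 run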
--- .gitlab-ci.yml | 5 ++++- support-files/mini-benchmark.sh | 27 ++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2639fc579d7..8c2b4ae363d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -331,6 +331,7 @@ mysql-test-run-asan: needs: - "fedora-sanitizer: [-DWITH_ASAN=YES]" <<: *mysql-test-run-def + allow_failure: true artifacts: when: always # Also show results when tests fail reports: @@ -489,6 +490,8 @@ mini-benchmark: stage: test dependencies: - fedora + needs: + - fedora script: - ls -la rpm; rm -vf rpm/*.el?.* # Delete artifacts from Centos builds # Don't use cracklib, otherwise the Sysbench user password will be rejected @@ -503,7 +506,7 @@ mini-benchmark: - | mariadb --skip-column-names -e "SELECT @@version, @@version_comment" | tee /tmp/version grep $MARIADB_MAJOR_VERSION /tmp/version || echo "MariaDB didn't install properly" - - yum install -y sysbench procps-ng perf || yum install -y https://kojipkgs.fedoraproject.org//packages/luajit/2.0.4/3.el7/x86_64/luajit-2.0.4-3.el7.x86_64.rpm https://kojipkgs.fedoraproject.org//packages/sysbench/1.0.17/2.el7/x86_64/sysbench-1.0.17-2.el7.x86_64.rpm https://kojipkgs.fedoraproject.org//packages/ck/0.5.2/2.el7/x86_64/ck-0.5.2-2.el7.x86_64.rpm + - yum install -y sysbench procps-ng perf util-linux || yum install -y https://kojipkgs.fedoraproject.org//packages/luajit/2.0.4/3.el7/x86_64/luajit-2.0.4-3.el7.x86_64.rpm https://kojipkgs.fedoraproject.org//packages/sysbench/1.0.17/2.el7/x86_64/sysbench-1.0.17-2.el7.x86_64.rpm https://kojipkgs.fedoraproject.org//packages/ck/0.5.2/2.el7/x86_64/ck-0.5.2-2.el7.x86_64.rpm - /usr/share/mysql/mini-benchmark - cp -av */sysbench-run-*.log */metrics.txt .. # Move files one level down so they can be saved as artifacts artifacts: diff --git a/support-files/mini-benchmark.sh b/support-files/mini-benchmark.sh index d5dd194bc3f..18de6dbec51 100755 --- a/support-files/mini-benchmark.sh +++ b/support-files/mini-benchmark.sh @@ -106,16 +106,29 @@ then done | sort -u > mariadbd-dependencies.txt # shellcheck disable=SC2046 debuginfo-install -y mariadb-server $(cat mariadbd-dependencies.txt) - - echo "Using 'perf' to record performance counters in perf.data files" - PERF="perf record -g --freq=99 --output=perf.data --timestamp-filename --pid=$MARIADB_SERVER_PID --" + + if [ ! $(perf record echo "testing perf" > /dev/null 2>&1) ] + then + echo "perf does not have permission to run on this system. Skipping." + PERF="" + else + echo "Using 'perf' to record performance counters in perf.data files" + PERF="perf record -g --freq=99 --output=perf.data --timestamp-filename --pid=$MARIADB_SERVER_PID --" + fi elif [ -e /usr/bin/perf ] then # If flamegraphs were not requested, log normal perf counters if possible - echo "Using 'perf' to log basic performance counters for benchmark" + + if [ ! $(perf stat echo "testing perf" > /dev/null 2>&1) ] + then + echo "perf does not have permission to run on this system. Skipping." + PERF="" + else + echo "Using 'perf' to log basic performance counters for benchmark" + PERF="perf stat -p $MARIADB_SERVER_PID --" + fi fi - PERF="perf stat -p $MARIADB_SERVER_PID --" # Run sysbench on another CPU if system has more than one available if [ "$(nproc)" -gt 1 ] @@ -133,10 +146,10 @@ uname -a echo echo "Set highest priority for MariaDB Server process ID $MARIADB_SERVER_PID" -renice --priority -20 --pid "$MARIADB_SERVER_PID" +renice --priority -20 --pid "$MARIADB_SERVER_PID" || echo "renice failed. Not setting priority." 
echo "Set CPU affinity 0 for MariaDB Server process ID $MARIADB_SERVER_PID" -taskset -cp 0 "$MARIADB_SERVER_PID" +taskset -cp 0 "$MARIADB_SERVER_PID" || echo "taskset failed. Not setting cpu affinity." mariadb -e " CREATE DATABASE IF NOT EXISTS sbtest; -- cgit v1.2.1 From 231c0eb7a68570da6b5b3741b2f1431c2f43c475 Mon Sep 17 00:00:00 2001 From: Weijun Huang Date: Thu, 2 Mar 2023 21:21:46 +0100 Subject: MDEV-23000: Ensure we get a warning from THD::drop_temporary_table() in case of disk errors --- sql/sql_class.h | 2 +- sql/temporary_tables.cc | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sql/sql_class.h b/sql/sql_class.h index 8c51312ce2a..a5a5f3df44d 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -5050,7 +5050,7 @@ private: bool use_temporary_table(TABLE *table, TABLE **out_table); void close_temporary_table(TABLE *table); bool log_events_and_free_tmp_shares(); - void free_tmp_table_share(TMP_TABLE_SHARE *share, bool delete_table); + bool free_tmp_table_share(TMP_TABLE_SHARE *share, bool delete_table); void free_temporary_table(TABLE *table); bool lock_temporary_tables(); void unlock_temporary_tables(); diff --git a/sql/temporary_tables.cc b/sql/temporary_tables.cc index 8236d157b73..8555aa1a7f5 100644 --- a/sql/temporary_tables.cc +++ b/sql/temporary_tables.cc @@ -670,7 +670,7 @@ bool THD::drop_temporary_table(TABLE *table, bool *is_trans, bool delete_table) temporary_tables->remove(share); /* Free the TABLE_SHARE and/or delete the files. */ - free_tmp_table_share(share, delete_table); + result= free_tmp_table_share(share, delete_table); end: if (locked) @@ -1455,20 +1455,21 @@ bool THD::log_events_and_free_tmp_shares() @param share [IN] TABLE_SHARE to free @param delete_table [IN] Whether to delete the table files? - @return void + @return false Success + true Error */ -void THD::free_tmp_table_share(TMP_TABLE_SHARE *share, bool delete_table) +bool THD::free_tmp_table_share(TMP_TABLE_SHARE *share, bool delete_table) { + bool error= false; DBUG_ENTER("THD::free_tmp_table_share"); if (delete_table) { - rm_temporary_table(share->db_type(), share->path.str); + error= rm_temporary_table(share->db_type(), share->path.str); } free_table_share(share); my_free(share); - - DBUG_VOID_RETURN; + DBUG_RETURN(error); } -- cgit v1.2.1 From 08267ba0c88d2f3ba1bacee9bb9a1e4da921a60a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 9 Mar 2023 16:16:58 +0200 Subject: MDEV-30819 InnoDB fails to start up after downgrading from MariaDB 11.0 While downgrades are not supported and misguided attempts at it could cause serious corruption especially after commit b07920b634f455c39e3650c6163bec2a8ce0ffe0 it might be useful if InnoDB would start up even after an upgrade to MariaDB Server 11.0 or later had removed the change buffer. innodb_change_buffering_update(): Disallow anything else than innodb_change_buffering=none when the change buffer is corrupted. ibuf_init_at_db_start(): Mention a possible downgrade in the corruption error message. If innodb_change_buffering=none, ignore the error but do not initialize ibuf.index. ibuf_free_excess_pages(), ibuf_contract(), ibuf_merge_space(), ibuf_update_max_tablespace_id(), ibuf_delete_for_discarded_space(), ibuf_print(): Check for !ibuf.index. ibuf_check_bitmap_on_import(): Remove some unnecessary code. This function is only accessing change buffer bitmap pages in a data file that is not attached to the rest of the database. 
It is not accessing the change buffer tree itself, hence it does not need any additional mutex protection. This has been tested both by starting up MariaDB Server 10.8 on a 11.0 data directory, and by running ./mtr --big-test while ibuf_init_at_db_start() was tweaked to always fail. --- storage/innobase/handler/ha_innodb.cc | 14 +++++++++++++- storage/innobase/ibuf/ibuf0ibuf.cc | 26 ++++++++++++++------------ storage/innobase/srv/srv0start.cc | 2 +- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index f359f843049..ef7b8e51794 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -19719,10 +19719,22 @@ static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, NULL, NULL, FALSE); #endif /* HAVE_LIBNUMA */ +static void innodb_change_buffering_update(THD *thd, struct st_mysql_sys_var*, + void*, const void *save) +{ + ulong i= *static_cast(save); + if (i != IBUF_USE_NONE && !ibuf.index) + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: The change buffer is corrupted."); + else + innodb_change_buffering= i; +} + static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering, PLUGIN_VAR_RQCMDARG, "Buffer changes to secondary indexes.", - NULL, NULL, IBUF_USE_NONE, &innodb_change_buffering_typelib); + nullptr, innodb_change_buffering_update, + IBUF_USE_NONE, &innodb_change_buffering_typelib); static MYSQL_SYSVAR_UINT(change_buffer_max_size, srv_change_buffer_max_size, diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index dff0ad57057..fc97aabfa13 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -422,8 +422,13 @@ ibuf_init_at_db_start(void) if (!header_page) { err_exit: - sql_print_error("InnoDB: The change buffer is corrupted"); + sql_print_error("InnoDB: The change buffer is corrupted" + " or has been removed on upgrade" + " to MariaDB 11.0 or later"); mtr.commit(); + if (innodb_change_buffering == IBUF_USE_NONE) { + err = DB_SUCCESS; + } return err; } @@ -2002,6 +2007,7 @@ void ibuf_free_excess_pages(void) /*========================*/ { + if (UNIV_UNLIKELY(!ibuf.index)) return; /* Free at most a few pages at a time, so that we do not delay the requested service too much */ @@ -2439,6 +2445,7 @@ will be merged from ibuf trees to the pages read @retval 0 if ibuf.empty */ ulint ibuf_contract() { + if (UNIV_UNLIKELY(!ibuf.index)) return 0; mtr_t mtr; btr_pcur_t pcur; ulint sum_sizes; @@ -2494,6 +2501,7 @@ ibuf_merge_space( /*=============*/ ulint space) /*!< in: tablespace id to merge */ { + if (UNIV_UNLIKELY(!ibuf.index)) return 0; mtr_t mtr; btr_pcur_t pcur; @@ -2952,6 +2960,7 @@ void ibuf_update_max_tablespace_id(void) /*===============================*/ { + if (UNIV_UNLIKELY(!ibuf.index)) return; ulint max_space_id; const rec_t* rec; const byte* field; @@ -2959,7 +2968,7 @@ ibuf_update_max_tablespace_id(void) btr_pcur_t pcur; mtr_t mtr; - ut_a(!dict_table_is_comp(ibuf.index->table)); + ut_ad(!ibuf.index->table->not_redundant()); ibuf_mtr_start(&mtr); @@ -4499,6 +4508,8 @@ in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead. 
@param[in] space missing or to-be-discarded tablespace */ void ibuf_delete_for_discarded_space(ulint space) { + if (UNIV_UNLIKELY(!ibuf.index)) return; + btr_pcur_t pcur; const rec_t* ibuf_rec; mtr_t mtr; @@ -4608,6 +4619,7 @@ ibuf_print( /*=======*/ FILE* file) /*!< in: file where to print */ { + if (UNIV_UNLIKELY(!ibuf.index)) return; mutex_enter(&ibuf_mutex); fprintf(file, @@ -4647,8 +4659,6 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) mtr_t mtr; - mutex_enter(&ibuf_mutex); - /* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat every page_size pages. For example if page_size is 16 KiB, then the two bitmap pages repeat every 16 KiB * 16384 = 256 MiB. In the loop @@ -4657,17 +4667,14 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) for (uint32_t page_no = 0; page_no < size; page_no += physical_size) { if (trx_is_interrupted(trx)) { - mutex_exit(&ibuf_mutex); return(DB_INTERRUPTED); } mtr_start(&mtr); - ibuf_enter(&mtr); buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( page_id_t(space->id, page_no), zip_size, &mtr); if (!bitmap_page) { - mutex_exit(&ibuf_mutex); mtr.commit(); return DB_CORRUPTION; } @@ -4690,7 +4697,6 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) physical_size))); } #endif /* UNIV_DEBUG */ - ibuf_exit(&mtr); mtr_commit(&mtr); continue; } @@ -4704,8 +4710,6 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) bitmap_page->frame, cur_page_id, zip_size, IBUF_BITMAP_IBUF, &mtr)) { - mutex_exit(&ibuf_mutex); - ibuf_exit(&mtr); mtr_commit(&mtr); ib_errf(trx->mysql_thd, @@ -4738,11 +4742,9 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) } } - ibuf_exit(&mtr); mtr_commit(&mtr); } - mutex_exit(&ibuf_mutex); return(DB_SUCCESS); } diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 311a01ed719..f56f2846872 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -2046,7 +2046,7 @@ void innodb_shutdown() || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO); ut_ad(lock_sys.is_initialised() || !srv_was_started); ut_ad(log_sys.is_initialised() || !srv_was_started); - ut_ad(ibuf.index || !srv_was_started + ut_ad(ibuf.index || !innodb_change_buffering || !srv_was_started || srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE); dict_stats_deinit(); -- cgit v1.2.1 From 7d6b3d40085562d6f9f110f4ba921cf061548844 Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Mon, 6 Mar 2023 19:09:13 +0300 Subject: MDEV-30775 Performance regression in fil_space_t::try_to_close() introduced in MDEV-23855 fil_node_open_file_low() tries to close files from the top of fil_system.space_list if the number of opened files is exceeded. It invokes fil_space_t::try_to_close(), which iterates the list searching for the first opened space. Then it just closes the space, leaving it in the same position in fil_system.space_list. On heavy files opening, like during 'SHOW TABLE STATUS ...' execution, if the number of opened files limit is reached, fil_space_t::try_to_close() iterates more and more closed spaces before reaching any opened space for each fil_node_open_file_low() call. What causes performance regression if the number of spaces is big enough. The fix is to keep opened spaces at the top of fil_system.space_list, and move closed files at the end of the list. For this purpose fil_space_t::space_list_last_opened pointer is introduced. 
It points to the last inserted opened space in fil_space_t::space_list. When space is opened, it's inserted to the position just after the pointer points to in fil_space_t::space_list to preserve the logic, inroduced in MDEV-23855. Any closed space is added to the end of fil_space_t::space_list. As opened spaces are located at the top of fil_space_t::space_list, fil_space_t::try_to_close() finds opened space faster. There can be the case when opened and closed spaces are mixed in fil_space_t::space_list if fil_system.freeze_space_list was set during fil_node_open_file_low() execution. But this should not cause any error, as fil_space_t::try_to_close() still iterates spaces in the list. There is no need in any test case for the fix, as it does not change any functionality, but just fixes performance regression. --- extra/mariabackup/xtrabackup.cc | 7 +++-- storage/innobase/fil/fil0fil.cc | 33 ++++++++++++++++------- storage/innobase/include/fil0fil.h | 54 +++++++++++++++++++++++++++++++++++--- storage/innobase/srv/srv0start.cc | 3 ++- 4 files changed, 81 insertions(+), 16 deletions(-) diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 522072a05fd..c57c2685c94 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -3361,7 +3361,9 @@ static void xb_load_single_table_tablespace(const char *dirname, if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) { space = fil_space_t::create( name, file->space_id(), file->flags(), - FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */); + FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */, + FIL_ENCRYPTION_DEFAULT, + file->handle() != OS_FILE_CLOSED); ut_a(space != NULL); space->add(file->filepath(), @@ -5242,7 +5244,8 @@ exit: ut_ad(fil_space_t::physical_size(flags) == info.page_size); if (fil_space_t::create(dest_space_name, info.space_id, flags, - FIL_TYPE_TABLESPACE, 0)) { + FIL_TYPE_TABLESPACE, 0, FIL_ENCRYPTION_DEFAULT, + true)) { *success = xb_space_create_file(real_name, info.space_id, flags, &file); } else { diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 45786e39696..9b6afbeb793 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -119,6 +119,9 @@ bool fil_space_t::try_to_close(bool print_info) } node->close(); + + fil_system.move_closed_last_to_space_list(node->space); + return true; } @@ -409,13 +412,7 @@ static bool fil_node_open_file_low(fil_node_t *node) ut_ad(node->is_open()); - if (UNIV_LIKELY(!fil_system.freeze_space_list)) - { - /* Move the file last in fil_system.space_list, so that - fil_space_t::try_to_close() should close it as a last resort. 
*/ - UT_LIST_REMOVE(fil_system.space_list, node->space); - UT_LIST_ADD_LAST(fil_system.space_list, node->space); - } + fil_system.move_opened_last_to_space_list(node->space); fil_system.n_open++; return true; @@ -809,6 +806,8 @@ std::vector fil_system_t::detach(fil_space_t *space, space->is_in_default_encrypt= false; default_encrypt_tables.remove(*space); } + if (space_list_last_opened == space) + space_list_last_opened = UT_LIST_GET_PREV(space_list, space); UT_LIST_REMOVE(space_list, space); if (space == sys_space) sys_space= nullptr; @@ -933,12 +932,14 @@ fil_space_free( @param purpose tablespace purpose @param crypt_data encryption information @param mode encryption mode +@param opened true if space files are opened @return pointer to created tablespace, to be filled in with add() @retval nullptr on failure (such as when the same tablespace exists) */ fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags, fil_type_t purpose, fil_space_crypt_t *crypt_data, - fil_encryption_t mode) + fil_encryption_t mode, + bool opened) { fil_space_t* space; @@ -1004,7 +1005,10 @@ fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags, HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space); - UT_LIST_ADD_LAST(fil_system.space_list, space); + if (opened) + fil_system.add_opened_last_to_space_list(space); + else + UT_LIST_ADD_LAST(fil_system.space_list, space); switch (id) { case 0: @@ -1334,6 +1338,15 @@ void fil_system_t::close() #endif /* __linux__ */ } +void fil_system_t::add_opened_last_to_space_list(fil_space_t *space) +{ + if (UNIV_LIKELY(space_list_last_opened != nullptr)) + UT_LIST_INSERT_AFTER(space_list, space_list_last_opened, space); + else + UT_LIST_ADD_FIRST(space_list, space); + space_list_last_opened= space; +} + /** Extend all open data files to the recovered size */ ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() { @@ -2412,7 +2425,7 @@ err_exit: if (fil_space_t* space = fil_space_t::create(name, space_id, flags, FIL_TYPE_TABLESPACE, - crypt_data, mode)) { + crypt_data, mode, true)) { space->punch_hole = punch_hole; fil_node_t* node = space->add(path, file, size, false, true); mtr_t mtr; diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index af941e359f8..b124f5c6358 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -906,11 +906,13 @@ public: @param purpose tablespace purpose @param crypt_data encryption information @param mode encryption mode + @param opened true if space files are opened @return pointer to created tablespace, to be filled in with add() @retval nullptr on failure (such as when the same tablespace exists) */ static fil_space_t *create(const char *name, ulint id, ulint flags, fil_type_t purpose, fil_space_crypt_t *crypt_data, - fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT); + fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT, + bool opened= false); MY_ATTRIBUTE((warn_unused_result)) /** Acquire a tablespace reference. @@ -1361,6 +1363,11 @@ struct fil_system_t { private: bool m_initialised; + + /** Points to the last opened space in space_list. Protected with + fil_system.mutex. 
*/ + fil_space_t *space_list_last_opened= nullptr; + #ifdef __linux__ /** available block devices that reside on non-rotational storage */ std::vector ssd; @@ -1405,8 +1412,11 @@ public: /** nonzero if fil_node_open_file_low() should avoid moving the tablespace to the end of space_list, for FIFO policy of try_to_close() */ ulint freeze_space_list; - UT_LIST_BASE_NODE_T(fil_space_t) space_list; - /*!< list of all file spaces */ + + /** List of all file spaces, opened spaces should be at the top of the list + to optimize try_to_close() execution. Protected with fil_system.mutex. */ + UT_LIST_BASE_NODE_T(fil_space_t) space_list; + UT_LIST_BASE_NODE_T(fil_space_t) named_spaces; /*!< list of all file spaces for which a FILE_MODIFY @@ -1422,6 +1432,44 @@ public: has issued a warning about potential space_id reuse */ + /** Add the file to the end of opened spaces list in + fil_system.space_list, so that fil_space_t::try_to_close() should close + it as a last resort. + @param space space to add */ + void add_opened_last_to_space_list(fil_space_t *space); + + /** Move the file to the end of opened spaces list in + fil_system.space_list, so that fil_space_t::try_to_close() should close + it as a last resort. + @param space space to move */ + void move_opened_last_to_space_list(fil_space_t *space) + { + /* In the case when several files of the same space are added in a + row, there is no need to remove and add a space to the same position + in space_list. It can be for system or temporary tablespaces. */ + if (freeze_space_list || space_list_last_opened == space) + return; + + UT_LIST_REMOVE(space_list, space); + + add_opened_last_to_space_list(space); + } + + /** Move closed file last in fil_system.space_list, so that + fil_space_t::try_to_close() iterates opened files first in FIFO order, + i.e. first opened, first closed. + @param space space to move */ + void move_closed_last_to_space_list(fil_space_t *space) + { + if (UNIV_UNLIKELY(freeze_space_list)) + return; + + if (space_list_last_opened == space) + space_list_last_opened= UT_LIST_GET_PREV(space_list, space); + UT_LIST_REMOVE(space_list, space); + UT_LIST_ADD_LAST(space_list, space); + } + /** Return the next tablespace from default_encrypt_tables list. @param space previous tablespace (nullptr to start from the start) @param recheck whether the removal condition needs to be rechecked after diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index f56f2846872..cc2bb699fd9 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -563,7 +563,8 @@ err_exit: fil_set_max_space_id_if_bigger(space_id); fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags, - FIL_TYPE_TABLESPACE, NULL); + FIL_TYPE_TABLESPACE, NULL, + FIL_ENCRYPTION_DEFAULT, true); ut_a(fil_validate()); ut_a(space); -- cgit v1.2.1 From 47036387756d465c68ac4360eb031e78f48d9e58 Mon Sep 17 00:00:00 2001 From: Alexander Barkov Date: Tue, 14 Mar 2023 05:29:04 +0400 Subject: MDEV-30805 SIGSEGV in my_convert and UBSAN: member access within null pointer of type 'const struct MY_CHARSET_HANDLER' in my_convert Type_handler::partition_field_append_value() erroneously passed the address of my_collation_contextually_typed_binary to conversion functions copy_and_convert() and my_convert(). 
This happened because generate_partition_syntax_for_frm() was called from
mysql_create_frm_image() at the stage when the fields in List<Create_field>
can still contain unresolved contextual collations, like "binary" in the
reported crash scenario:

  ALTER TABLE t CHANGE COLUMN a a CHAR BINARY;

Fix:

1. Splitting mysql_prepare_create_table() into two parts:
   - mysql_prepare_create_table_stage1() iterates through List<Create_field>
     and calls Create_field::prepare_stage1(), which performs basic attribute
     initialization, including contextual collation resolution.
   - mysql_prepare_create_table_finalize() - the rest of the old
     mysql_prepare_create_table() code.

2. Changing mysql_create_frm_image(). It now calls:
   - mysql_prepare_create_table_stage1() in the very beginning, before the
     partition-related code.
   - mysql_prepare_create_table_finalize() in the end, instead of the old
     mysql_prepare_create_table() call.

3. Adding mysql_prepare_create_table() as a wrapper for the two calls:
     mysql_prepare_create_table_stage1() ||
     mysql_prepare_create_table_finalize()
   so the code stays unchanged in the other places where
   mysql_prepare_create_table() was used.

4. Changing the prototype of Type_handler::Column_definition_prepare_stage1():
   Removing arguments:
   - handler *file
   - ulonglong table_flags
   Adding a new argument instead:
   - column_definition_type_t type
   This makes it possible to call Column_definition_prepare_stage1(), and
   therefore mysql_prepare_create_table_stage1(), before instantiation of a
   handler. This simplifies the code: in the case of a partitioned table,
   mysql_create_frm_image() creates a handler of the underlying partition
   first, then frees it and creates a ha_partition instance instead.
   Before the fix, mysql_prepare_create_table() was called with the final
   (ha_partition) handler.

5. Moving the parts of Column_definition_prepare_stage1() which need a pointer
   to handler and table_flags to Column_definition_prepare_stage2().
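As an illustration of items 1-3, here is a minimal self-contained C++ sketch of
the two-stage pattern. It is hypothetical stand-in code, not the actual
sql_table.cc functions: Column, prepare_stage1(), prepare_finalize() and
prepare_create_table() are invented names, and the "latin1_bin" value simply
mirrors the expected result in the new partition_charset test.

  // Stage 1 resolves per-column attributes such as a contextually typed
  // "binary" collation, so anything running between the two stages (here a
  // toy partition-syntax printer standing in for
  // generate_partition_syntax_for_frm()) only ever sees resolved metadata.
  // Stage 2 stands in for the handler-dependent finalization.
  #include <iostream>
  #include <string>
  #include <vector>

  struct Column { std::string name; std::string collation; };

  // Stage 1: resolve attributes that depend only on the column definition.
  static bool prepare_stage1(std::vector<Column> &cols)
  {
    for (Column &c : cols)
      if (c.collation == "binary")      // contextually typed collation
        c.collation= "latin1_bin";      // resolved against the column charset
    return false;                       // false = success, as in the server
  }

  // Stage 2: placeholder for work that needs the final storage engine handler.
  static bool prepare_finalize(const std::vector<Column> &cols)
  {
    return cols.empty();                // e.g. "a table must have columns"
  }

  // The old entry point stays as a thin wrapper (item 3 of the fix).
  bool prepare_create_table(std::vector<Column> &cols)
  {
    return prepare_stage1(cols) || prepare_finalize(cols);
  }

  int main()
  {
    std::vector<Column> cols{{"a", "binary"}};

    if (prepare_stage1(cols))           // 1) resolve collations first
      return 1;

    // 2) partition syntax can now be generated safely
    std::cout << "PARTITION BY RANGE COLUMNS(`" << cols[0].name
              << "`) /* collation " << cols[0].collation << " */\n";

    return prepare_finalize(cols) ? 1 : 0;  // 3) handler-dependent finalization
  }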
--- mysql-test/main/partition_charset.result | 21 +++ mysql-test/main/partition_charset.test | 20 +++ mysql-test/main/partition_utf8-debug.result | 2 +- sql/field.h | 10 +- sql/json_table.cc | 5 +- sql/sql_table.cc | 221 ++++++++++++++++++---------- sql/sql_type.cc | 42 +++--- sql/sql_type.h | 27 ++-- sql/sql_type_fixedbin.h | 4 +- sql/sql_type_geom.cc | 3 +- sql/sql_type_geom.h | 3 +- 11 files changed, 221 insertions(+), 137 deletions(-) diff --git a/mysql-test/main/partition_charset.result b/mysql-test/main/partition_charset.result index f8f75e8bee7..a0019dd8fc3 100644 --- a/mysql-test/main/partition_charset.result +++ b/mysql-test/main/partition_charset.result @@ -20,3 +20,24 @@ create table t1 (a varchar(1), primary key (a)) partition by list (ascii(a)) (partition p1 values in (65)); ERROR HY000: This partition function is not allowed +# +# Start of 10.9 tests +# +# +# MDEV-30805 SIGSEGV in my_convert and UBSAN: member access within null pointer of type 'const struct MY_CHARSET_HANDLER' in my_convert +# +CREATE TABLE t1 (a CHAR CHARACTER SET ucs2) +PARTITION BY RANGE COLUMNS (a) +(PARTITION p0 VALUES LESS THAN ('a')); +ALTER TABLE t1 CHANGE COLUMN a a CHAR BINARY; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` char(1) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY RANGE COLUMNS(`a`) +(PARTITION `p0` VALUES LESS THAN ('a') ENGINE = MyISAM) +DROP TABLE t1; +# +# End of 10.9 tests +# diff --git a/mysql-test/main/partition_charset.test b/mysql-test/main/partition_charset.test index 6842e5268fa..b8a17ce4fca 100644 --- a/mysql-test/main/partition_charset.test +++ b/mysql-test/main/partition_charset.test @@ -27,3 +27,23 @@ partition by list (ascii(a)) #insert into t1 values ('A'); #replace into t1 values ('A'); #drop table t1; + +--echo # +--echo # Start of 10.9 tests +--echo # + +--echo # +--echo # MDEV-30805 SIGSEGV in my_convert and UBSAN: member access within null pointer of type 'const struct MY_CHARSET_HANDLER' in my_convert +--echo # + +CREATE TABLE t1 (a CHAR CHARACTER SET ucs2) + PARTITION BY RANGE COLUMNS (a) + (PARTITION p0 VALUES LESS THAN ('a')); +ALTER TABLE t1 CHANGE COLUMN a a CHAR BINARY; +SHOW CREATE TABLE t1; +DROP TABLE t1; + +--echo # +--echo # End of 10.9 tests +--echo # + diff --git a/mysql-test/main/partition_utf8-debug.result b/mysql-test/main/partition_utf8-debug.result index db1396198ab..2b4982a3dcc 100644 --- a/mysql-test/main/partition_utf8-debug.result +++ b/mysql-test/main/partition_utf8-debug.result @@ -77,7 +77,7 @@ CREATE OR REPLACE TABLE t1 (a DATE) CHARACTER SET utf8 PARTITION BY LIST COLUMNS (a) (PARTITION p0 VALUES IN (FROM_DAYS(100))); Warnings: Note 1003 PARTITION BY LIST COLUMNS(`a`) -(PARTITION `p0` VALUES IN (_utf8mb3 0x303030302d30302d3030) ENGINE = MyISAM) +(PARTITION `p0` VALUES IN (_latin1 0x303030302d30302d3030) ENGINE = MyISAM) SELECT PARTITION_DESCRIPTION FROM INFORMATION_SCHEMA.PARTITIONS WHERE TABLE_NAME='t1'; PARTITION_DESCRIPTION '0000-00-00' diff --git a/sql/field.h b/sql/field.h index 13d80099124..d4b59a88f59 100644 --- a/sql/field.h +++ b/sql/field.h @@ -5377,7 +5377,7 @@ public: bool sp_prepare_create_field(THD *thd, MEM_ROOT *mem_root); bool prepare_stage1(THD *thd, MEM_ROOT *mem_root, - handler *file, ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr); void prepare_stage1_simple(CHARSET_INFO *cs) { @@ -5385,11 +5385,9 @@ public: create_length_to_internal_length_simple(); 
} bool prepare_stage1_typelib(THD *thd, MEM_ROOT *mem_root, - handler *file, ulonglong table_flags); - bool prepare_stage1_string(THD *thd, MEM_ROOT *mem_root, - handler *file, ulonglong table_flags); - bool prepare_stage1_bit(THD *thd, MEM_ROOT *mem_root, - handler *file, ulonglong table_flags); + column_definition_type_t deftype); + bool prepare_stage1_string(THD *thd, MEM_ROOT *mem_root); + bool prepare_stage1_bit(THD *thd, MEM_ROOT *mem_root); bool bulk_alter(const Column_derived_attributes *derived_attr, const Column_bulk_alter_attributes *bulk_attr) diff --git a/sql/json_table.cc b/sql/json_table.cc index 05ee83bd3d8..4f3cfb6b090 100644 --- a/sql/json_table.cc +++ b/sql/json_table.cc @@ -787,8 +787,9 @@ bool Create_json_table::add_json_table_fields(THD *thd, TABLE *table, */ sql_f->length= sql_f->char_length; - if (sql_f->prepare_stage1(thd, thd->mem_root, table->file, - table->file->ha_table_flags(), &da)) + if (sql_f->prepare_stage1(thd, thd->mem_root, + COLUMN_DEFINITION_TABLE_FIELD, + &da)) goto err_exit; while ((jc2= it2++) != jc) diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 5fc564e73b8..fe808178a71 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -2395,17 +2395,16 @@ static void check_duplicate_key(THD *thd, const Key *key, const KEY *key_info, bool Column_definition::prepare_stage1_typelib(THD *thd, MEM_ROOT *mem_root, - handler *file, - ulonglong table_flags) + column_definition_type_t deftype) { /* Pass the last parameter to prepare_interval_field() as follows: - - If we are preparing for an SP variable (file is NULL), we pass "false", + - If we are preparing for an SP variable, we pass "false", to force allocation and full copying of TYPELIB values on the given mem_root, even if no character set conversion is needed. This is needed because a life cycle of an SP variable is longer than the current query. - - If we are preparing for a CREATE TABLE, (file != NULL), we pass "true". + - If we are preparing for a CREATE TABLE, we pass "true". This will create the typelib in runtime memory - we will free the occupied memory at the same time when we free this sql_field -- at the end of execution. @@ -2413,11 +2412,11 @@ bool Column_definition::prepare_stage1_typelib(THD *thd, values in "interval" in cases when no character conversion is needed, to avoid extra copying. */ - if (prepare_interval_field(mem_root, file != NULL)) + if (prepare_interval_field(mem_root, + deftype == COLUMN_DEFINITION_TABLE_FIELD)) return true; // E.g. wrong values with commas: SET('a,b') create_length_to_internal_length_typelib(); - DBUG_ASSERT(file || !default_value); // SP variables have no default_value if (default_value && default_value->expr->basic_const_item()) { if ((charset != default_value->expr->collation.collation && @@ -2430,14 +2429,11 @@ bool Column_definition::prepare_stage1_typelib(THD *thd, bool Column_definition::prepare_stage1_string(THD *thd, - MEM_ROOT *mem_root, - handler *file, - ulonglong table_flags) + MEM_ROOT *mem_root) { create_length_to_internal_length_string(); if (prepare_blob_field(thd)) return true; - DBUG_ASSERT(file || !default_value); // SP variables have no default_value /* Convert the default value from client character set into the column character set if necessary. 
@@ -2457,13 +2453,9 @@ bool Column_definition::prepare_stage1_string(THD *thd, bool Column_definition::prepare_stage1_bit(THD *thd, - MEM_ROOT *mem_root, - handler *file, - ulonglong table_flags) + MEM_ROOT *mem_root) { pack_flag= FIELDFLAG_NUMBER; - if (!(table_flags & HA_CAN_BIT_FIELD)) - pack_flag|= FIELDFLAG_TREAT_BIT_AS_CHAR; create_length_to_internal_length_bit(); return false; } @@ -2471,14 +2463,15 @@ bool Column_definition::prepare_stage1_bit(THD *thd, bool Column_definition::prepare_stage1(THD *thd, MEM_ROOT *mem_root, - handler *file, - ulonglong table_flags, + column_definition_type_t deftype, const Column_derived_attributes *derived_attr) { + // SP variables have no default_value + DBUG_ASSERT(deftype == COLUMN_DEFINITION_TABLE_FIELD || !default_value); + return type_handler()->Column_definition_prepare_stage1(thd, mem_root, - this, file, - table_flags, + this, deftype, derived_attr); } @@ -2702,10 +2695,77 @@ key_add_part_check_null(const handler *file, KEY *key_info, /* - Preparation for table creation + Prepare for a table creation. + Stage 1: prepare the field list. +*/ +static bool mysql_prepare_create_table_stage1(THD *thd, + HA_CREATE_INFO *create_info, + Alter_info *alter_info) +{ + DBUG_ENTER("mysql_prepare_create_table_stage1"); + const Column_derived_attributes dattr(create_info->default_table_charset); + const Column_bulk_alter_attributes + battr(create_info->alter_table_convert_to_charset); + Create_field *sql_field; + List_iterator_fast it(alter_info->create_list); + + DBUG_EXECUTE_IF("test_pseudo_invisible",{ + mysql_add_invisible_field(thd, &alter_info->create_list, + "invisible", &type_handler_slong, INVISIBLE_SYSTEM, + new (thd->mem_root)Item_int(thd, 9)); + }); + DBUG_EXECUTE_IF("test_completely_invisible",{ + mysql_add_invisible_field(thd, &alter_info->create_list, + "invisible", &type_handler_slong, INVISIBLE_FULL, + new (thd->mem_root)Item_int(thd, 9)); + }); + DBUG_EXECUTE_IF("test_invisible_index",{ + LEX_CSTRING temp; + temp.str= "invisible"; + temp.length= strlen("invisible"); + mysql_add_invisible_index(thd, &alter_info->key_list + , &temp, Key::MULTIPLE); + }); + + + for ( ; (sql_field=it++) ; ) + { + /* Virtual fields are always NULL */ + if (sql_field->vcol_info) + sql_field->flags&= ~NOT_NULL_FLAG; + + /* + Initialize length from its original value (number of characters), + which was set in the parser. This is necessary if we're + executing a prepared statement for the second time. + */ + sql_field->length= sql_field->char_length; + + if (sql_field->bulk_alter(&dattr, &battr)) + DBUG_RETURN(true); + + if (sql_field->prepare_stage1(thd, thd->mem_root, + COLUMN_DEFINITION_TABLE_FIELD, + &dattr)) + DBUG_RETURN(true); + + DBUG_ASSERT(sql_field->charset); + + if (check_column_name(sql_field->field_name.str)) + { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), sql_field->field_name.str); + DBUG_RETURN(TRUE); + } + } + DBUG_RETURN(false); +} + + +/* + Preparation for table creation, final stage. SYNOPSIS - mysql_prepare_create_table() + mysql_prepare_create_table_finalize() thd Thread object. create_info Create information (like MAX_ROWS). 
alter_info List of columns and indexes to create @@ -2728,11 +2788,12 @@ key_add_part_check_null(const handler *file, KEY *key_info, */ static int -mysql_prepare_create_table(THD *thd, HA_CREATE_INFO *create_info, - Alter_info *alter_info, uint *db_options, - handler *file, KEY **key_info_buffer, - uint *key_count, int create_table_mode, - const LEX_CSTRING db, const LEX_CSTRING table_name) +mysql_prepare_create_table_finalize(THD *thd, HA_CREATE_INFO *create_info, + Alter_info *alter_info, uint *db_options, + handler *file, KEY **key_info_buffer, + uint *key_count, int create_table_mode, + const LEX_CSTRING db, + const LEX_CSTRING table_name) { const char *key_name; Create_field *sql_field,*dup_field; @@ -2748,28 +2809,8 @@ mysql_prepare_create_table(THD *thd, HA_CREATE_INFO *create_info, bool tmp_table= create_table_mode == C_ALTER_TABLE; const bool create_simple= thd->lex->create_simple(); bool is_hash_field_needed= false; - const Column_derived_attributes dattr(create_info->default_table_charset); - const Column_bulk_alter_attributes - battr(create_info->alter_table_convert_to_charset); DBUG_ENTER("mysql_prepare_create_table"); - DBUG_EXECUTE_IF("test_pseudo_invisible",{ - mysql_add_invisible_field(thd, &alter_info->create_list, - "invisible", &type_handler_slong, INVISIBLE_SYSTEM, - new (thd->mem_root)Item_int(thd, 9)); - }); - DBUG_EXECUTE_IF("test_completely_invisible",{ - mysql_add_invisible_field(thd, &alter_info->create_list, - "invisible", &type_handler_slong, INVISIBLE_FULL, - new (thd->mem_root)Item_int(thd, 9)); - }); - DBUG_EXECUTE_IF("test_invisible_index",{ - LEX_CSTRING temp; - temp.str= "invisible"; - temp.length= strlen("invisible"); - mysql_add_invisible_index(thd, &alter_info->key_list - , &temp, Key::MULTIPLE); - }); LEX_CSTRING* connect_string = &create_info->connect_string; if (connect_string->length != 0 && connect_string->length > CONNECT_STRING_MAXLEN && @@ -2804,42 +2845,16 @@ mysql_prepare_create_table(THD *thd, HA_CREATE_INFO *create_info, DBUG_RETURN(TRUE); } + for (field_no=0; (sql_field=it++) ; field_no++) { - /* Virtual fields are always NULL */ - if (sql_field->vcol_info) - sql_field->flags&= ~NOT_NULL_FLAG; - - /* - Initialize length from its original value (number of characters), - which was set in the parser. This is necessary if we're - executing a prepared statement for the second time. - */ - sql_field->length= sql_field->char_length; - - if (sql_field->bulk_alter(&dattr, &battr)) - DBUG_RETURN(true); - - if (sql_field->prepare_stage1(thd, thd->mem_root, - file, file->ha_table_flags(), - &dattr)) - DBUG_RETURN(true); - - DBUG_ASSERT(sql_field->charset); + if (!(sql_field->flags & NOT_NULL_FLAG)) + null_fields++; if (sql_field->real_field_type() == MYSQL_TYPE_BIT && file->ha_table_flags() & HA_CAN_BIT_FIELD) total_uneven_bit_length+= sql_field->length & 7; - if (!(sql_field->flags & NOT_NULL_FLAG)) - null_fields++; - - if (check_column_name(sql_field->field_name.str)) - { - my_error(ER_WRONG_COLUMN_NAME, MYF(0), sql_field->field_name.str); - DBUG_RETURN(TRUE); - } - /* Check if we have used the same field name before */ for (dup_no=0; (dup_field=it2++) != sql_field; dup_no++) { @@ -3827,6 +3842,49 @@ without_overlaps_err: DBUG_RETURN(FALSE); } + +/* + Preparation for table creation + + SYNOPSIS + mysql_prepare_create_table() + thd Thread object. + create_info Create information (like MAX_ROWS). + alter_info List of columns and indexes to create + db_options INOUT Table options (like HA_OPTION_PACK_RECORD). + file The handler for the new table. 
+ key_info_buffer OUT An array of KEY structs for the indexes. + key_count OUT The number of elements in the array. + create_table_mode C_ORDINARY_CREATE, C_ALTER_TABLE, + C_CREATE_SELECT, C_ASSISTED_DISCOVERY + + DESCRIPTION + Prepares the table and key structures for table creation. + + NOTES + sets create_info->varchar if the table has a varchar + + RETURN VALUES + FALSE OK + TRUE error +*/ + +static int +mysql_prepare_create_table(THD *thd, HA_CREATE_INFO *create_info, + Alter_info *alter_info, uint *db_options, + handler *file, KEY **key_info_buffer, + uint *key_count, int create_table_mode, + const LEX_CSTRING db, + const LEX_CSTRING table_name) +{ + return mysql_prepare_create_table_stage1(thd, create_info, alter_info) || + mysql_prepare_create_table_finalize(thd, create_info, alter_info, + db_options, file, key_info_buffer, + key_count, create_table_mode, + db, table_name); +} + + /** check comment length of table, column, index and partition @@ -3955,7 +4013,8 @@ bool Column_definition::prepare_blob_field(THD *thd) bool Column_definition::sp_prepare_create_field(THD *thd, MEM_ROOT *mem_root) { const Column_derived_attributes dattr(thd->variables.collation_database); - return prepare_stage1(thd, mem_root, NULL, HA_CAN_GEOMETRY, &dattr) || + return prepare_stage1(thd, mem_root, + COLUMN_DEFINITION_ROUTINE_LOCAL, &dattr) || prepare_stage2(NULL, HA_CAN_GEOMETRY); } @@ -4050,6 +4109,9 @@ handler *mysql_create_frm_image(THD *thd, const LEX_CSTRING &db, DBUG_RETURN(NULL); } + if (mysql_prepare_create_table_stage1(thd, create_info, alter_info)) + DBUG_RETURN(NULL); + db_options= create_info->table_options_with_row_type(); if (unlikely(!(file= get_new_handler((TABLE_SHARE*) 0, thd->mem_root, @@ -4266,9 +4328,10 @@ handler *mysql_create_frm_image(THD *thd, const LEX_CSTRING &db, } #endif - if (mysql_prepare_create_table(thd, create_info, alter_info, &db_options, - file, key_info, key_count, - create_table_mode, db, table_name)) + if (mysql_prepare_create_table_finalize(thd, create_info, + alter_info, &db_options, + file, key_info, key_count, + create_table_mode, db, table_name)) goto err; create_info->table_options=db_options; diff --git a/sql/sql_type.cc b/sql/sql_type.cc index 1c433043db7..0a51f5a7524 100644 --- a/sql/sql_type.cc +++ b/sql/sql_type.cc @@ -3008,8 +3008,7 @@ bool Type_handler:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const @@ -3022,8 +3021,7 @@ bool Type_handler_null:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const @@ -3037,8 +3035,7 @@ bool Type_handler_row:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const @@ -3052,8 +3049,7 @@ bool Type_handler_temporal_result:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const @@ -3067,8 +3063,7 @@ bool Type_handler_numeric:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t 
type, const Column_derived_attributes *derived_attr) const @@ -3081,8 +3076,7 @@ bool Type_handler_newdecimal:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const @@ -3096,28 +3090,26 @@ bool Type_handler_bit:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const { def->charset= &my_charset_numeric; - return def->prepare_stage1_bit(thd, mem_root, file, table_flags); + return def->prepare_stage1_bit(thd, mem_root); } bool Type_handler_typelib:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const { return def->prepare_charset_for_string(derived_attr) || - def->prepare_stage1_typelib(thd, mem_root, file, table_flags); + def->prepare_stage1_typelib(thd, mem_root, type); } @@ -3125,14 +3117,13 @@ bool Type_handler_string_result:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const { return def->prepare_charset_for_string(derived_attr) || - def->prepare_stage1_string(thd, mem_root, file, table_flags); + def->prepare_stage1_string(thd, mem_root); } @@ -3343,10 +3334,11 @@ bool Type_handler_bit:: handler *file, ulonglong table_flags) const { - /* - We have sql_field->pack_flag already set here, see - mysql_prepare_create_table(). 
- */ + if (!(table_flags & HA_CAN_BIT_FIELD)) + { + def->pack_flag|= FIELDFLAG_TREAT_BIT_AS_CHAR; + def->create_length_to_internal_length_bit(); + } return false; } diff --git a/sql/sql_type.h b/sql/sql_type.h index 741e0c9bb96..d931c7ffb6d 100644 --- a/sql/sql_type.h +++ b/sql/sql_type.h @@ -3962,8 +3962,7 @@ public: virtual bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const; @@ -4441,8 +4440,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -4756,8 +4754,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -5310,8 +5307,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -5414,8 +5410,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -5947,8 +5942,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -6791,8 +6785,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -6849,8 +6842,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; @@ -7286,8 +7278,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; diff --git a/sql/sql_type_fixedbin.h b/sql/sql_type_fixedbin.h index 223bf2cf398..c1be1c9ccba 100644 --- a/sql/sql_type_fixedbin.h +++ b/sql/sql_type_fixedbin.h @@ -382,8 +382,8 @@ public: } bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, - Column_definition *def, handler *file, - ulonglong table_flags, + Column_definition *def, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override { diff --git a/sql/sql_type_geom.cc b/sql/sql_type_geom.cc index 5732ae47217..3bdc34b4d65 100644 --- a/sql/sql_type_geom.cc +++ b/sql/sql_type_geom.cc @@ -280,8 +280,7 @@ bool Type_handler_geometry:: Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *def, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const { diff --git a/sql/sql_type_geom.h b/sql/sql_type_geom.h index 
3bc25808bc3..db951297519 100644 --- a/sql/sql_type_geom.h +++ b/sql/sql_type_geom.h @@ -108,8 +108,7 @@ public: bool Column_definition_prepare_stage1(THD *thd, MEM_ROOT *mem_root, Column_definition *c, - handler *file, - ulonglong table_flags, + column_definition_type_t type, const Column_derived_attributes *derived_attr) const override; -- cgit v1.2.1 From e97560eac0321d43e98033f00fb0bd4f008bc9e9 Mon Sep 17 00:00:00 2001 From: Igor Babaev Date: Mon, 30 Jan 2023 19:42:27 -0800 Subject: MDEV-28958 Crash when checking whether condition can be pushed into view Do not set any flags in the items for constant subformulas TRUE/FALSE when checking pushability of a formula into a view. Occurrences of these subformulas can be ignored when checking pushability of the formula. At the same time the items used for these constants became immutable starting from version 10.7. Approved by Oleksandr Byelkin --- mysql-test/main/derived_cond_pushdown.result | 17 +++++++++++++++++ mysql-test/main/derived_cond_pushdown.test | 21 +++++++++++++++++++++ sql/item.h | 23 ++++++++++++++++------- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/mysql-test/main/derived_cond_pushdown.result b/mysql-test/main/derived_cond_pushdown.result index 8c936f3374a..4a0021828ed 100644 --- a/mysql-test/main/derived_cond_pushdown.result +++ b/mysql-test/main/derived_cond_pushdown.result @@ -20697,3 +20697,20 @@ id select_type table type possible_keys key key_len ref rows Extra drop view v1; drop table t1; # End of 10.4 tests +# +# MDEV-28958: condition pushable into view after simplification +# contains constant TRUE/FALSE as subformula +# +create table t1 (c1 int); +insert into t1 values (3), (7), (1), (3), (1), (3); +create table t2 (c2 int); +insert into t2 values (3), (5), (7), (3); +create view v1 as select * from t1 group by c1; +create view v2 as select c1 as a, c2 as b from v1,t2 where c1=c2; +select * from v2 group by a,b having a=b or b > a+10; +a b +3 3 +7 7 +drop view v1,v2; +drop table t1,t2; +# End of 10.7 tests diff --git a/mysql-test/main/derived_cond_pushdown.test b/mysql-test/main/derived_cond_pushdown.test index 17115c143ac..dc454bf80de 100644 --- a/mysql-test/main/derived_cond_pushdown.test +++ b/mysql-test/main/derived_cond_pushdown.test @@ -3944,3 +3944,24 @@ drop view v1; drop table t1; --echo # End of 10.4 tests + +--echo # +--echo # MDEV-28958: condition pushable into view after simplification +--echo # contains constant TRUE/FALSE as subformula +--echo # + +create table t1 (c1 int); +insert into t1 values (3), (7), (1), (3), (1), (3); + +create table t2 (c2 int); +insert into t2 values (3), (5), (7), (3); + +create view v1 as select * from t1 group by c1; +create view v2 as select c1 as a, c2 as b from v1,t2 where c1=c2; + +select * from v2 group by a,b having a=b or b > a+10; + +drop view v1,v2; +drop table t1,t2; + +--echo # End of 10.7 tests diff --git a/sql/item.h b/sql/item.h index cf0fe455ef4..a90cb01c80b 100644 --- a/sql/item.h +++ b/sql/item.h @@ -2687,18 +2687,27 @@ public: void register_in(THD *thd); bool depends_only_on(table_map view_map) - { return marker & MARKER_FULL_EXTRACTION; } - int get_extraction_flag() const - { return marker & MARKER_EXTRACTION_MASK; } + { return get_extraction_flag() & MARKER_FULL_EXTRACTION; } + int get_extraction_flag() const + { + if (basic_const_item()) + return MARKER_FULL_EXTRACTION; + else + return marker & MARKER_EXTRACTION_MASK; + } void set_extraction_flag(int16 flags) { - marker &= ~MARKER_EXTRACTION_MASK; - marker|= flags; + if 
(!basic_const_item()) + { + marker= marker & ~MARKER_EXTRACTION_MASK; + marker|= flags; + } } void clear_extraction_flag() { - marker &= ~MARKER_EXTRACTION_MASK; - } + if (!basic_const_item()) + marker= marker & ~MARKER_EXTRACTION_MASK; + } void check_pushable_cond(Pushdown_checker excl_dep_func, uchar *arg); bool pushable_cond_checker_for_derived(uchar *arg) { -- cgit v1.2.1 From 1495f057c8a3aa5783de0993f6f95aae3e352f7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Mar 2023 13:39:23 +0200 Subject: MDEV-30860 Race condition between buffer pool flush and log file deletion in mariadb-backup --prepare srv_start(): If we are going to close the log file in mariadb-backup --prepare, call buf_flush_sync() before calling recv_sys.debug_free() to ensure that the log file will not be accessed. This fixes a rather rare failure in the test mariabackup.innodb_force_recovery where buf_flush_page_cleaner() would invoke log_checkpoint_low() because !recv_recovery_is_on() would hold due to the fact that recv_sys.debug_free() had already been called. Then, the log write for the checkpoint would fail because srv_start() had invoked log_sys.log.close_file(). --- storage/innobase/srv/srv0start.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index cc2bb699fd9..44fca2c81a5 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -922,9 +922,7 @@ static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists) { DBUG_ENTER("srv_prepare_to_delete_redo_log_file"); - /* Disable checkpoints in the page cleaner. */ - ut_ad(!recv_sys.recovery_on); - recv_sys.recovery_on= true; + ut_ad(recv_sys.recovery_on); /* Clean the buffer pool. */ buf_flush_sync(); @@ -1606,10 +1604,10 @@ file_checked: } } - recv_sys.debug_free(); - if (srv_operation == SRV_OPERATION_RESTORE || srv_operation == SRV_OPERATION_RESTORE_EXPORT) { + buf_flush_sync(); + recv_sys.debug_free(); /* After applying the redo log from SRV_OPERATION_BACKUP, flush the changes to the data files and truncate or delete the log. @@ -1701,6 +1699,8 @@ file_checked: return(srv_init_abort(err)); } } + + recv_sys.debug_free(); } ut_ad(err == DB_SUCCESS); -- cgit v1.2.1 From f2096478d5750b983f9a9cc4691d20e152dafd4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Mar 2023 15:52:42 +0200 Subject: MDEV-29835 InnoDB hang on B-tree split or merge This is a follow-up to commit de4030e4d49805a7ded5c0bfee01cc3fd7623522 (MDEV-30400), which fixed some hangs related to B-tree split or merge. btr_root_block_get(): Use and update the root page guess. This is just a minor performance optimization, not affecting correctness. btr_validate_level(): Remove the parameter "lockout", and always acquire an exclusive dict_index_t::lock in CHECK TABLE without QUICK. This is needed in order to avoid latching order violation in btr_page_get_father_node_ptr_for_validate(). btr_cur_need_opposite_intention(): Return true in case btr_cur_compress_recommendation() would hold later during the mini-transaction, or if a page underflow or overflow is possible. If we return true, our caller will escalate to aqcuiring an exclusive dict_index_t::lock, to prevent a latching order violation and deadlock during btr_compress() or btr_page_split_and_insert(). btr_cur_t::search_leaf(), btr_cur_t::open_leaf(): Also invoke btr_cur_need_opposite_intention() on the leaf page. 
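As a rough sketch, the escalation path in btr_cur_t::open_leaf() condenses to
something like the following (identifiers are taken from the hunks below;
latching details and error handling are omitted, so this is illustrative
rather than exact):

    if (latch_mode == BTR_MODIFY_TREE
        && btr_cur_need_opposite_intention(block->page.frame, lock_intention,
                                           node_ptr_max_size, compress_limit,
                                           page_cur.rec))
    {
      /* A page split or merge could propagate to upper levels:
      restart the descent while holding an exclusive index lock. */
      mtr->rollback_to_savepoint(savepoint); /* release the page latches */
      mtr->index_lock_upgrade();             /* upgrade U to X on index->lock */
      latch_mode= BTR_CONT_MODIFY_TREE;      /* X-latch every page from now on */
      height= ULINT_UNDEFINED;               /* restart from the root */
    }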
btr_cur_t::open_leaf(): When escalating to exclusive index locking, acquire exclusive latches on all pages as well. innobase_instant_try(): Return an error code if the root page cannot be retrieved. In addition to the normal stress testing with Random Query Generator (RQG) this has been tested with ./mtr --mysqld=--loose-innodb-limit-optimistic-insert-debug=2 but with the injection in btr_cur_optimistic_insert() for non-leaf pages adjusted so that it would use the value 3. (Otherwise, infinite page splits could occur in some mtr tests.) Tested by: Matthias Leich --- storage/innobase/btr/btr0btr.cc | 141 ++++++++------------------ storage/innobase/btr/btr0cur.cc | 160 +++++++++++++++++++----------- storage/innobase/handler/handler0alter.cc | 1 + storage/innobase/include/btr0btr.h | 2 +- storage/innobase/include/btr0types.h | 3 + storage/innobase/include/mtr0mtr.h | 3 + storage/innobase/row/row0log.cc | 6 +- 7 files changed, 156 insertions(+), 160 deletions(-) diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 7fd851f7b0e..1b69f4c7170 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -254,7 +254,7 @@ Gets the root node of a tree and x- or s-latches it. buf_block_t* btr_root_block_get( /*===============*/ - const dict_index_t* index, /*!< in: index tree */ + dict_index_t* index, /*!< in: index tree */ rw_lock_type_t mode, /*!< in: either RW_S_LATCH or RW_X_LATCH */ mtr_t* mtr, /*!< in: mtr */ @@ -266,11 +266,31 @@ btr_root_block_get( return nullptr; } - buf_block_t *block = btr_block_get(*index, index->page, mode, false, mtr, - err); - if (block) + buf_block_t *block; +#ifndef BTR_CUR_ADAPT + static constexpr buf_block_t *guess= nullptr; +#else + buf_block_t *&guess= btr_search_get_info(index)->root_guess; + guess= +#endif + block= + buf_page_get_gen(page_id_t{index->table->space->id, index->page}, + index->table->space->zip_size(), mode, guess, BUF_GET, + mtr, err, false); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) { - if (index->is_ibuf()); + if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + !fil_page_index_page_check(block->page.frame) || + index->is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + else if (index->is_ibuf()); else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, *block, *index->table->space) || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, @@ -280,6 +300,9 @@ btr_root_block_get( block= nullptr; } } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + return block; } @@ -290,7 +313,7 @@ static page_t* btr_root_get( /*=========*/ - const dict_index_t* index, /*!< in: index tree */ + dict_index_t* index, /*!< in: index tree */ mtr_t* mtr, /*!< in: mtr */ dberr_t* err) /*!< out: error code */ { @@ -502,9 +525,7 @@ btr_block_reget(mtr_t *mtr, const dict_index_t &index, return block; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. 
*/ ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK)); -#endif return btr_block_get(index, id.page_no(), rw_latch, true, mtr, err); } @@ -773,9 +794,7 @@ btr_page_get_father_node_ptr_for_validate( const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); dict_index_t* index = btr_cur_get_index(cursor); ut_ad(!dict_index_is_spatial(index)); - - ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK - | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); ut_ad(dict_index_get_page(index) != page_no); const auto level = btr_page_get_level(btr_cur_get_page(cursor)); @@ -793,10 +812,6 @@ btr_page_get_father_node_ptr_for_validate( } const rec_t* node_ptr = btr_cur_get_rec(cursor); -#if 0 /* MDEV-29835 FIXME */ - ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive() - || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -#endif offsets = rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, &heap); @@ -2456,11 +2471,10 @@ btr_insert_on_non_leaf_level( } ut_ad(cursor.flag == BTR_CUR_BINARY); -#if 0 /* MDEV-29835 FIXME */ - ut_ad(!btr_cur_get_block(&cursor)->page.lock.not_recursive() + ut_ad(btr_cur_get_block(&cursor) + != mtr->at_savepoint(mtr->get_savepoint() - 1) || index->is_spatial() || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -#endif if (UNIV_LIKELY(err == DB_SUCCESS)) { err = btr_cur_optimistic_insert(flags, @@ -2568,10 +2582,8 @@ btr_attach_half_pages( prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!prev_block) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -# endif prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH, !level, mtr); } @@ -2582,10 +2594,8 @@ btr_attach_half_pages( next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!next_block) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); -# endif next_block = btr_block_get(*index, next_page_no, RW_X_LATCH, !level, mtr); } @@ -3397,9 +3407,7 @@ dberr_t btr_level_list_remove(const buf_block_t& block, #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!prev) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); -# endif prev= btr_block_get(index, id.page_no(), RW_X_LATCH, page_is_leaf(block.page.frame), mtr, &err); if (UNIV_UNLIKELY(!prev)) @@ -3415,9 +3423,7 @@ dberr_t btr_level_list_remove(const buf_block_t& block, #if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ if (!next) { -# if 0 /* MDEV-29835 FIXME */ ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK)); -# endif next= btr_block_get(index, id.page_no(), RW_X_LATCH, page_is_leaf(block.page.frame), mtr, &err); if (UNIV_UNLIKELY(!next)) @@ -4291,7 +4297,7 @@ btr_discard_page( if (UNIV_UNLIKELY(!merge_block)) { return err; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ ut_ad(!memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_NEXT, block->page.frame + FIL_PAGE_OFFSET, @@ -4317,7 +4323,7 @@ btr_discard_page( if (UNIV_UNLIKELY(!merge_block)) { return err; } -#if 0 /* MDEV-29385 FIXME: Acquire the page latch upfront. */ +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. 
*/ ut_ad(!memcmp_aligned<4>(merge_block->page.frame + FIL_PAGE_PREV, block->page.frame + FIL_PAGE_OFFSET, @@ -4898,8 +4904,7 @@ btr_validate_level( /*===============*/ dict_index_t* index, /*!< in: index tree */ const trx_t* trx, /*!< in: transaction or NULL */ - ulint level, /*!< in: level number */ - bool lockout)/*!< in: true if X-latch index is intended */ + ulint level) /*!< in: level number */ { buf_block_t* block; page_t* page; @@ -4918,18 +4923,10 @@ btr_validate_level( #ifdef UNIV_ZIP_DEBUG page_zip_des_t* page_zip; #endif /* UNIV_ZIP_DEBUG */ - ulint savepoint = 0; - uint32_t parent_page_no = FIL_NULL; - uint32_t parent_right_page_no = FIL_NULL; - bool rightmost_child = false; mtr.start(); - if (lockout) { - mtr_x_lock_index(index, &mtr); - } else { - mtr_sx_lock_index(index, &mtr); - } + mtr_x_lock_index(index, &mtr); dberr_t err; block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); @@ -5025,11 +5022,7 @@ func_exit: mem_heap_empty(heap); offsets = offsets2 = NULL; - if (lockout) { - mtr_x_lock_index(index, &mtr); - } else { - mtr_sx_lock_index(index, &mtr); - } + mtr_x_lock_index(index, &mtr); page = block->page.frame; @@ -5073,7 +5066,6 @@ func_exit: if (right_page_no != FIL_NULL) { const rec_t* right_rec; - savepoint = mtr.get_savepoint(); right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, !level, &mtr, &err); @@ -5177,11 +5169,6 @@ broken_links: father_page = btr_cur_get_page(&node_cur); node_ptr = btr_cur_get_rec(&node_cur); - parent_page_no = page_get_page_no(father_page); - parent_right_page_no = btr_page_get_next(father_page); - rightmost_child = page_rec_is_supremum( - page_rec_get_next(node_ptr)); - rec = page_rec_get_prev(page_get_supremum_rec(page)); if (rec) { btr_cur_position(index, rec, block, &node_cur); @@ -5263,37 +5250,6 @@ broken_links: } } else if (const rec_t* right_node_ptr = page_rec_get_next(node_ptr)) { - if (!lockout && rightmost_child) { - - /* To obey latch order of tree blocks, - we should release the right_block once to - obtain lock of the uncle block. 
*/ - ut_ad(right_block - == mtr.at_savepoint(savepoint)); - mtr.rollback_to_savepoint(savepoint, - savepoint + 1); - - if (parent_right_page_no != FIL_NULL) { - btr_block_get(*index, - parent_right_page_no, - RW_SX_LATCH, false, - &mtr); - } - - right_block = btr_block_get(*index, - right_page_no, - RW_SX_LATCH, - !level, &mtr, - &err); - if (!right_block) { - btr_validate_report1(index, level, - block); - fputs("InnoDB: broken FIL_PAGE_NEXT" - " link\n", stderr); - goto invalid_page; - } - } - btr_cur_position( index, page_get_infimum_rec(right_block->page.frame), @@ -5365,20 +5321,6 @@ node_ptr_fails: mtr.start(); - if (!lockout) { - if (rightmost_child) { - if (parent_right_page_no != FIL_NULL) { - btr_block_get(*index, - parent_right_page_no, - RW_SX_LATCH, false, - &mtr); - } - } else if (parent_page_no != FIL_NULL) { - btr_block_get(*index, parent_page_no, - RW_SX_LATCH, false, &mtr); - } - } - block = btr_block_get(*index, right_page_no, RW_SX_LATCH, !level, &mtr, &err); goto loop; @@ -5396,21 +5338,16 @@ btr_validate_index( dict_index_t* index, /*!< in: index */ const trx_t* trx) /*!< in: transaction or NULL */ { - const bool lockout= index->is_spatial(); - mtr_t mtr; mtr.start(); - if (lockout) - mtr_x_lock_index(index, &mtr); - else - mtr_sx_lock_index(index, &mtr); + mtr_x_lock_index(index, &mtr); dberr_t err; if (page_t *root= btr_root_get(index, &mtr, &err)) for (auto level= btr_page_get_level(root);; level--) { - if (dberr_t err_level= btr_validate_level(index, trx, level, lockout)) + if (dberr_t err_level= btr_validate_level(index, trx, level)) err= err_level; if (!level) break; diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 62c7d44d286..27ed631099d 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -748,29 +748,34 @@ btr_cur_will_modify_tree( /** Detects whether the modifying record might need a opposite modification to the intention. 
-@param[in] page page -@param[in] lock_intention lock intention for the tree operation -@param[in] rec record (current node_ptr) +@param page page +@param lock_intention lock intention for the tree operation +@param node_ptr_max_size the maximum size of a node pointer +@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index) +@param rec record (current node_ptr) @return true if tree modification is needed */ -static -bool -btr_cur_need_opposite_intention( - const page_t* page, - btr_intention_t lock_intention, - const rec_t* rec) +static bool btr_cur_need_opposite_intention(const page_t *page, + btr_intention_t lock_intention, + ulint node_ptr_max_size, + ulint compress_limit, + const rec_t *rec) { - switch (lock_intention) { - case BTR_INTENTION_DELETE: - return (page_has_prev(page) && page_rec_is_first(rec, page)) || - (page_has_next(page) && page_rec_is_last(rec, page)); - case BTR_INTENTION_INSERT: - return page_has_next(page) && page_rec_is_last(rec, page); - case BTR_INTENTION_BOTH: - return(false); - } - - MY_ASSERT_UNREACHABLE(); - return(false); + if (lock_intention != BTR_INTENTION_INSERT) + { + /* We compensate also for btr_cur_compress_recommendation() */ + if (!page_has_siblings(page) || + page_rec_is_first(rec, page) || page_rec_is_last(rec, page) || + page_get_data_size(page) < node_ptr_max_size + compress_limit) + return true; + if (lock_intention == BTR_INTENTION_DELETE) + return false; + } + else if (page_has_next(page) && page_rec_is_last(rec, page)) + return true; + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true); + const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2); + return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size || + max_size < node_ptr_max_size * 2; } /** @@ -1038,7 +1043,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, const ulint savepoint= mtr->get_savepoint(); - ulint node_ptr_max_size= 0; + ulint node_ptr_max_size= 0, compress_limit= 0; rw_lock_type_t rw_latch= RW_S_LATCH; switch (latch_mode) { @@ -1050,13 +1055,19 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); break; } - if (lock_intention == BTR_INTENTION_DELETE && buf_pool.n_pend_reads && - trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) - /* Most delete-intended operations are due to the purge of history. - Prioritize them when the history list is growing huge. */ - mtr_x_lock_index(index(), mtr); - else - mtr_sx_lock_index(index(), mtr); + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index()); + if (buf_pool.n_pend_reads && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + /* Most delete-intended operations are due to the purge of history. + Prioritize them when the history list is growing huge. 
*/ + mtr_x_lock_index(index(), mtr); + break; + } + } + mtr_sx_lock_index(index(), mtr); break; #ifdef UNIV_DEBUG case BTR_CONT_MODIFY_TREE: @@ -1331,6 +1342,10 @@ release_tree: !btr_block_get(*index(), btr_page_get_next(block->page.frame), RW_X_LATCH, false, mtr, &err)) goto func_exit; + if (btr_cur_need_opposite_intention(block->page.frame, lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) + goto need_opposite_intention; } reached_latched_leaf: @@ -1384,6 +1399,7 @@ release_tree: break; case BTR_MODIFY_TREE: if (btr_cur_need_opposite_intention(block->page.frame, lock_intention, + node_ptr_max_size, compress_limit, page_cur.rec)) /* If the rec is the first or last in the page for pessimistic delete intention, it might cause node_ptr insert for the upper @@ -1536,6 +1552,17 @@ release_tree: goto search_loop; } +ATTRIBUTE_COLD void mtr_t::index_lock_upgrade() +{ + auto &slot= m_memo[get_savepoint() - 1]; + if (slot.type == MTR_MEMO_X_LOCK) + return; + ut_ad(slot.type == MTR_MEMO_SX_LOCK); + index_lock *lock= static_cast(slot.object); + lock->u_x_upgrade(SRW_LOCK_CALL); + slot.type= MTR_MEMO_X_LOCK; +} + ATTRIBUTE_COLD dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, mtr_t *mtr) @@ -1554,8 +1581,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple, ut_ad(block->page.id().page_no() == index()->page); block->page.fix(); mtr->rollback_to_savepoint(1); - ut_ad(mtr->memo_contains_flagged(&index()->lock, - MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)); + mtr->index_lock_upgrade(); const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)}; @@ -1785,7 +1811,6 @@ search_loop: dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode, mtr_t *mtr) { - btr_intention_t lock_intention; ulint n_blocks= 0; mem_heap_t *heap= nullptr; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; @@ -1797,7 +1822,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); - lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode); /* Store the position of the tree latch we push to mtr so that we know how to release it when we have latched the leaf node */ @@ -1805,7 +1830,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, auto savepoint= mtr->get_savepoint(); rw_lock_type_t upper_rw_latch= RW_X_LATCH; - ulint node_ptr_max_size= 0; + ulint node_ptr_max_size= 0, compress_limit= 0; if (latch_mode == BTR_MODIFY_TREE) { @@ -1814,12 +1839,18 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, and read IO bandwidth should be prioritized for them, when the history list is growing huge. 
*/ savepoint++; - if (lock_intention == BTR_INTENTION_DELETE - && buf_pool.n_pend_reads - && trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) - mtr_x_lock_index(index, mtr); - else - mtr_sx_lock_index(index, mtr); + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index); + + if (buf_pool.n_pend_reads && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + mtr_x_lock_index(index, mtr); + goto index_locked; + } + } + mtr_sx_lock_index(index, mtr); } else { @@ -1840,9 +1871,11 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, } } +index_locked: ut_ad(savepoint == mtr->get_savepoint()); - const rw_lock_type_t root_leaf_rw_latch= rw_lock_type_t(latch_mode & ~12); + const rw_lock_type_t root_leaf_rw_latch= + rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH)); page_cur.index = index; @@ -1913,15 +1946,28 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, !btr_block_get(*index, btr_page_get_next(block->page.frame), RW_X_LATCH, false, mtr, &err)) break; + + if (!index->lock.have_x() && + btr_cur_need_opposite_intention(block->page.frame, + lock_intention, + node_ptr_max_size, + compress_limit, page_cur.rec)) + goto need_opposite_intention; } else { if (rw_latch == RW_NO_LATCH) mtr->upgrade_buffer_fix(leaf_savepoint - 1, - rw_lock_type_t(latch_mode)); - /* Release index->lock if needed, and the non-leaf pages. */ - mtr->rollback_to_savepoint(savepoint - !latch_by_caller, - leaf_savepoint - 1); + rw_lock_type_t(latch_mode & + (RW_X_LATCH | RW_S_LATCH))); + if (latch_mode != BTR_CONT_MODIFY_TREE) + { + ut_ad(latch_mode == BTR_MODIFY_LEAF || + latch_mode == BTR_SEARCH_LEAF); + /* Release index->lock if needed, and the non-leaf pages. */ + mtr->rollback_to_savepoint(savepoint - !latch_by_caller, + leaf_savepoint - 1); + } } break; } @@ -1943,22 +1989,25 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, : !page_cur_move_to_prev(&page_cur)) goto corrupted; - const rec_t *node_ptr= page_cur.rec; - offsets= rec_get_offsets(node_ptr, index, offsets, 0, ULINT_UNDEFINED, + offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, &heap); ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH); if (latch_mode != BTR_MODIFY_TREE); - else if (btr_cur_need_opposite_intention(block->page.frame, - lock_intention, node_ptr)) + else if (btr_cur_need_opposite_intention(block->page.frame, lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) { + need_opposite_intention: /* If the rec is the first or last in the page for pessimistic delete intention, it might cause node_ptr insert for the upper level. We should change the intention and retry. 
*/ mtr->rollback_to_savepoint(savepoint); - lock_intention= BTR_INTENTION_BOTH; + mtr->index_lock_upgrade(); + /* X-latch all pages from now on */ + latch_mode= BTR_CONT_MODIFY_TREE; page= index->page; height= ULINT_UNDEFINED; n_blocks= 0; @@ -1967,7 +2016,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, else { if (!btr_cur_will_modify_tree(index, block->page.frame, - lock_intention, node_ptr, + lock_intention, page_cur.rec, node_ptr_max_size, zip_size, mtr)) { ut_ad(n_blocks); @@ -1997,7 +2046,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, } /* Go to the child node */ - page= btr_node_ptr_get_child_page_no(node_ptr, offsets); + page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); n_blocks++; } @@ -2307,8 +2356,7 @@ convert_big_rec: return(DB_TOO_BIG_RECORD); } - LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), - goto fail); + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail); if (block->page.zip.data && leaf && (page_get_data_size(page) + rec_size @@ -2322,7 +2370,7 @@ fail: /* prefetch siblings of the leaf for the pessimistic operation, if the page is leaf. */ - if (page_is_leaf(page)) { + if (leaf) { btr_cur_prefetch_siblings(block, index); } fail_err: @@ -2391,7 +2439,7 @@ fail_err: #ifdef UNIV_DEBUG if (!(flags & BTR_CREATE_FLAG) - && index->is_primary() && page_is_leaf(page)) { + && leaf && index->is_primary()) { const dfield_t* trx_id = dtuple_get_nth_field( entry, dict_col_get_clust_pos( dict_table_get_sys_col(index->table, diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 9e9c0a17a39..6a8986d76d2 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -6104,6 +6104,7 @@ func_exit: id, MTR_MEMO_PAGE_SX_FIX); if (UNIV_UNLIKELY(!root)) { + err = DB_CORRUPTION; goto func_exit; } diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index a1cc10b05db..a56598d3620 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -445,7 +445,7 @@ Gets the root node of a tree and x- or s-latches it. buf_block_t* btr_root_block_get( /*===============*/ - const dict_index_t* index, /*!< in: index tree */ + dict_index_t* index, /*!< in: index tree */ rw_lock_type_t mode, /*!< in: either RW_S_LATCH or RW_X_LATCH */ mtr_t* mtr, /*!< in: mtr */ diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h index 912c022c64f..fc829e7857a 100644 --- a/storage/innobase/include/btr0types.h +++ b/storage/innobase/include/btr0types.h @@ -103,6 +103,9 @@ enum btr_latch_mode { dict_index_t::lock is being held in non-exclusive mode. */ BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED, + /** Attempt to modify records in an x-latched tree. */ + BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE + | BTR_ALREADY_S_LATCHED, /** U-latch root and X-latch a leaf page, assuming that dict_index_t::lock is being held in U mode. 
*/ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index f3fe1841b2e..60e01abe18d 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -344,6 +344,9 @@ public: /** Upgrade U locks on a block to X */ void page_lock_upgrade(const buf_block_t &block); + /** Upgrade index U lock to X */ + ATTRIBUTE_COLD void index_lock_upgrade(); + /** Check if we are holding tablespace latch @param space tablespace to search for @return whether space.latch is being held */ diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index 0743dc2bb50..b21ff2b9f86 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -3078,6 +3078,9 @@ row_log_apply_op_low( mtr_start(&mtr); index->set_modified(mtr); cursor.page_cur.index = index; + if (has_index_lock) { + mtr_x_lock_index(index, &mtr); + } /* We perform the pessimistic variant of the operations if we already hold index->lock exclusively. First, search the @@ -3085,7 +3088,8 @@ row_log_apply_op_low( depending on when the row in the clustered index was scanned. */ *error = cursor.search_leaf(entry, PAGE_CUR_LE, has_index_lock - ? BTR_MODIFY_TREE : BTR_MODIFY_LEAF, &mtr); + ? BTR_MODIFY_TREE_ALREADY_LATCHED + : BTR_MODIFY_LEAF, &mtr); if (UNIV_UNLIKELY(*error != DB_SUCCESS)) { goto func_exit; } -- cgit v1.2.1 From 4105017a5832c3486002c4ec0c67005ceb5ab88b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Mar 2023 16:00:45 +0200 Subject: MDEV-30357 Performance regression in locking reads from secondary indexes lock_sec_rec_some_has_impl(): Remove a harmful condition that caused the performance regression and should not have been added in commit b6e41e38720d1e8d33b2abec0d1109615133bc2b in the first place. Locking transactions that have not modified any persistent tables can carry the transaction identifier 0. trx_t::max_inactive_id: A cache for trx_sys_t::find_same_or_older(). The value is not reset on transaction commit so that previous results can be reused for subsequent transactions. The smallest active transaction ID can only increase over time, not decrease. trx_sys_t::find_same_or_older(): Remember the maximum previous id for which rw_trx_hash.iterate() returned false, to avoid redundant iterations. lock_sec_rec_read_check_and_lock(): Add an early return in case we are already holding a covering table lock. lock_rec_convert_impl_to_expl(): Add a template parameter to avoid a redundant run-time check on whether the index is secondary. lock_rec_convert_impl_to_expl_for_trx(): Move some code from lock_rec_convert_impl_to_expl(), to reduce code duplication due to the added template parameter. Reviewed by: Vladislav Lesin Tested by: Matthias Leich --- storage/innobase/include/trx0sys.h | 11 ++++-- storage/innobase/include/trx0trx.h | 4 +++ storage/innobase/lock/lock0lock.cc | 68 ++++++++++++++++++++------------------ storage/innobase/trx/trx0trx.cc | 1 + 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 4d231077b12..245b981974b 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -924,14 +924,19 @@ public: /** Determine if the specified transaction or any older one might be active. 
- @param caller_trx used to get/set pins + @param trx current transaction @param id transaction identifier @return whether any transaction not newer than id might be active */ - bool find_same_or_older(trx_t *caller_trx, trx_id_t id) + bool find_same_or_older(trx_t *trx, trx_id_t id) { - return rw_trx_hash.iterate(caller_trx, find_same_or_older_callback, &id); + if (trx->max_inactive_id >= id) + return false; + bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id); + if (!found) + trx->max_inactive_id= id; + return found; } diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 21e4516f35a..5b2b2264a46 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -603,6 +603,10 @@ public: Cleared in commit_in_memory() after commit_state(), trx_sys_t::deregister_rw(), release_locks(). */ trx_id_t id; + /** The largest encountered transaction identifier for which no + transaction was observed to be active. This is a cache to speed up + trx_sys_t::find_same_or_older(). */ + trx_id_t max_inactive_id; private: /** mutex protecting state and some of lock diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 2b30b9b1a03..3c7c3d348af 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1064,13 +1064,16 @@ lock_sec_rec_some_has_impl( const trx_id_t max_trx_id= page_get_max_trx_id(page_align(rec)); - if ((caller_trx->id > max_trx_id && - !trx_sys.find_same_or_older(caller_trx, max_trx_id)) || + /* Note: It is possible to have caller_trx->id == 0 in a locking read + if caller_trx has not modified any persistent tables. */ + if (!trx_sys.find_same_or_older(caller_trx, max_trx_id) || !lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) return nullptr; - /* In this case it is possible that some transaction has an implicit - x-lock. We have to look in the clustered index. */ + /* We checked above that some active (or XA PREPARE) transaction exists + that is older than PAGE_MAX_TRX_ID. That is, some transaction may be + holding an implicit lock on the record. We have to look up the + clustered index record to find if it is (or was) the case. */ return row_vers_impl_x_locked(caller_trx, rec, index, offsets); } @@ -5157,20 +5160,24 @@ has an implicit lock on the record. The transaction instance must have a reference count > 0 so that it can't be committed and freed before this function has completed. 
*/ static -void +bool lock_rec_convert_impl_to_expl_for_trx( /*==================================*/ + trx_t* trx, /*!< in/out: active transaction */ const page_id_t id, /*!< in: page identifier */ const rec_t* rec, /*!< in: user record on page */ - dict_index_t* index, /*!< in: index of record */ - trx_t* trx, /*!< in/out: active transaction */ - ulint heap_no)/*!< in: rec heap number to lock */ + dict_index_t* index) /*!< in: index of record */ { + if (!trx) + return false; + ut_ad(trx->is_referenced()); ut_ad(page_rec_is_leaf(rec)); ut_ad(!rec_is_metadata(rec, *index)); DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); + ulint heap_no= page_rec_get_heap_no(rec); + { LockGuard g{lock_sys.rec_hash, id}; trx->mutex_lock(); @@ -5187,6 +5194,7 @@ lock_rec_convert_impl_to_expl_for_trx( trx->release_reference(); DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx"); + return false; } @@ -5260,7 +5268,6 @@ static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx, } #endif /* UNIV_DEBUG */ - /** If an implicit x-lock exists on a record, convert it to an explicit one. Often, this is called by a transaction that is about to enter a lock wait @@ -5272,12 +5279,14 @@ This may also be called by the same transaction that is already holding an implicit exclusive lock on the record. In this case, no explicit lock should be created. +@tparam is_primary whether the index is the primary key @param[in,out] caller_trx current transaction @param[in] id index tree leaf page identifier @param[in] rec record on the leaf page @param[in] index the index of the record @param[in] offsets rec_get_offsets(rec,index) @return whether caller_trx already holds an exclusive lock on rec */ +template static bool lock_rec_convert_impl_to_expl( @@ -5295,8 +5304,9 @@ lock_rec_convert_impl_to_expl( ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); ut_ad(page_rec_is_leaf(rec)); ut_ad(!rec_is_metadata(rec, *index)); + ut_ad(index->is_primary() == is_primary); - if (dict_index_is_clust(index)) { + if (is_primary) { trx_id_t trx_id; trx_id = lock_clust_rec_some_has_impl(rec, index, offsets); @@ -5322,20 +5332,7 @@ lock_rec_convert_impl_to_expl( ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, id)); } - if (trx) { - ulint heap_no = page_rec_get_heap_no(rec); - - ut_ad(trx->is_referenced()); - - /* If the transaction is still active and has no - explicit x-lock set on the record, set one for it. - trx cannot be committed until the ref count is zero. */ - - lock_rec_convert_impl_to_expl_for_trx( - id, rec, index, trx, heap_no); - } - - return false; + return lock_rec_convert_impl_to_expl_for_trx(trx, id, rec, index); } /*********************************************************************//** @@ -5374,8 +5371,9 @@ lock_clust_rec_modify_check_and_lock( /* If a transaction has no explicit x-lock set on the record, set one for it */ - if (lock_rec_convert_impl_to_expl(thr_get_trx(thr), block->page.id(), - rec, index, offsets)) { + if (lock_rec_convert_impl_to_expl(thr_get_trx(thr), + block->page.id(), + rec, index, offsets)) { /* We already hold an implicit exclusive lock. 
*/ return DB_SUCCESS; } @@ -5532,15 +5530,17 @@ lock_sec_rec_read_check_and_lock( return(DB_SUCCESS); } - const page_id_t id{block->page.id()}; - ut_ad(!rec_is_metadata(rec, *index)); trx_t *trx = thr_get_trx(thr); + + if (lock_table_has(trx, index->table, mode)) { + return DB_SUCCESS; + } + if (!page_rec_is_supremum(rec) - && !lock_table_has(trx, index->table, LOCK_X) - && lock_rec_convert_impl_to_expl(thr_get_trx(thr), id, rec, - index, offsets) + && lock_rec_convert_impl_to_expl( + trx, block->page.id(), rec, index, offsets) && gap_mode == LOCK_REC_NOT_GAP) { /* We already hold an implicit exclusive lock. */ return DB_SUCCESS; @@ -5565,7 +5565,8 @@ lock_sec_rec_read_check_and_lock( if (trx->wsrep == 3) trx->wsrep = 1; #endif /* WITH_WSREP */ - ut_ad(lock_rec_queue_validate(false, id, rec, index, offsets)); + ut_ad(lock_rec_queue_validate(false, block->page.id(), + rec, index, offsets)); return(err); } @@ -5622,7 +5623,8 @@ lock_clust_rec_read_check_and_lock( trx_t *trx = thr_get_trx(thr); if (!lock_table_has(trx, index->table, LOCK_X) && heap_no != PAGE_HEAP_NO_SUPREMUM - && lock_rec_convert_impl_to_expl(trx, id, rec, index, offsets) + && lock_rec_convert_impl_to_expl(trx, id, + rec, index, offsets) && gap_mode == LOCK_REC_NOT_GAP) { /* We already hold an implicit exclusive lock. */ return DB_SUCCESS; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index a0c781f6287..d7ab02844bf 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -404,6 +404,7 @@ void trx_t::free() sizeof skip_lock_inheritance_and_n_ref); /* do not poison mutex */ MEM_NOACCESS(&id, sizeof id); + MEM_NOACCESS(&max_inactive_id, sizeof id); MEM_NOACCESS(&state, sizeof state); MEM_NOACCESS(&is_recovered, sizeof is_recovered); #ifdef WITH_WSREP -- cgit v1.2.1 From 9593cccf285ee348fc9a2743c1ed7d24c768439b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Mar 2023 17:09:08 +0200 Subject: MDEV-26055: Improve adaptive flushing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adaptive flushing is enabled by setting innodb_max_dirty_pages_pct_lwm>0 (not default) and innodb_adaptive_flushing=ON (default). There is also the parameter innodb_adaptive_flushing_lwm (default: 10 per cent of the log capacity). It should enable some adaptive flushing even when innodb_max_dirty_pages_pct_lwm=0. That is not being changed here. This idea was first presented by Inaam Rana several years ago, and I discussed it with Jean-François Gagné at FOSDEM 2023. buf_flush_page_cleaner(): When we are not near the log capacity limit (neither buf_flush_async_lsn nor buf_flush_sync_lsn are set), also try to move clean blocks from the buf_pool.LRU list to buf_pool.free or initiate writes (but not the eviction) of dirty blocks, until the remaining I/O capacity has been consumed. buf_flush_LRU_list_batch(): Add the parameter bool evict, to specify whether dirty least recently used pages (from buf_pool.LRU) should be evicted immediately after they have been written out. Callers outside buf_flush_page_cleaner() will pass evict=true, to retain the existing behaviour. buf_do_LRU_batch(): Add the parameter bool evict. Return counts of evicted and flushed pages. buf_flush_LRU(): Add the parameter bool evict. Assume that the caller holds buf_pool.mutex and will invoke buf_dblwr.flush_buffered_writes() afterwards. 
buf_flush_list_holding_mutex(): A low-level variant of buf_flush_list() whose caller must hold buf_pool.mutex and invoke buf_dblwr.flush_buffered_writes() afterwards. buf_flush_wait_batch_end_acquiring_mutex(): Remove. It is enough to have buf_flush_wait_batch_end(). page_cleaner_flush_pages_recommendation(): Avoid some floating-point arithmetics. buf_flush_page(), buf_flush_check_neighbor(), buf_flush_check_neighbors(), buf_flush_try_neighbors(): Rename the parameter "bool lru" to "bool evict". buf_free_from_unzip_LRU_list_batch(): Remove the parameter. Only actual page writes will contribute towards the limit. buf_LRU_free_page(): Evict freed pages of temporary tables. buf_pool.done_free: Broadcast whenever a block is freed (and buf_pool.try_LRU_scan is set). buf_pool_t::io_buf_t::reserve(): Retry indefinitely. During the test encryption.innochecksum we easily run out of these buffers for PAGE_COMPRESSED or ENCRYPTED pages. Tested by Matthias Leich and Axel Schwenke --- storage/innobase/buf/buf0buf.cc | 194 ++++++++++-------- storage/innobase/buf/buf0flu.cc | 392 ++++++++++++++++++------------------- storage/innobase/buf/buf0lru.cc | 61 +++--- storage/innobase/include/buf0buf.h | 54 ++--- storage/innobase/include/buf0flu.h | 13 +- 5 files changed, 360 insertions(+), 354 deletions(-) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 47cc915a11d..510872c142e 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -408,7 +408,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage, if (id.space() == SRV_TMP_SPACE_ID && innodb_encrypt_temporary_tables) { slot = buf_pool.io_buf_reserve(); - ut_a(slot); slot->allocate(); bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); slot->release(); @@ -431,7 +430,6 @@ decompress: } slot = buf_pool.io_buf_reserve(); - ut_a(slot); slot->allocate(); decompress_with_slot: @@ -455,7 +453,6 @@ decrypt_failed: } slot = buf_pool.io_buf_reserve(); - ut_a(slot); slot->allocate(); /* decrypt using crypt_buf to dst_frame */ @@ -1293,6 +1290,41 @@ inline bool buf_pool_t::realloc(buf_block_t *block) return(true); /* free_list was enough */ } +void buf_pool_t::io_buf_t::create(ulint n_slots) +{ + this->n_slots= n_slots; + slots= static_cast + (ut_malloc_nokey(n_slots * sizeof *slots)); + memset((void*) slots, 0, n_slots * sizeof *slots); +} + +void buf_pool_t::io_buf_t::close() +{ + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); + slots= nullptr; + n_slots= 0; +} + +buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve() +{ + for (;;) + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_writes(); + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_reads(); + } +} + /** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status to the specified string. The format and the following parameters are the same as the ones used for printf(3). 
@@ -1359,21 +1391,23 @@ inline bool buf_pool_t::withdraw_blocks() block = next_block; } - mysql_mutex_unlock(&mutex); /* reserve free_list length */ if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { buf_flush_LRU( std::max(withdraw_target - UT_LIST_GET_LEN(withdraw), - srv_LRU_scan_depth)); - buf_flush_wait_batch_end_acquiring_mutex(true); + srv_LRU_scan_depth), + true); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.mutex); + buf_flush_wait_batch_end(true); } /* relocate blocks/buddies in withdrawn area */ ulint count2 = 0; - mysql_mutex_lock(&mutex); buf_pool_mutex_exit_forbid(); for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; bpage; bpage = next_bpage) { @@ -2380,11 +2414,6 @@ buf_page_get_low( || (rw_latch == RW_X_LATCH) || (rw_latch == RW_SX_LATCH) || (rw_latch == RW_NO_LATCH)); - ut_ad(!allow_ibuf_merge - || mode == BUF_GET - || mode == BUF_GET_POSSIBLY_FREED - || mode == BUF_GET_IF_IN_POOL - || mode == BUF_GET_IF_IN_POOL_OR_WATCH); if (err) { *err = DB_SUCCESS; @@ -2392,15 +2421,15 @@ buf_page_get_low( #ifdef UNIV_DEBUG switch (mode) { - case BUF_PEEK_IF_IN_POOL: + default: + ut_ad(!allow_ibuf_merge); + ut_ad(mode == BUF_PEEK_IF_IN_POOL); + break; + case BUF_GET_POSSIBLY_FREED: case BUF_GET_IF_IN_POOL: /* The caller may pass a dummy page size, because it does not really matter. */ break; - default: - MY_ASSERT_UNREACHABLE(); - case BUF_GET_POSSIBLY_FREED: - break; case BUF_GET: case BUF_GET_IF_IN_POOL_OR_WATCH: ut_ad(!mtr->is_freeing_tree()); @@ -2547,11 +2576,12 @@ ignore_block: return nullptr; } } else if (mode != BUF_PEEK_IF_IN_POOL) { - } else if (!mtr) { + } else if (!mtr) { ut_ad(!block->page.oldest_modification()); mysql_mutex_lock(&buf_pool.mutex); block->unfix(); +free_unfixed_block: if (!buf_LRU_free_page(&block->page, true)) { ut_ad(0); } @@ -2667,20 +2697,19 @@ wait_for_unfix: /* Decompress the page while not holding buf_pool.mutex. */ - auto ok = buf_zip_decompress(block, false); - block->page.read_unfix(state); - state = block->page.state(); - block->page.lock.x_unlock(); + const auto ok = buf_zip_decompress(block, false); --buf_pool.n_pend_unzip; - if (!ok) { - /* FIXME: Evict the corrupted - ROW_FORMAT=COMPRESSED page! */ - if (err) { *err = DB_PAGE_CORRUPTED; } - return nullptr; + mysql_mutex_lock(&buf_pool.mutex); + } + state = block->page.read_unfix(state); + block->page.lock.x_unlock(); + + if (!ok) { + goto free_unfixed_block; } } @@ -2886,72 +2915,73 @@ buf_page_get_gen( dberr_t* err, bool allow_ibuf_merge) { - if (buf_block_t *block= recv_sys.recover(page_id)) + buf_block_t *block= recv_sys.recover(page_id); + if (UNIV_LIKELY(!block)) + return buf_page_get_low(page_id, zip_size, rw_latch, + guess, mode, mtr, err, allow_ibuf_merge); + else if (UNIV_UNLIKELY(block == reinterpret_cast(-1))) { - if (UNIV_UNLIKELY(block == reinterpret_cast(-1))) - { - corrupted: - if (err) - *err= DB_CORRUPTION; - return nullptr; - } - /* Recovery is a special case; we fix() before acquiring lock. */ - auto s= block->page.fix(); - ut_ad(s >= buf_page_t::FREED); - /* The block may be write-fixed at this point because we are not - holding a lock, but it must not be read-fixed. */ - ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + corrupted: if (err) - *err= DB_SUCCESS; - const bool must_merge= allow_ibuf_merge && - ibuf_page_exists(page_id, block->zip_size()); + *err= DB_CORRUPTION; + return nullptr; + } + /* Recovery is a special case; we fix() before acquiring lock. 
*/ + auto s= block->page.fix(); + ut_ad(s >= buf_page_t::FREED); + /* The block may be write-fixed at this point because we are not + holding a lock, but it must not be read-fixed. */ + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + if (err) + *err= DB_SUCCESS; + const bool must_merge= allow_ibuf_merge && + ibuf_page_exists(page_id, block->zip_size()); + if (s < buf_page_t::UNFIXED) + { + got_freed_page: + ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); + mysql_mutex_lock(&buf_pool.mutex); + block->page.unfix(); + buf_LRU_free_page(&block->page, true); + mysql_mutex_unlock(&buf_pool.mutex); + goto corrupted; + } + else if (must_merge && + fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX && + page_is_leaf(block->page.frame)) + { + block->page.lock.x_lock(); + s= block->page.state(); + ut_ad(s > buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); if (s < buf_page_t::UNFIXED) { - got_freed_page: - ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); - block->page.unfix(); - goto corrupted; + block->page.lock.x_unlock(); + goto got_freed_page; } - else if (must_merge && - fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX && - page_is_leaf(block->page.frame)) + else { - block->page.lock.x_lock(); - s= block->page.state(); - ut_ad(s > buf_page_t::FREED); - ut_ad(s < buf_page_t::READ_FIX); - if (s < buf_page_t::UNFIXED) + if (block->page.is_ibuf_exist()) + block->page.clear_ibuf_exist(); + if (dberr_t e= + ibuf_merge_or_delete_for_page(block, page_id, block->zip_size())) { - block->page.lock.x_unlock(); - goto got_freed_page; - } - else - { - if (block->page.is_ibuf_exist()) - block->page.clear_ibuf_exist(); - if (dberr_t e= - ibuf_merge_or_delete_for_page(block, page_id, block->zip_size())) - { - if (err) - *err= e; - buf_pool.corrupted_evict(&block->page, s); - return nullptr; - } + if (err) + *err= e; + buf_pool.corrupted_evict(&block->page, s); + return nullptr; } + } - if (rw_latch == RW_X_LATCH) - { - mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); - return block; - } - block->page.lock.x_unlock(); + if (rw_latch == RW_X_LATCH) + { + mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); + return block; } - mtr->page_lock(block, rw_latch); - return block; + block->page.lock.x_unlock(); } - - return buf_page_get_low(page_id, zip_size, rw_latch, - guess, mode, mtr, err, allow_ibuf_merge); + mtr->page_lock(block, rw_latch); + return block; } /********************************************************************//** diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index cca921a9275..70e1595e00e 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -380,13 +380,12 @@ void buf_page_write_complete(const IORequest &request) if (request.is_LRU()) { buf_LRU_free_page(bpage, true); + buf_pool.try_LRU_scan= true; + pthread_cond_signal(&buf_pool.done_free); ut_ad(buf_pool.n_flush_LRU_); if (!--buf_pool.n_flush_LRU_) - { pthread_cond_broadcast(&buf_pool.done_flush_LRU); - pthread_cond_signal(&buf_pool.done_free); - } } else { @@ -763,17 +762,17 @@ inline void buf_pool_t::release_freed_page(buf_page_t *bpage) } /** Write a flushable page to a file. buf_pool.mutex must be held. 
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict whether to evict the page on write completion @param space tablespace @return whether the page was flushed and buf_pool.mutex was released */ -inline bool buf_page_t::flush(bool lru, fil_space_t *space) +inline bool buf_page_t::flush(bool evict, fil_space_t *space) { ut_ad(in_file()); ut_ad(in_LRU_list); ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == (space == fil_system.temp_space)); ut_ad(space->referenced()); - ut_ad(lru || space != fil_system.temp_space); + ut_ad(evict || space != fil_system.temp_space); if (!lock.u_lock_try(true)) return false; @@ -801,7 +800,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) Thus, it cannot be relocated or removed. */ DBUG_PRINT("ib_buf", ("%s %u page %u:%u", - lru ? "LRU" : "flush_list", + evict ? "LRU" : "flush_list", id().space(), id().page_no())); ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); ut_ad(f >= UNFIXED); @@ -809,7 +808,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ut_ad(space == fil_system.temp_space ? oldest_modification() == 2 : oldest_modification() > 2); - if (lru) + if (evict) { ut_ad(buf_pool.n_flush_LRU_ < ULINT_UNDEFINED); buf_pool.n_flush_LRU_++; @@ -831,7 +830,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 size_t orig_size; #endif - IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; + IORequest::Type type= evict ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; buf_tmp_buffer_t *slot= nullptr; if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ @@ -875,7 +874,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) { switch (space->chain.start->punch_hole) { case 1: - type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH; + type= evict ? IORequest::PUNCH_LRU : IORequest::PUNCH; break; case 2: size= orig_size; @@ -912,9 +911,10 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) /** Check whether a page can be flushed from the buf_pool. 
@param id page identifier @param fold id.fold() -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict true=buf_pool.LRU; false=buf_pool.flush_list @return whether the page can be flushed */ -static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, + bool evict) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(fold == id.fold()); @@ -926,9 +926,9 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) if (!bpage || buf_pool.watch_is_sentinel(*bpage)) return false; - /* We avoid flushing 'non-old' blocks in an LRU flush, because the + /* We avoid flushing 'non-old' blocks in an eviction flush, because the flushed blocks are soon freed */ - if (lru && !bpage->is_old()) + if (evict && !bpage->is_old()) return false; return bpage->oldest_modification() > 1 && bpage->ready_for_flush(); @@ -938,11 +938,11 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) @param space tablespace @param id page identifier of a dirty page @param contiguous whether to consider contiguous areas of pages -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict true=buf_pool.LRU; false=buf_pool.flush_list @return last page number that can be flushed */ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, page_id_t &id, bool contiguous, - bool lru) + bool evict) { ut_ad(id.page_no() < space.size + (space.physical_size() == 2048 ? 1 @@ -975,7 +975,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, for (page_id_t i= id - 1;; --i) { fold--; - if (!buf_flush_check_neighbor(i, fold, lru)) + if (!buf_flush_check_neighbor(i, fold, evict)) { low= i + 1; break; @@ -991,7 +991,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, while (++i < high) { ++fold; - if (!buf_flush_check_neighbor(i, fold, lru)) + if (!buf_flush_check_neighbor(i, fold, evict)) break; } @@ -1059,20 +1059,20 @@ and also write zeroes or punch the hole for the freed ranges of pages. 
@param space tablespace @param page_id page identifier @param contiguous whether to consider contiguous areas of pages -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param evict true=buf_pool.LRU; false=buf_pool.flush_list @param n_flushed number of pages flushed so far in this batch @param n_to_flush maximum number of pages we are allowed to flush @return number of pages flushed */ static ulint buf_flush_try_neighbors(fil_space_t *space, const page_id_t page_id, - bool contiguous, bool lru, + bool contiguous, bool evict, ulint n_flushed, ulint n_to_flush) { ut_ad(space->id == page_id.space()); ulint count= 0; page_id_t id= page_id; - page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru); + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); ut_ad(page_id >= id); ut_ad(page_id < high); @@ -1096,13 +1096,13 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, if (buf_page_t *bpage= buf_pool.page_hash.get(id, chain)) { ut_ad(bpage->in_file()); - /* We avoid flushing 'non-old' blocks in an LRU flush, + /* We avoid flushing 'non-old' blocks in an eviction flush, because the flushed blocks are soon freed */ - if (!lru || id == page_id || bpage->is_old()) + if (!evict || id == page_id || bpage->is_old()) { if (!buf_pool.watch_is_sentinel(*bpage) && bpage->oldest_modification() > 1 && bpage->ready_for_flush() && - bpage->flush(lru, space)) + bpage->flush(evict, space)) { ++count; continue; @@ -1128,12 +1128,8 @@ This utility moves the uncompressed frames of pages to the free list. Note that this function does not actually flush any data to disk. It just detaches the uncompressed frames from the compressed pages at the tail of the unzip_LRU and puts those freed frames in the free list. -Note that it is a best effort attempt and it is not guaranteed that -after a call to this function there will be 'max' blocks in the free -list. -@param[in] max desired number of blocks in the free_list @return number of blocks moved to the free list. */ -static ulint buf_free_from_unzip_LRU_list_batch(ulint max) +static ulint buf_free_from_unzip_LRU_list_batch() { ulint scanned = 0; ulint count = 0; @@ -1143,7 +1139,6 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max) buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); while (block - && count < max && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth && UT_LIST_GET_LEN(buf_pool.unzip_LRU) > UT_LIST_GET_LEN(buf_pool.LRU) / 10) { @@ -1214,10 +1209,13 @@ static void buf_flush_discard_page(buf_page_t *bpage) buf_LRU_free_page(bpage, true); } -/** Flush dirty blocks from the end of the LRU list. -@param max maximum number of blocks to make available in buf_pool.free -@param n counts of flushed and evicted pages */ -static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) +/** Flush dirty blocks from the end buf_pool.LRU, +and move clean blocks to buf_pool.free. +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_flush_LRU_list_batch(ulint max, bool evict, + flush_counters_t *n) { ulint scanned= 0; ulint free_limit= srv_LRU_scan_depth; @@ -1229,6 +1227,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 
0 : srv_flush_neighbors; fil_space_t *space= nullptr; + bool do_evict= evict; uint32_t last_space_id= FIL_NULL; static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); @@ -1236,8 +1235,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && - UT_LIST_GET_LEN(buf_pool.free) < free_limit && - n->flushed + n->evicted < max) || + UT_LIST_GET_LEN(buf_pool.free) < free_limit) || recv_recovery_is_on()); ++scanned) { buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); @@ -1257,8 +1255,8 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) } else if (state < buf_page_t::READ_FIX) { - /* Block is ready for flush. Dispatch an IO request. The IO - helper thread will put it on free list in IO completion routine. */ + /* Block is ready for flush. Dispatch an IO request. + If evict=true, the page will be evicted by buf_page_write_complete(). */ const page_id_t page_id(bpage->id()); const uint32_t space_id= page_id.space(); if (!space || space->id != space_id) @@ -1271,6 +1269,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) space->release(); auto p= buf_flush_space(space_id); space= p.first; + /* For the temporary tablespace, LRU flushing will always + evict pages upon completing the write. */ + do_evict= evict || space == fil_system.temp_space; last_space_id= space_id; mysql_mutex_lock(&buf_pool.mutex); if (p.second) @@ -1292,11 +1293,13 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) { mysql_mutex_unlock(&buf_pool.mutex); n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1, - true, n->flushed, max); + do_evict, n->flushed, max); reacquire_mutex: mysql_mutex_lock(&buf_pool.mutex); } - else if (bpage->flush(true, space)) + else if (n->flushed >= max && !recv_recovery_is_on()) + break; + else if (bpage->flush(do_evict, space)) { ++n->flushed; goto reacquire_mutex; @@ -1324,26 +1327,20 @@ reacquire_mutex: /** Flush and move pages from LRU or unzip_LRU list to the free list. Whether LRU or unzip_LRU is used depends on the state of the system. -@param max maximum number of blocks to make available in buf_pool.free -@return number of flushed pages */ -static ulint buf_do_LRU_batch(ulint max) +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n) { - const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU() - ? buf_free_from_unzip_LRU_list_batch(max) - : 0; - flush_counters_t n; - n.flushed= 0; - n.evicted= n_unzip_LRU_evicted; - buf_flush_LRU_list_batch(max, &n); - mysql_mutex_assert_owner(&buf_pool.mutex); - - if (const ulint evicted= n.evicted - n_unzip_LRU_evicted) - buf_lru_freed_page_count+= evicted; + if (buf_LRU_evict_from_unzip_LRU()) + buf_free_from_unzip_LRU_list_batch(); + n->evicted= 0; + n->flushed= 0; + buf_flush_LRU_list_batch(max, evict, n); - if (n.flushed) - buf_lru_flush_page_count+= n.flushed; - - return n.flushed; + mysql_mutex_assert_owner(&buf_pool.mutex); + buf_lru_freed_page_count+= n->evicted; + buf_lru_flush_page_count+= n->flushed; } /** This utility flushes dirty blocks from the end of the flush_list. 
@@ -1398,7 +1395,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) from buf_pool.flush_list must adjust the hazard pointer. Note: A concurrent execution of buf_flush_list_space() may - terminate this scan prematurely. The buf_pool.n_flush_list() + terminate this scan prematurely. The buf_pool.n_flush_list_ should prevent multiple threads from executing buf_do_flush_list_batch() concurrently, but buf_flush_list_space() is ignoring that. */ @@ -1505,46 +1502,53 @@ void buf_flush_wait_batch_end(bool lru) } /** Write out dirty blocks from buf_pool.flush_list. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. @param max_n wished maximum mumber of blocks flushed @param lsn buf_pool.get_oldest_modification(LSN_MAX) target @return the number of processed pages @retval 0 if a buf_pool.flush_list batch is already running */ -static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX) +static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) { ut_ad(lsn); + mysql_mutex_assert_owner(&buf_pool.mutex); - if (buf_pool.n_flush_list()) + if (buf_pool.n_flush_list_) return 0; - mysql_mutex_lock(&buf_pool.mutex); - const bool running= buf_pool.n_flush_list_ != 0; /* FIXME: we are performing a dirty read of buf_pool.flush_list.count while not holding buf_pool.flush_list_mutex */ - if (running || !UT_LIST_GET_LEN(buf_pool.flush_list)) + if (!UT_LIST_GET_LEN(buf_pool.flush_list)) { - if (!running) - pthread_cond_broadcast(&buf_pool.done_flush_list); - mysql_mutex_unlock(&buf_pool.mutex); + pthread_cond_broadcast(&buf_pool.done_flush_list); return 0; } buf_pool.n_flush_list_++; const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); - const ulint n_flushing= --buf_pool.n_flush_list_; - - buf_pool.try_LRU_scan= true; - - mysql_mutex_unlock(&buf_pool.mutex); - - if (!n_flushing) + if (!--buf_pool.n_flush_list_) pthread_cond_broadcast(&buf_pool.done_flush_list); - buf_dblwr.flush_buffered_writes(); - DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); return n_flushed; } +/** Write out dirty blocks from buf_pool.flush_list. +@param max_n wished maximum mumber of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) +{ + mysql_mutex_lock(&buf_pool.mutex); + ulint n= buf_flush_list_holding_mutex(max_n, lsn); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + return n; +} + /** Try to flush all the dirty pages that belong to a given tablespace. @param space tablespace @param n_flushed number of pages written @@ -1642,7 +1646,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) mysql_mutex_unlock(&buf_pool.flush_list_mutex); buf_pool.try_LRU_scan= true; - + pthread_cond_broadcast(&buf_pool.done_free); mysql_mutex_unlock(&buf_pool.mutex); if (acquired) @@ -1656,43 +1660,41 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) return may_have_skipped; } -/** Write out dirty blocks from buf_pool.LRU. +/** Write out dirty blocks from buf_pool.LRU, +and move clean blocks to buf_pool.free. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. 
@param max_n wished maximum mumber of blocks flushed -@return the number of processed pages +@param evict whether to evict pages after flushing +@return evict ? number of processed pages : number of pages written @retval 0 if a buf_pool.LRU batch is already running */ -ulint buf_flush_LRU(ulint max_n) +ulint buf_flush_LRU(ulint max_n, bool evict) { - if (buf_pool.n_flush_LRU()) - return 0; - - log_buffer_flush_to_disk(); + mysql_mutex_assert_owner(&buf_pool.mutex); - mysql_mutex_lock(&buf_pool.mutex); - if (buf_pool.n_flush_LRU_) + if (evict) { - mysql_mutex_unlock(&buf_pool.mutex); - return 0; + if (buf_pool.n_flush_LRU_) + return 0; + buf_pool.n_flush_LRU_= 1; } - buf_pool.n_flush_LRU_++; - ulint n_flushed= buf_do_LRU_batch(max_n); - - const ulint n_flushing= --buf_pool.n_flush_LRU_; - - buf_pool.try_LRU_scan= true; - - mysql_mutex_unlock(&buf_pool.mutex); + flush_counters_t n; + buf_do_LRU_batch(max_n, evict, &n); - if (!n_flushing) + if (n.evicted) { - pthread_cond_broadcast(&buf_pool.done_flush_LRU); + buf_pool.try_LRU_scan= true; pthread_cond_signal(&buf_pool.done_free); } - buf_dblwr.flush_buffered_writes(); + if (!evict) + return n.flushed; - DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed)); - return n_flushed; + if (!--buf_pool.n_flush_LRU_) + pthread_cond_broadcast(&buf_pool.done_flush_LRU); + + return n.evicted + n.flushed; } /** Initiate a log checkpoint, discarding the start of the log. @@ -1854,7 +1856,9 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) { mysql_mutex_unlock(&buf_pool.flush_list_mutex); ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); - buf_flush_wait_batch_end_acquiring_mutex(false); + mysql_mutex_lock(&buf_pool.mutex); + buf_flush_wait_batch_end(false); + mysql_mutex_unlock(&buf_pool.mutex); if (n_pages) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, @@ -1920,17 +1924,6 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) } } -/** Wait for pending flushes to complete. */ -void buf_flush_wait_batch_end_acquiring_mutex(bool lru) -{ - if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list()) - { - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(lru); - mysql_mutex_unlock(&buf_pool.mutex); - } -} - /** Conduct checkpoint-related flushing for innodb_flush_sync=ON, and try to initiate checkpoints until the target is met. @param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */ @@ -2042,8 +2035,9 @@ af_get_pct_for_lsn( / 7.5)); } -/** This function is called approximately once every second by the -page_cleaner thread if innodb_adaptive_flushing=ON. +/** This function is called approximately once every second by +buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0 +and innodb_adaptive_flushing=ON. Based on various factors it decides if there is a need to do flushing. @return number of pages recommended to be flushed @param last_pages_in number of pages flushed in previous batch @@ -2081,52 +2075,43 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, n_pages= std::min(srv_io_capacity, dirty_blocks); } +func_exit: + page_cleaner.flush_pass++; return n_pages; } sum_pages += last_pages_in; - double time_elapsed = difftime(curr_time, prev_time); + const ulint time_elapsed = std::max(curr_time - prev_time, 1); - /* We update our variables every srv_flushing_avg_loops + /* We update our variables every innodb_flushing_avg_loops iterations to smooth out transition in workload. 
*/ if (++n_iterations >= srv_flushing_avg_loops - || time_elapsed >= static_cast(srv_flushing_avg_loops)) { + || time_elapsed >= srv_flushing_avg_loops) { - if (time_elapsed < 1) { - time_elapsed = 1; - } - - avg_page_rate = static_cast( - ((static_cast(sum_pages) - / time_elapsed) - + static_cast(avg_page_rate)) / 2); + avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2; /* How much LSN we have generated since last call. */ - lsn_rate = static_cast( - static_cast(cur_lsn - prev_lsn) - / time_elapsed); + lsn_rate = (cur_lsn - prev_lsn) / time_elapsed; lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; - ulint flush_tm = page_cleaner.flush_time; - ulint flush_pass = page_cleaner.flush_pass; - - page_cleaner.flush_time = 0; - page_cleaner.flush_pass = 0; - - if (flush_pass) { - flush_tm /= flush_pass; + if (page_cleaner.flush_pass) { + page_cleaner.flush_time /= page_cleaner.flush_pass; } - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm); - MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass); - prev_lsn = cur_lsn; prev_time = curr_time; - n_iterations = 0; + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + page_cleaner.flush_time); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + page_cleaner.flush_pass); + page_cleaner.flush_time = 0; + page_cleaner.flush_pass = 0; + + n_iterations = 0; sum_pages = 0; } @@ -2176,7 +2161,7 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); - return(n_pages); + goto func_exit; } /** page_cleaner thread tasked with flushing dirty pages from the buffer @@ -2205,7 +2190,7 @@ static void buf_flush_page_cleaner() if (UNIV_UNLIKELY(lsn_limit != 0)) { -furious_flush: + furious_flush: if (UNIV_LIKELY(srv_flush_sync)) { buf_flush_sync_for_checkpoint(lsn_limit); @@ -2223,7 +2208,8 @@ furious_flush: if (buf_pool.page_cleaner_idle() && (!UT_LIST_GET_LEN(buf_pool.flush_list) || srv_max_dirty_pages_pct_lwm == 0.0)) - my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex); + my_cond_wait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex); else my_cond_timedwait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex, &abstime); @@ -2251,19 +2237,25 @@ furious_flush: /* wake up buf_flush_wait() */ pthread_cond_broadcast(&buf_pool.done_flush_list); } -unemployed: + unemployed: buf_flush_async_lsn= 0; + set_idle: buf_pool.page_cleaner_set_idle(true); - - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + end_of_batch: + buf_dblwr.flush_buffered_writes(); + + do + { + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); - if (!recv_recovery_is_on() && - !srv_startup_is_before_trx_rollback_phase && - srv_operation == SRV_OPERATION_NORMAL) - log_checkpoint(); + if (!recv_recovery_is_on() && + !srv_startup_is_before_trx_rollback_phase && + srv_operation == SRV_OPERATION_NORMAL) + log_checkpoint(); + } + while (false); mysql_mutex_lock(&buf_pool.flush_list_mutex); continue; @@ -2314,56 +2306,40 @@ unemployed: if (!lsn_limit) lsn_limit= soft_lsn_limit; - ulint n_flushed; + ulint n_flushed= 0, n; if (UNIV_UNLIKELY(lsn_limit != 0)) { - n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit); - /* wake up buf_flush_wait() */ - pthread_cond_broadcast(&buf_pool.done_flush_list); - goto try_checkpoint; + n= 
srv_max_io_capacity; + goto background_flush; } else if (idle_flush || !srv_adaptive_flushing) { - n_flushed= buf_flush_list(srv_io_capacity); -try_checkpoint: - if (n_flushed) - { - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, - MONITOR_FLUSH_BACKGROUND_COUNT, - MONITOR_FLUSH_BACKGROUND_PAGES, - n_flushed); -do_checkpoint: - /* The periodic log_checkpoint() call here makes it harder to - reproduce bugs in crash recovery or mariabackup --prepare, or - in code that writes the redo log records. Omitting the call - here should not affect correctness, because log_free_check() - should still be invoking checkpoints when needed. */ - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto next;); - - if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL) - log_checkpoint(); - } + n= srv_io_capacity; + lsn_limit= LSN_MAX; + background_flush: + mysql_mutex_lock(&buf_pool.mutex); + n_flushed= buf_flush_list_holding_mutex(n, lsn_limit); + /* wake up buf_flush_wait() */ + pthread_cond_broadcast(&buf_pool.done_flush_list); + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); } - else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages, - oldest_lsn, - dirty_blocks, - dirty_pct)) + else if ((n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + dirty_blocks, + dirty_pct)) != 0) { - page_cleaner.flush_pass++; const ulint tm= ut_time_ms(); - last_pages= n_flushed= buf_flush_list(n); + mysql_mutex_lock(&buf_pool.mutex); + last_pages= n_flushed= buf_flush_list_holding_mutex(n); page_cleaner.flush_time+= ut_time_ms() - tm; - - if (n_flushed) - { - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, - MONITOR_FLUSH_ADAPTIVE_COUNT, - MONITOR_FLUSH_ADAPTIVE_PAGES, - n_flushed); - goto do_checkpoint; - } + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); } else if (buf_flush_async_lsn <= oldest_lsn) { @@ -2371,24 +2347,29 @@ do_checkpoint: goto unemployed; } -#ifndef DBUG_OFF -next: -#endif /* !DBUG_OFF */ - mysql_mutex_lock(&buf_pool.flush_list_mutex); + n= buf_flush_LRU(n >= n_flushed ? n - n_flushed : 0, false); + mysql_mutex_unlock(&buf_pool.mutex); + last_pages+= n; + + if (!idle_flush) + goto end_of_batch; /* when idle flushing kicks in page_cleaner is marked active. reset it back to idle since the it was made active as part of idle flushing stage. 
*/ - if (idle_flush) - buf_pool.page_cleaner_set_idle(true); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + goto set_idle; } mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (srv_fast_shutdown != 2) { - buf_flush_wait_batch_end_acquiring_mutex(true); - buf_flush_wait_batch_end_acquiring_mutex(false); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.mutex); + buf_flush_wait_batch_end(true); + buf_flush_wait_batch_end(false); + mysql_mutex_unlock(&buf_pool.mutex); } mysql_mutex_lock(&buf_pool.flush_list_mutex); @@ -2444,20 +2425,23 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool() while (buf_pool.get_oldest_modification(0)) { mysql_mutex_unlock(&buf_pool.flush_list_mutex); - buf_flush_list(srv_max_io_capacity); - if (buf_pool.n_flush_list()) + mysql_mutex_lock(&buf_pool.mutex); + buf_flush_list_holding_mutex(srv_max_io_capacity); + if (buf_pool.n_flush_list_) { + mysql_mutex_unlock(&buf_pool.mutex); timespec abstime; service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "Waiting to flush " ULINTPF " pages", buf_flush_list_length()); set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); + buf_dblwr.flush_buffered_writes(); mysql_mutex_lock(&buf_pool.mutex); while (buf_pool.n_flush_list_) my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, &abstime); - mysql_mutex_unlock(&buf_pool.mutex); } + mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex); } diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 2c1f717fb9f..9fa6492d525 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -402,6 +402,7 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex) && recv_sys.apply_log_recs) { goto flush_lru; }); +get_mutex: mysql_mutex_lock(&buf_pool.mutex); got_mutex: buf_LRU_check_size_of_non_data_objects(); @@ -490,15 +491,18 @@ not_found: #ifndef DBUG_OFF flush_lru: #endif - if (!buf_flush_LRU(innodb_lru_flush_size)) { + mysql_mutex_lock(&buf_pool.mutex); + + if (!buf_flush_LRU(innodb_lru_flush_size, true)) { MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); ++flush_failures; } n_iterations++; - mysql_mutex_lock(&buf_pool.mutex); buf_pool.stat.LRU_waits++; - goto got_mutex; + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + goto get_mutex; } /** Move the LRU_old pointer so that the length of the old blocks list @@ -807,50 +811,57 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) /* We cannot use transactional_lock_guard here, because buf_buddy_relocate() in buf_buddy_free() could get stuck. */ hash_lock.lock(); - lsn_t oldest_modification = bpage->oldest_modification_acquire(); + const lsn_t oldest_modification = bpage->oldest_modification_acquire(); if (UNIV_UNLIKELY(!bpage->can_relocate())) { /* Do not free buffer fixed and I/O-fixed blocks. 
*/ goto func_exit; } - if (oldest_modification == 1) { + switch (oldest_modification) { + case 2: + ut_ad(id.space() == SRV_TMP_SPACE_ID); + ut_ad(!bpage->zip.data); + if (!bpage->is_freed()) { + goto func_exit; + } + bpage->clear_oldest_modification(); + break; + case 1: mysql_mutex_lock(&buf_pool.flush_list_mutex); - oldest_modification = bpage->oldest_modification(); - if (oldest_modification) { - ut_ad(oldest_modification == 1); + if (const lsn_t om = bpage->oldest_modification()) { + ut_ad(om == 1); buf_pool.delete_from_flush_list(bpage); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); ut_ad(!bpage->oldest_modification()); - oldest_modification = 0; - } - - if (zip || !bpage->zip.data) { - /* This would completely free the block. */ - /* Do not completely free dirty blocks. */ - - if (oldest_modification) { - goto func_exit; + /* fall through */ + case 0: + if (zip || !bpage->zip.data || !bpage->frame) { + break; } - } else if (oldest_modification && !bpage->frame) { -func_exit: - hash_lock.unlock(); - return(false); - - } else if (bpage->frame) { +relocate_compressed: b = static_cast(ut_zalloc_nokey(sizeof *b)); ut_a(b); mysql_mutex_lock(&buf_pool.flush_list_mutex); new (b) buf_page_t(*bpage); b->frame = nullptr; b->set_state(buf_page_t::UNFIXED + 1); + break; + default: + if (zip || !bpage->zip.data || !bpage->frame) { + /* This would completely free the block. */ + /* Do not completely free dirty blocks. */ +func_exit: + hash_lock.unlock(); + return(false); + } + goto relocate_compressed; } mysql_mutex_assert_owner(&buf_pool.mutex); - DBUG_PRINT("ib_buf", ("free page %u:%u", - id.space(), id.page_no())); + DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no())); ut_ad(bpage->can_relocate()); diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 2dccdda8a2f..e79cbdadcd6 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -724,13 +724,14 @@ public: ut_ad(s < REINIT); } - void read_unfix(uint32_t s) + uint32_t read_unfix(uint32_t s) { ut_ad(lock.is_write_locked()); ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1); - ut_d(auto old_state=) zip.fix.fetch_add(s - READ_FIX); + uint32_t old_state= zip.fix.fetch_add(s - READ_FIX); ut_ad(old_state >= READ_FIX); ut_ad(old_state < WRITE_FIX); + return old_state + (s - READ_FIX); } void set_freed(uint32_t prev_state, uint32_t count= 0) @@ -782,10 +783,10 @@ public: inline void write_complete(bool temporary); /** Write a flushable page to a file. buf_pool.mutex must be held. - @param lru true=buf_pool.LRU; false=buf_pool.flush_list + @param evict whether to evict the page on write completion @param space tablespace @return whether the page was flushed and buf_pool.mutex was released */ - inline bool flush(bool lru, fil_space_t *space); + inline bool flush(bool evict, fil_space_t *space); /** Notify that a page in a temporary tablespace has been modified. 
*/ void set_temp_modified() @@ -1546,9 +1547,6 @@ public: /** broadcast when n_flush_list reaches 0; protected by mutex */ pthread_cond_t done_flush_list; - TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; } - TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; } - /** @name General fields */ /* @{ */ ulint curr_pool_size; /*!< Current pool size in bytes */ @@ -1755,7 +1753,7 @@ public: last_activity_count= activity_count; } - // n_flush_LRU() + n_flush_list() + // n_flush_LRU_ + n_flush_list_ // is approximately COUNT(is_write_fixed()) in flush_list unsigned freed_page_clock;/*!< a sequence number used @@ -1785,7 +1783,8 @@ public: UT_LIST_BASE_NODE_T(buf_page_t) free; /*!< base node of the free block list */ - /** signaled each time when the free list grows; protected by mutex */ + /** signaled each time when the free list grows and + broadcast each time try_LRU_scan is set; protected by mutex */ pthread_cond_t done_free; UT_LIST_BASE_NODE_T(buf_page_t) withdraw; @@ -1851,9 +1850,9 @@ public: return any_pending; } /** @return total amount of pending I/O */ - ulint io_pending() const + TPOOL_SUPPRESS_TSAN ulint io_pending() const { - return n_pend_reads + n_flush_LRU() + n_flush_list(); + return n_pend_reads + n_flush_LRU_ + n_flush_list_; } private: @@ -1886,34 +1885,12 @@ private: /** array of slots */ buf_tmp_buffer_t *slots; - void create(ulint n_slots) - { - this->n_slots= n_slots; - slots= static_cast - (ut_malloc_nokey(n_slots * sizeof *slots)); - memset((void*) slots, 0, n_slots * sizeof *slots); - } + void create(ulint n_slots); - void close() - { - for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) - { - aligned_free(s->crypt_buf); - aligned_free(s->comp_buf); - } - ut_free(slots); - slots= nullptr; - n_slots= 0; - } + void close(); /** Reserve a buffer */ - buf_tmp_buffer_t *reserve() - { - for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) - if (s->acquire()) - return s; - return nullptr; - } + buf_tmp_buffer_t *reserve(); } io_buf; /** whether resize() is in the critical path */ @@ -2002,7 +1979,10 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn) /** Clear oldest_modification after removing from buf_pool.flush_list */ inline void buf_page_t::clear_oldest_modification() { - mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); +#ifdef SAFE_MUTEX + if (oldest_modification() != 2) + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); +#endif /* SAFE_MUTEX */ ut_d(const auto s= state()); ut_ad(s >= REMOVE_HASH); ut_ad(oldest_modification()); diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index f615b856126..d71a05c0ec9 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -86,11 +86,15 @@ buf_flush_init_for_writing( bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr) MY_ATTRIBUTE((warn_unused_result)); -/** Write out dirty blocks from buf_pool.LRU. +/** Write out dirty blocks from buf_pool.LRU, +and move clean blocks to buf_pool.free. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. @param max_n wished maximum mumber of blocks flushed -@return the number of processed pages +@param evict whether to evict pages after flushing +@return evict ? 
number of processed pages : number of pages written @retval 0 if a buf_pool.LRU batch is already running */ -ulint buf_flush_LRU(ulint max_n); +ulint buf_flush_LRU(ulint max_n, bool evict); /** Wait until a flush batch ends. @param lru true=buf_pool.LRU; false=buf_pool.flush_list */ @@ -131,9 +135,6 @@ inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end) /** Initialize page_cleaner. */ ATTRIBUTE_COLD void buf_flush_page_cleaner_init(); -/** Wait for pending flushes to complete. */ -void buf_flush_wait_batch_end_acquiring_mutex(bool lru); - /** Flush the buffer pool on shutdown. */ ATTRIBUTE_COLD void buf_flush_buffer_pool(); -- cgit v1.2.1 From a55b951e6082a4ce9a1f2ed5ee176ea7dbbaf1f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 16 Mar 2023 17:19:58 +0200 Subject: MDEV-26827 Make page flushing even faster For more convenient monitoring of something that could greatly affect the volume of page writes, we add the status variable Innodb_buffer_pool_pages_split that was previously only available via information_schema.innodb_metrics as "innodb_page_splits". This was suggested by Axel Schwenke. buf_flush_page_count: Replaced with buf_pool.stat.n_pages_written. We protect buf_pool.stat (except n_page_gets) with buf_pool.mutex and remove unnecessary export_vars indirection. buf_pool.flush_list_bytes: Moved from buf_pool.stat.flush_list_bytes. Protected by buf_pool.flush_list_mutex. buf_pool_t::page_cleaner_status: Replaces buf_pool_t::n_flush_LRU_, buf_pool_t::n_flush_list_, and buf_pool_t::page_cleaner_is_idle. Protected by buf_pool.flush_list_mutex. We will exclusively broadcast buf_pool.done_flush_list by the buf_flush_page_cleaner thread, and only wait for it when communicating with buf_flush_page_cleaner. There is no need to keep a count of pending writes by the buf_pool.flush_list processing. A single flag suffices for that. Waits for page write completion can be performed by simply waiting on block->page.lock, or by invoking buf_dblwr.wait_for_page_writes(). buf_LRU_block_free_non_file_page(): Broadcast buf_pool.done_free and set buf_pool.try_LRU_scan when freeing a page. This would be executed also as part of buf_page_write_complete(). buf_page_write_complete(): Do not broadcast buf_pool.done_flush_list, and do not acquire buf_pool.mutex unless buf_pool.LRU eviction is needed. Let buf_dblwr count all writes to persistent pages and broadcast a condition variable when no outstanding writes remain. buf_flush_page_cleaner(): Prioritize LRU flushing and eviction right after "furious flushing" (lsn_limit). Simplify the conditions and reduce the hold time of buf_pool.flush_list_mutex. Refuse to shut down or sleep if buf_pool.ran_out(), that is, LRU eviction is needed. buf_pool_t::page_cleaner_wakeup(): Add the optional parameter for_LRU. buf_LRU_get_free_block(): Protect buf_lru_free_blocks_error_printed with buf_pool.mutex. Invoke buf_pool.page_cleaner_wakeup(true) to to ensure that buf_flush_page_cleaner() will process the LRU flush request. buf_do_LRU_batch(), buf_flush_list(), buf_flush_list_space(): Update buf_pool.stat.n_pages_written when submitting writes (while holding buf_pool.mutex), not when completing them. buf_page_t::flush(), buf_flush_discard_page(): Require that the page U-latch be acquired upfront, and remove buf_page_t::ready_for_flush(). buf_pool_t::delete_from_flush_list(): Remove the parameter "bool clear". buf_flush_page(): Count pending page writes via buf_dblwr. 
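As an aside, the wait for outstanding page writes builds directly on the writes_pending counter and the write_cond condition variable that the buf0dblwr.cc hunks below maintain: writes_pending is incremented when a data page write is submitted and decremented in write_completed(), which broadcasts write_cond once the count drops to zero. A plausible sketch of the wait, given as an assumption because the actual body of buf_dblwr_t::wait_for_page_writes() is not part of the quoted hunks:

    /* Sketch only: block until buf_dblwr has no outstanding writes of
    persistent data pages. Both the counter and the condition variable
    are protected by buf_dblwr.mutex. */
    void buf_dblwr_t::wait_for_page_writes()
    {
      mysql_mutex_lock(&mutex);
      while (writes_pending)
        my_cond_wait(&write_cond, &mutex.m_mutex);
      mysql_mutex_unlock(&mutex);
    }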
buf_flush_try_neighbors(): Take the block of page_id as a parameter. If the tablespace is dropped before our page has been written out, release the page U-latch. buf_pool_invalidate(): Let the caller ensure that there are no outstanding writes. buf_flush_wait_batch_end(false), buf_flush_wait_batch_end_acquiring_mutex(false): Replaced with buf_dblwr.wait_for_page_writes(). buf_flush_wait_LRU_batch_end(): Replaces buf_flush_wait_batch_end(true). buf_flush_list(): Remove some broadcast of buf_pool.done_flush_list. buf_flush_buffer_pool(): Invoke also buf_dblwr.wait_for_page_writes(). buf_pool_t::io_pending(), buf_pool_t::n_flush_list(): Remove. Outstanding writes are reflected by buf_dblwr.pending_writes(). buf_dblwr_t::init(): New function, to initialize the mutex and the condition variables, but not the backing store. buf_dblwr_t::is_created(): Replaces buf_dblwr_t::is_initialised(). buf_dblwr_t::pending_writes(), buf_dblwr_t::writes_pending: Keeps track of writes of persistent data pages. buf_flush_LRU(): Allow calls while LRU flushing may be in progress in another thread. Tested by Matthias Leich (correctness) and Axel Schwenke (performance) --- .../innodb/r/innodb_skip_innodb_is_tables.result | 2 +- .../suite/innodb/r/innodb_status_variables.result | 1 + storage/innobase/btr/btr0btr.cc | 4 +- storage/innobase/buf/buf0buf.cc | 51 +- storage/innobase/buf/buf0dblwr.cc | 66 +- storage/innobase/buf/buf0flu.cc | 731 +++++++++++---------- storage/innobase/buf/buf0lru.cc | 42 +- storage/innobase/buf/buf0rea.cc | 105 ++- storage/innobase/gis/gis0rtree.cc | 2 - storage/innobase/handler/ha_innodb.cc | 34 +- storage/innobase/include/buf0buf.h | 159 +++-- storage/innobase/include/buf0dblwr.h | 69 +- storage/innobase/include/buf0flu.h | 9 +- storage/innobase/include/buf0rea.h | 9 +- storage/innobase/include/fil0fil.h | 2 +- storage/innobase/include/srv0srv.h | 16 - storage/innobase/log/log0log.cc | 8 - storage/innobase/srv/srv0mon.cc | 12 +- storage/innobase/srv/srv0srv.cc | 48 +- storage/innobase/srv/srv0start.cc | 2 +- .../rocksdb/r/innodb_i_s_tables_disabled.result | 2 +- 21 files changed, 705 insertions(+), 669 deletions(-) diff --git a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result index 19b426009f2..9bdb546482e 100644 --- a/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result +++ b/mysql-test/suite/innodb/r/innodb_skip_innodb_is_tables.result @@ -199,7 +199,7 @@ compress_pages_page_decompressed compression 0 NULL NULL NULL 0 NULL NULL NULL N compress_pages_page_compression_error compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of page compression errors compress_pages_encrypted compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages encrypted compress_pages_decrypted compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages decrypted -index_page_splits index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of index page splits +index_page_splits index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of index page splits index_page_merge_attempts index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of index page merge attempts index_page_merge_successful index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of successful index page merges index_page_reorg_attempts index 0 NULL NULL NULL 0 NULL NULL NULL NULL 
NULL NULL NULL 0 counter Number of index page reorganization attempts diff --git a/mysql-test/suite/innodb/r/innodb_status_variables.result b/mysql-test/suite/innodb/r/innodb_status_variables.result index a729dd0a8d4..5b8ca678795 100644 --- a/mysql-test/suite/innodb/r/innodb_status_variables.result +++ b/mysql-test/suite/innodb/r/innodb_status_variables.result @@ -23,6 +23,7 @@ INNODB_BUFFER_POOL_PAGES_OLD INNODB_BUFFER_POOL_PAGES_TOTAL INNODB_BUFFER_POOL_PAGES_LRU_FLUSHED INNODB_BUFFER_POOL_PAGES_LRU_FREED +INNODB_BUFFER_POOL_PAGES_SPLIT INNODB_BUFFER_POOL_READ_AHEAD_RND INNODB_BUFFER_POOL_READ_AHEAD INNODB_BUFFER_POOL_READ_AHEAD_EVICTED diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 1b69f4c7170..e54c2a101b8 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -2975,6 +2975,8 @@ btr_page_split_and_insert( ut_ad(*err == DB_SUCCESS); ut_ad(dtuple_check_typed(tuple)); + buf_pool.pages_split++; + if (cursor->index()->is_spatial()) { /* Split rtree page and update parent */ return rtr_page_split_and_insert(flags, cursor, offsets, heap, @@ -3371,8 +3373,6 @@ func_exit: left_block, right_block, mtr); } - MONITOR_INC(MONITOR_INDEX_SPLIT); - ut_ad(page_validate(buf_block_get_frame(left_block), page_cursor->index)); ut_ad(page_validate(buf_block_get_frame(right_block), diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 510872c142e..106569f74b2 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1401,8 +1401,10 @@ inline bool buf_pool_t::withdraw_blocks() true); mysql_mutex_unlock(&buf_pool.mutex); buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(true); } /* relocate blocks/buddies in withdrawn area */ @@ -2265,13 +2267,15 @@ lookup: return bpage; must_read_page: - if (dberr_t err= buf_read_page(page_id, zip_size)) - { + switch (dberr_t err= buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + goto lookup; + default: ib::error() << "Reading compressed page " << page_id << " failed with error: " << err; return nullptr; } - goto lookup; } /********************************************************************//** @@ -2511,20 +2515,23 @@ loop: corrupted, or if an encrypted page with a valid checksum cannot be decypted. 
*/ - if (dberr_t local_err = buf_read_page(page_id, zip_size)) { - if (local_err != DB_CORRUPTION - && mode != BUF_GET_POSSIBLY_FREED + switch (dberr_t local_err = buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + break; + default: + if (mode != BUF_GET_POSSIBLY_FREED && retries++ < BUF_PAGE_READ_MAX_RETRIES) { DBUG_EXECUTE_IF("intermittent_read_failure", retries = BUF_PAGE_READ_MAX_RETRIES;); - } else { - if (err) { - *err = local_err; - } - return nullptr; } - } else { - buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + /* fall through */ + case DB_PAGE_CORRUPTED: + if (err) { + *err = local_err; + } + return nullptr; } ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); @@ -3279,12 +3286,12 @@ retry: buf_unzip_LRU_add_block(reinterpret_cast(bpage), FALSE); } + buf_pool.stat.n_pages_created++; mysql_mutex_unlock(&buf_pool.mutex); mtr->memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); bpage->set_accessed(); - buf_pool.stat.n_pages_created++; /* Delete possible entries for the page from the insert buffer: such can exist if the page belonged to an index which was dropped */ @@ -3534,7 +3541,6 @@ dberr_t buf_page_t::read_complete(const fil_node_t &node) ut_d(auto n=) buf_pool.n_pend_reads--; ut_ad(n > 0); - buf_pool.stat.n_pages_read++; const byte *read_frame= zip.data ? zip.data : frame; ut_ad(read_frame); @@ -3686,9 +3692,6 @@ void buf_pool_invalidate() { mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(true); - buf_flush_wait_batch_end(false); - /* It is possible that a write batch that has been posted earlier is still not complete. For buffer pool invalidation to proceed we must ensure there is NO write activity happening. */ @@ -3839,8 +3842,8 @@ void buf_pool_t::print() << UT_LIST_GET_LEN(flush_list) << ", n pending decompressions=" << n_pend_unzip << ", n pending reads=" << n_pend_reads - << ", n pending flush LRU=" << n_flush_LRU_ - << " list=" << n_flush_list_ + << ", n pending flush LRU=" << n_flush() + << " list=" << buf_dblwr.pending_writes() << ", pages made young=" << stat.n_pages_made_young << ", not young=" << stat.n_pages_not_made_young << ", pages read=" << stat.n_pages_read @@ -3952,13 +3955,13 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); pool_info->n_pend_reads = buf_pool.n_pend_reads; - pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_; + pool_info->n_pending_flush_lru = buf_pool.n_flush(); - pool_info->n_pending_flush_list = buf_pool.n_flush_list_; + pool_info->n_pending_flush_list = buf_dblwr.pending_writes(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index c71fd8df068..72b1ba5ca2b 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -46,7 +46,17 @@ inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr) 0, RW_X_LATCH, mtr); } -/** Initialize the doublewrite buffer data structure. +void buf_dblwr_t::init() +{ + if (!active_slot) + { + active_slot= &slots[0]; + mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); + pthread_cond_init(&cond, nullptr); + } +} + +/** Initialise the persistent storage of the doublewrite buffer. 
@param header doublewrite page header in the TRX_SYS page */ inline void buf_dblwr_t::init(const byte *header) { @@ -54,8 +64,6 @@ inline void buf_dblwr_t::init(const byte *header) ut_ad(!active_slot->reserved); ut_ad(!batch_running); - mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); - pthread_cond_init(&cond, nullptr); block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1)); block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2)); @@ -74,7 +82,7 @@ inline void buf_dblwr_t::init(const byte *header) @return whether the operation succeeded */ bool buf_dblwr_t::create() { - if (is_initialised()) + if (is_created()) return true; mtr_t mtr; @@ -343,7 +351,7 @@ func_exit: void buf_dblwr_t::recover() { ut_ad(recv_sys.parse_start_lsn); - if (!is_initialised()) + if (!is_created()) return; uint32_t page_no_dblwr= 0; @@ -452,10 +460,9 @@ next_page: /** Free the doublewrite buffer. */ void buf_dblwr_t::close() { - if (!is_initialised()) + if (!active_slot) return; - /* Free the double write data structures. */ ut_ad(!active_slot->reserved); ut_ad(!active_slot->first_free); ut_ad(!batch_running); @@ -469,35 +476,41 @@ void buf_dblwr_t::close() mysql_mutex_destroy(&mutex); memset((void*) this, 0, sizeof *this); - active_slot= &slots[0]; } /** Update the doublewrite buffer on write completion. */ -void buf_dblwr_t::write_completed() +void buf_dblwr_t::write_completed(bool with_doublewrite) { ut_ad(this == &buf_dblwr); - ut_ad(srv_use_doublewrite_buf); - ut_ad(is_initialised()); ut_ad(!srv_read_only_mode); mysql_mutex_lock(&mutex); - ut_ad(batch_running); - slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; - ut_ad(flush_slot->reserved); - ut_ad(flush_slot->reserved <= flush_slot->first_free); + ut_ad(writes_pending); + if (!--writes_pending) + pthread_cond_broadcast(&write_cond); - if (!--flush_slot->reserved) + if (with_doublewrite) { - mysql_mutex_unlock(&mutex); - /* This will finish the batch. Sync data files to the disk. */ - fil_flush_file_spaces(); - mysql_mutex_lock(&mutex); + ut_ad(is_created()); + ut_ad(srv_use_doublewrite_buf); + ut_ad(batch_running); + slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved); + ut_ad(flush_slot->reserved <= flush_slot->first_free); + + if (!--flush_slot->reserved) + { + mysql_mutex_unlock(&mutex); + /* This will finish the batch. Sync data files to the disk. */ + fil_flush_file_spaces(); + mysql_mutex_lock(&mutex); - /* We can now reuse the doublewrite memory buffer: */ - flush_slot->first_free= 0; - batch_running= false; - pthread_cond_broadcast(&cond); + /* We can now reuse the doublewrite memory buffer: */ + flush_slot->first_free= 0; + batch_running= false; + pthread_cond_broadcast(&cond); + } } mysql_mutex_unlock(&mutex); @@ -642,7 +655,7 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) { ut_ad(this == &buf_dblwr); ut_ad(srv_use_doublewrite_buf); - ut_ad(is_initialised()); + ut_ad(is_created()); ut_ad(!srv_read_only_mode); ut_ad(!request.bpage); ut_ad(request.node == fil_system.sys_space->chain.start); @@ -708,7 +721,7 @@ posted, and also when we may have to wait for a page latch! Otherwise a deadlock of threads can occur. 
*/ void buf_dblwr_t::flush_buffered_writes() { - if (!is_initialised() || !srv_use_doublewrite_buf) + if (!is_created() || !srv_use_doublewrite_buf) { fil_flush_file_spaces(); return; @@ -741,6 +754,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) const ulint buf_size= 2 * block_size(); mysql_mutex_lock(&mutex); + writes_pending++; for (;;) { diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 70e1595e00e..326636e0c4d 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -47,15 +47,12 @@ Created 11/11/1995 Heikki Tuuri #endif /** Number of pages flushed via LRU. Protected by buf_pool.mutex. -Also included in buf_flush_page_count. */ +Also included in buf_pool.stat.n_pages_written. */ ulint buf_lru_flush_page_count; /** Number of pages freed without flushing. Protected by buf_pool.mutex. */ ulint buf_lru_freed_page_count; -/** Number of pages flushed. Protected by buf_pool.mutex. */ -ulint buf_flush_page_count; - /** Flag indicating if the page_cleaner is in active state. */ Atomic_relaxed buf_page_cleaner_is_active; @@ -115,8 +112,7 @@ static void buf_flush_validate_skip() } #endif /* UNIV_DEBUG */ -/** Wake up the page cleaner if needed */ -void buf_pool_t::page_cleaner_wakeup() +void buf_pool_t::page_cleaner_wakeup(bool for_LRU) { if (!page_cleaner_idle()) return; @@ -149,11 +145,12 @@ void buf_pool_t::page_cleaner_wakeup() - by allowing last_activity_count to updated when page-cleaner is made active and has work to do. This ensures that the last_activity signal is consumed by the page-cleaner before the next one is generated. */ - if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) || - (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) || + if (for_LRU || + (pct_lwm != 0.0 && (pct_lwm <= dirty_pct || + last_activity_count == srv_get_activity_count())) || srv_max_buf_pool_modified_pct <= dirty_pct) { - page_cleaner_is_idle= false; + page_cleaner_status-= PAGE_CLEANER_IDLE; pthread_cond_signal(&do_flush_list); } } @@ -183,8 +180,8 @@ void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn) delete_from_flush_list_low(&block->page); } else - stat.flush_list_bytes+= block->physical_size(); - ut_ad(stat.flush_list_bytes <= curr_pool_size); + flush_list_bytes+= block->physical_size(); + ut_ad(flush_list_bytes <= curr_pool_size); block->page.set_oldest_modification(lsn); MEM_CHECK_DEFINED(block->page.zip.data @@ -197,14 +194,12 @@ void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn) } /** Remove a block from flush_list. 
-@param bpage buffer pool page -@param clear whether to invoke buf_page_t::clear_oldest_modification() */ -void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear) +@param bpage buffer pool page */ +void buf_pool_t::delete_from_flush_list(buf_page_t *bpage) { delete_from_flush_list_low(bpage); - stat.flush_list_bytes-= bpage->physical_size(); - if (clear) - bpage->clear_oldest_modification(); + flush_list_bytes-= bpage->physical_size(); + bpage->clear_oldest_modification(); #ifdef UNIV_DEBUG buf_flush_validate_skip(); #endif /* UNIV_DEBUG */ @@ -219,10 +214,10 @@ void buf_flush_remove_pages(ulint id) { const page_id_t first(id, 0), end(id + 1, 0); ut_ad(id); - mysql_mutex_lock(&buf_pool.mutex); for (;;) { + mysql_mutex_lock(&buf_pool.mutex); bool deferred= false; mysql_mutex_lock(&buf_pool.flush_list_mutex); @@ -245,18 +240,14 @@ void buf_flush_remove_pages(ulint id) bpage= prev; } + mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (!deferred) break; - mysql_mutex_unlock(&buf_pool.mutex); - std::this_thread::yield(); - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(false); + buf_dblwr.wait_for_page_writes(); } - - mysql_mutex_unlock(&buf_pool.mutex); } /*******************************************************************//** @@ -301,7 +292,7 @@ buf_flush_relocate_on_flush_list( bpage->clear_oldest_modification(); if (lsn == 1) { - buf_pool.stat.flush_list_bytes -= dpage->physical_size(); + buf_pool.flush_list_bytes -= dpage->physical_size(); dpage->list.prev = nullptr; dpage->list.next = nullptr; dpage->clear_oldest_modification(); @@ -341,6 +332,21 @@ inline void buf_page_t::write_complete(bool temporary) lock.u_unlock(true); } +inline void buf_pool_t::n_flush_inc() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + page_cleaner_status+= LRU_FLUSH; +} + +inline void buf_pool_t::n_flush_dec() +{ + mysql_mutex_lock(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH) + pthread_cond_broadcast(&done_flush_LRU); + mysql_mutex_unlock(&flush_list_mutex); +} + /** Complete write of a file page from buf_pool. @param request write request */ void buf_page_write_complete(const IORequest &request) @@ -356,13 +362,6 @@ void buf_page_write_complete(const IORequest &request) ut_ad(!buf_dblwr.is_inside(bpage->id())); ut_ad(request.node->space->id == bpage->id().space()); - if (state < buf_page_t::WRITE_FIX_REINIT && - request.node->space->use_doublewrite()) - { - ut_ad(request.node->space != fil_system.temp_space); - buf_dblwr.write_completed(); - } - if (request.slot) request.slot->release(); @@ -370,32 +369,31 @@ void buf_page_write_complete(const IORequest &request) buf_page_monitor(*bpage, false); DBUG_PRINT("ib_buf", ("write page %u:%u", bpage->id().space(), bpage->id().page_no())); - const bool temp= fsp_is_system_temporary(bpage->id().space()); - mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - buf_pool.stat.n_pages_written++; - bpage->write_complete(temp); if (request.is_LRU()) { + const bool temp= bpage->oldest_modification() == 2; + if (!temp) + buf_dblwr.write_completed(state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()); + /* We must hold buf_pool.mutex while releasing the block, so that + no other thread can access it before we have freed it. 
*/ + mysql_mutex_lock(&buf_pool.mutex); + bpage->write_complete(temp); buf_LRU_free_page(bpage, true); - buf_pool.try_LRU_scan= true; - pthread_cond_signal(&buf_pool.done_free); + mysql_mutex_unlock(&buf_pool.mutex); - ut_ad(buf_pool.n_flush_LRU_); - if (!--buf_pool.n_flush_LRU_) - pthread_cond_broadcast(&buf_pool.done_flush_LRU); + buf_pool.n_flush_dec(); } else { - ut_ad(!temp); - ut_ad(buf_pool.n_flush_list_); - if (!--buf_pool.n_flush_list_) - pthread_cond_broadcast(&buf_pool.done_flush_list); + buf_dblwr.write_completed(state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()); + bpage->write_complete(false); } - - mysql_mutex_unlock(&buf_pool.mutex); } /** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. @@ -739,43 +737,41 @@ not_compressed: } /** Free a page whose underlying file page has been freed. */ -inline void buf_pool_t::release_freed_page(buf_page_t *bpage) +ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) { mysql_mutex_assert_owner(&mutex); - mysql_mutex_lock(&flush_list_mutex); ut_d(const lsn_t oldest_modification= bpage->oldest_modification();) if (fsp_is_system_temporary(bpage->id().space())) { ut_ad(bpage->frame); ut_ad(oldest_modification == 2); + bpage->clear_oldest_modification(); } else { + mysql_mutex_lock(&flush_list_mutex); ut_ad(oldest_modification > 2); - delete_from_flush_list(bpage, false); + delete_from_flush_list(bpage); + mysql_mutex_unlock(&flush_list_mutex); } - bpage->clear_oldest_modification(); - mysql_mutex_unlock(&flush_list_mutex); - bpage->lock.u_unlock(true); + bpage->lock.u_unlock(true); buf_LRU_free_page(bpage, true); } -/** Write a flushable page to a file. buf_pool.mutex must be held. +/** Write a flushable page to a file or free a freeable block. @param evict whether to evict the page on write completion @param space tablespace -@return whether the page was flushed and buf_pool.mutex was released */ -inline bool buf_page_t::flush(bool evict, fil_space_t *space) +@return whether a page write was initiated and buf_pool.mutex released */ +bool buf_page_t::flush(bool evict, fil_space_t *space) { + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(in_file()); ut_ad(in_LRU_list); ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == (space == fil_system.temp_space)); - ut_ad(space->referenced()); ut_ad(evict || space != fil_system.temp_space); - - if (!lock.u_lock_try(true)) - return false; + ut_ad(space->referenced()); const auto s= state(); ut_a(s >= FREED); @@ -783,18 +779,29 @@ inline bool buf_page_t::flush(bool evict, fil_space_t *space) if (s < UNFIXED) { buf_pool.release_freed_page(this); - mysql_mutex_unlock(&buf_pool.mutex); - return true; + return false; } - if (s >= READ_FIX || oldest_modification() < 2) + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); + ut_ad(f >= UNFIXED); + ut_ad(f < READ_FIX); + ut_ad((space == fil_system.temp_space) + ? oldest_modification() == 2 + : oldest_modification() > 2); + + /* Increment the I/O operation count used for selecting LRU policy. 
*/ + buf_LRU_stat_inc_io(); + mysql_mutex_unlock(&buf_pool.mutex); + + IORequest::Type type= IORequest::WRITE_ASYNC; + if (UNIV_UNLIKELY(evict)) { - lock.u_unlock(true); - return false; + type= IORequest::WRITE_LRU; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); } - mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - /* Apart from the U-lock, this block will also be protected by is_write_fixed() and oldest_modification()>1. Thus, it cannot be relocated or removed. */ @@ -802,25 +809,6 @@ inline bool buf_page_t::flush(bool evict, fil_space_t *space) DBUG_PRINT("ib_buf", ("%s %u page %u:%u", evict ? "LRU" : "flush_list", id().space(), id().page_no())); - ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); - ut_ad(f >= UNFIXED); - ut_ad(f < READ_FIX); - ut_ad(space == fil_system.temp_space - ? oldest_modification() == 2 - : oldest_modification() > 2); - if (evict) - { - ut_ad(buf_pool.n_flush_LRU_ < ULINT_UNDEFINED); - buf_pool.n_flush_LRU_++; - } - else - { - ut_ad(buf_pool.n_flush_list_ < ULINT_UNDEFINED); - buf_pool.n_flush_list_++; - } - buf_flush_page_count++; - - mysql_mutex_unlock(&buf_pool.mutex); buf_block_t *block= reinterpret_cast(this); page_t *write_frame= zip.data; @@ -830,7 +818,6 @@ inline bool buf_page_t::flush(bool evict, fil_space_t *space) #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 size_t orig_size; #endif - IORequest::Type type= evict ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; buf_tmp_buffer_t *slot= nullptr; if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ @@ -874,7 +861,10 @@ inline bool buf_page_t::flush(bool evict, fil_space_t *space) { switch (space->chain.start->punch_hole) { case 1: - type= evict ? IORequest::PUNCH_LRU : IORequest::PUNCH; + static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH == + IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, ""); + type= + IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC)); break; case 2: size= orig_size; @@ -896,15 +886,14 @@ inline bool buf_page_t::flush(bool evict, fil_space_t *space) if (lsn > log_sys.get_flushed_lsn()) log_write_up_to(lsn, true); } + if (UNIV_LIKELY(space->purpose != FIL_TYPE_TEMPORARY)) + buf_dblwr.add_unbuffered(); space->io(IORequest{type, this, slot}, physical_offset(), size, write_frame, this); } else buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type}, size); - - /* Increment the I/O operation count used for selecting LRU policy. */ - buf_LRU_stat_inc_io(); return true; } @@ -931,7 +920,7 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, if (evict && !bpage->is_old()) return false; - return bpage->oldest_modification() > 1 && bpage->ready_for_flush(); + return bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); } /** Check which neighbors of a page can be flushed from the buf_pool. @@ -1058,6 +1047,7 @@ uint32_t fil_space_t::flush_freed(bool writable) and also write zeroes or punch the hole for the freed ranges of pages. @param space tablespace @param page_id page identifier +@param bpage buffer page @param contiguous whether to consider contiguous areas of pages @param evict true=buf_pool.LRU; false=buf_pool.flush_list @param n_flushed number of pages flushed so far in this batch @@ -1065,10 +1055,12 @@ and also write zeroes or punch the hole for the freed ranges of pages. 
@return number of pages flushed */ static ulint buf_flush_try_neighbors(fil_space_t *space, const page_id_t page_id, + buf_page_t *bpage, bool contiguous, bool evict, ulint n_flushed, ulint n_to_flush) { ut_ad(space->id == page_id.space()); + ut_ad(bpage->id() == page_id); ulint count= 0; page_id_t id= page_id; @@ -1077,9 +1069,15 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, ut_ad(page_id >= id); ut_ad(page_id < high); - for (ulint id_fold= id.fold(); id < high && !space->is_stopping(); - ++id, ++id_fold) + for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold) { + if (UNIV_UNLIKELY(space->is_stopping())) + { + if (bpage) + bpage->lock.u_unlock(true); + break; + } + if (count + n_flushed >= n_to_flush) { if (id > page_id) @@ -1093,26 +1091,39 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold); mysql_mutex_lock(&buf_pool.mutex); - if (buf_page_t *bpage= buf_pool.page_hash.get(id, chain)) + if (buf_page_t *b= buf_pool.page_hash.get(id, chain)) { - ut_ad(bpage->in_file()); - /* We avoid flushing 'non-old' blocks in an eviction flush, - because the flushed blocks are soon freed */ - if (!evict || id == page_id || bpage->is_old()) + ut_ad(b->in_file()); + if (id == page_id) { - if (!buf_pool.watch_is_sentinel(*bpage) && - bpage->oldest_modification() > 1 && bpage->ready_for_flush() && - bpage->flush(evict, space)) + ut_ad(bpage == b); + bpage= nullptr; + ut_ad(!buf_pool.watch_is_sentinel(*b)); + ut_ad(b->oldest_modification() > 1); + flush: + if (b->flush(evict, space)) { ++count; continue; } } + /* We avoid flushing 'non-old' blocks in an eviction flush, + because the flushed blocks are soon freed */ + else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) && + b->oldest_modification() > 1 && b->lock.u_lock_try(true)) + { + if (b->oldest_modification() < 2) + b->lock.u_unlock(true); + else + goto flush; + } } mysql_mutex_unlock(&buf_pool.mutex); } + ut_ad(!bpage); + if (auto n= count - 1) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, @@ -1185,27 +1196,20 @@ struct flush_counters_t ulint evicted; }; -/** Try to discard a dirty page. +/** Discard a dirty page, and release buf_pool.flush_list_mutex. @param bpage dirty page whose tablespace is not accessible */ static void buf_flush_discard_page(buf_page_t *bpage) { - mysql_mutex_assert_owner(&buf_pool.mutex); - mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(bpage->in_file()); ut_ad(bpage->oldest_modification()); - if (!bpage->lock.u_lock_try(false)) - return; - - mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_pool.delete_from_flush_list(bpage); mysql_mutex_unlock(&buf_pool.flush_list_mutex); ut_d(const auto state= bpage->state()); ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED || state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT); - bpage->lock.u_unlock(); - + bpage->lock.u_unlock(true); buf_LRU_free_page(bpage, true); } @@ -1227,7 +1231,6 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 
0 : srv_flush_neighbors; fil_space_t *space= nullptr; - bool do_evict= evict; uint32_t last_space_id= FIL_NULL; static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); @@ -1236,27 +1239,47 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, bpage && ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && UT_LIST_GET_LEN(buf_pool.free) < free_limit) || - recv_recovery_is_on()); ++scanned) + recv_recovery_is_on()); + ++scanned, bpage= buf_pool.lru_hp.get()) { buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); - const lsn_t oldest_modification= bpage->oldest_modification(); buf_pool.lru_hp.set(prev); - const auto state= bpage->state(); + auto state= bpage->state(); ut_ad(state >= buf_page_t::FREED); ut_ad(bpage->in_LRU_list); - if (oldest_modification <= 1) - { + switch (bpage->oldest_modification()) { + case 0: + evict: if (state != buf_page_t::FREED && (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state))) - goto must_skip; - if (buf_LRU_free_page(bpage, true)) - ++n->evicted; + continue; + buf_LRU_free_page(bpage, true); + ++n->evicted; + /* fall through */ + case 1: + continue; } - else if (state < buf_page_t::READ_FIX) + + if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) { + ut_ad(!bpage->is_io_fixed()); + bool do_evict= evict; + switch (bpage->oldest_modification()) { + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.delete_from_flush_list(bpage); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + /* fall through */ + case 0: + bpage->lock.u_unlock(true); + goto evict; + case 2: + /* LRU flushing will always evict pages of the temporary tablespace. */ + do_evict= true; + } /* Block is ready for flush. Dispatch an IO request. - If evict=true, the page will be evicted by buf_page_write_complete(). */ + If do_evict, the page may be evicted by buf_page_write_complete(). */ const page_id_t page_id(bpage->id()); const uint32_t space_id= page_id.space(); if (!space || space->id != space_id) @@ -1269,14 +1292,10 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, space->release(); auto p= buf_flush_space(space_id); space= p.first; - /* For the temporary tablespace, LRU flushing will always - evict pages upon completing the write. */ - do_evict= evict || space == fil_system.temp_space; last_space_id= space_id; mysql_mutex_lock(&buf_pool.mutex); if (p.second) buf_pool.stat.n_pages_written+= p.second; - goto retry; } else ut_ad(!space); @@ -1288,17 +1307,24 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict, } if (!space) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_discard_page(bpage); + } else if (neighbors && space->is_rotational()) { mysql_mutex_unlock(&buf_pool.mutex); - n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1, + n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, + neighbors == 1, do_evict, n->flushed, max); reacquire_mutex: mysql_mutex_lock(&buf_pool.mutex); } else if (n->flushed >= max && !recv_recovery_is_on()) + { + bpage->lock.u_unlock(true); break; + } else if (bpage->flush(do_evict, space)) { ++n->flushed; @@ -1306,11 +1332,8 @@ reacquire_mutex: } } else - must_skip: /* Can't evict or dispatch this block. Go to previous. 
*/ ut_ad(buf_pool.lru_hp.is_hp(prev)); - retry: - bpage= buf_pool.lru_hp.get(); } buf_pool.lru_hp.set(nullptr); @@ -1341,6 +1364,7 @@ static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n) mysql_mutex_assert_owner(&buf_pool.mutex); buf_lru_freed_page_count+= n->evicted; buf_lru_flush_page_count+= n->flushed; + buf_pool.stat.n_pages_written+= n->flushed; } /** This utility flushes dirty blocks from the end of the flush_list. @@ -1354,6 +1378,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) ulint scanned= 0; mysql_mutex_assert_owner(&buf_pool.mutex); + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 0 : srv_flush_neighbors; @@ -1364,7 +1389,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) /* Start from the end of the list looking for a suitable block to be flushed. */ - mysql_mutex_lock(&buf_pool.flush_list_mutex); ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); @@ -1375,32 +1399,42 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) break; ut_ad(bpage->in_file()); - buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - - if (oldest_modification == 1) { - buf_pool.delete_from_flush_list(bpage); - skip: - bpage= prev; - continue; - } + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - ut_ad(oldest_modification > 2); + if (oldest_modification == 1) + { + clear: + buf_pool.delete_from_flush_list(bpage); + skip: + bpage= prev; + continue; + } - if (!bpage->ready_for_flush()) - goto skip; + ut_ad(oldest_modification > 2); - /* In order not to degenerate this scan to O(n*n) we attempt to - preserve the pointer position. Any thread that would remove 'prev' - from buf_pool.flush_list must adjust the hazard pointer. + if (!bpage->lock.u_lock_try(true)) + goto skip; - Note: A concurrent execution of buf_flush_list_space() may - terminate this scan prematurely. The buf_pool.n_flush_list_ - should prevent multiple threads from executing - buf_do_flush_list_batch() concurrently, - but buf_flush_list_space() is ignoring that. */ - buf_pool.flush_hp.set(prev); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!bpage->is_io_fixed()); + + if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: A concurrent execution of buf_flush_list_space() may + terminate this scan prematurely. The buf_pool.flush_list_active + should prevent multiple threads from executing + buf_do_flush_list_batch() concurrently, + but buf_flush_list_space() is ignoring that. 
*/ + buf_pool.flush_hp.set(prev); + } const page_id_t page_id(bpage->id()); const uint32_t space_id= page_id.space(); @@ -1408,8 +1442,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) { if (last_space_id != space_id) { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - buf_pool.flush_hp.set(bpage); mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.mutex); if (space) @@ -1418,18 +1450,8 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) space= p.first; last_space_id= space_id; mysql_mutex_lock(&buf_pool.mutex); - if (p.second) - buf_pool.stat.n_pages_written+= p.second; + buf_pool.stat.n_pages_written+= p.second; mysql_mutex_lock(&buf_pool.flush_list_mutex); - bpage= buf_pool.flush_hp.get(); - if (!bpage) - break; - if (bpage->id() != page_id) - continue; - buf_pool.flush_hp.set(UT_LIST_GET_PREV(list, bpage)); - if (bpage->oldest_modification() <= 1 || !bpage->ready_for_flush()) - goto next; - mysql_mutex_unlock(&buf_pool.flush_list_mutex); } else ut_ad(!space); @@ -1442,27 +1464,29 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) if (!space) buf_flush_discard_page(bpage); - else if (neighbors && space->is_rotational()) - { - mysql_mutex_unlock(&buf_pool.mutex); - count+= buf_flush_try_neighbors(space, page_id, neighbors == 1, - false, count, max_n); - reacquire_mutex: - mysql_mutex_lock(&buf_pool.mutex); - } - else if (bpage->flush(false, space)) + else { - ++count; - goto reacquire_mutex; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (neighbors && space->is_rotational()) + { + mysql_mutex_unlock(&buf_pool.mutex); + count+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1, + false, count, max_n); + reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + } + else if (bpage->flush(false, space)) + { + ++count; + goto reacquire_mutex; + } } mysql_mutex_lock(&buf_pool.flush_list_mutex); - next: bpage= buf_pool.flush_hp.get(); } buf_pool.flush_hp.set(nullptr); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (space) space->release(); @@ -1472,32 +1496,25 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, scanned); - if (count) - MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - count); - mysql_mutex_assert_owner(&buf_pool.mutex); return count; } -/** Wait until a flush batch ends. -@param lru true=buf_pool.LRU; false=buf_pool.flush_list */ -void buf_flush_wait_batch_end(bool lru) +/** Wait until a LRU flush batch ends. */ +void buf_flush_wait_LRU_batch_end() { - const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_; + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); - if (n_flush) + if (buf_pool.n_flush()) { - auto cond= lru ? 
&buf_pool.done_flush_LRU : &buf_pool.done_flush_list; tpool::tpool_wait_begin(); thd_wait_begin(nullptr, THD_WAIT_DISKIO); do - my_cond_wait(cond, &buf_pool.mutex.m_mutex); - while (n_flush); + my_cond_wait(&buf_pool.done_flush_LRU, + &buf_pool.flush_list_mutex.m_mutex); + while (buf_pool.n_flush()); tpool::tpool_wait_end(); thd_wait_end(nullptr); - pthread_cond_broadcast(cond); } } @@ -1514,21 +1531,31 @@ static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED, ut_ad(lsn); mysql_mutex_assert_owner(&buf_pool.mutex); - if (buf_pool.n_flush_list_) + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.flush_list_active()) + { +nothing_to_do: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); return 0; - - /* FIXME: we are performing a dirty read of buf_pool.flush_list.count - while not holding buf_pool.flush_list_mutex */ - if (!UT_LIST_GET_LEN(buf_pool.flush_list)) + } + if (!buf_pool.get_oldest_modification(0)) { pthread_cond_broadcast(&buf_pool.done_flush_list); - return 0; + goto nothing_to_do; } - - buf_pool.n_flush_list_++; + buf_pool.flush_list_set_active(); const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); - if (!--buf_pool.n_flush_list_) - pthread_cond_broadcast(&buf_pool.done_flush_list); + if (n_flushed) + buf_pool.stat.n_pages_written+= n_flushed; + buf_pool.flush_list_set_inactive(); + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (n_flushed) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n_flushed); DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); return n_flushed; @@ -1560,6 +1587,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) bool may_have_skipped= false; ulint max_n_flush= srv_io_capacity; + ulint n_flush= 0; bool acquired= space->acquire(); { @@ -1576,11 +1604,17 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) ut_ad(bpage->in_file()); buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); - if (bpage->id().space() != space_id); - else if (bpage->oldest_modification() == 1) + if (bpage->oldest_modification() == 1) + clear: buf_pool.delete_from_flush_list(bpage); - else if (!bpage->ready_for_flush()) + else if (bpage->id().space() != space_id); + else if (!bpage->lock.u_lock_try(true)) may_have_skipped= true; + else if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } else { /* In order not to degenerate this scan to O(n*n) we attempt to @@ -1592,13 +1626,10 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) concurrently. This may terminate our iteration prematurely, leading us to return may_have_skipped=true. 
*/ buf_pool.flush_hp.set(prev); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); if (!acquired) - { was_freed: buf_flush_discard_page(bpage); - } else { if (space->is_stopping()) @@ -1607,28 +1638,24 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) acquired= false; goto was_freed; } - if (!bpage->flush(false, space)) - { - may_have_skipped= true; - mysql_mutex_lock(&buf_pool.flush_list_mutex); - goto next_after_skip; - } - if (n_flushed) - ++*n_flushed; - if (!--max_n_flush) + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (bpage->flush(false, space)) { + ++n_flush; + if (!--max_n_flush) + { + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + may_have_skipped= true; + goto done; + } mysql_mutex_lock(&buf_pool.mutex); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - may_have_skipped= true; - break; } - mysql_mutex_lock(&buf_pool.mutex); } mysql_mutex_lock(&buf_pool.flush_list_mutex); if (!buf_pool.flush_hp.is_hp(prev)) may_have_skipped= true; - next_after_skip: bpage= buf_pool.flush_hp.get(); continue; } @@ -1641,14 +1668,19 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) buf_flush_list_space(). We should always return true from buf_flush_list_space() if that should be the case; in buf_do_flush_list_batch() we will simply perform less work. */ - +done: buf_pool.flush_hp.set(nullptr); mysql_mutex_unlock(&buf_pool.flush_list_mutex); buf_pool.try_LRU_scan= true; pthread_cond_broadcast(&buf_pool.done_free); + + buf_pool.stat.n_pages_written+= n_flush; mysql_mutex_unlock(&buf_pool.mutex); + if (n_flushed) + *n_flushed= n_flush; + if (acquired) space->release(); @@ -1672,29 +1704,20 @@ ulint buf_flush_LRU(ulint max_n, bool evict) { mysql_mutex_assert_owner(&buf_pool.mutex); - if (evict) - { - if (buf_pool.n_flush_LRU_) - return 0; - buf_pool.n_flush_LRU_= 1; - } - flush_counters_t n; buf_do_LRU_batch(max_n, evict, &n); + ulint pages= n.flushed; + if (n.evicted) { + if (evict) + pages+= n.evicted; buf_pool.try_LRU_scan= true; - pthread_cond_signal(&buf_pool.done_free); + pthread_cond_broadcast(&buf_pool.done_free); } - if (!evict) - return n.flushed; - - if (!--buf_pool.n_flush_LRU_) - pthread_cond_broadcast(&buf_pool.done_flush_LRU); - - return n.evicted + n.flushed; + return pages; } /** Initiate a log checkpoint, discarding the start of the log. 
@@ -1826,9 +1849,14 @@ static void buf_flush_wait(lsn_t lsn) buf_flush_sync_lsn= lsn; buf_pool.page_cleaner_set_idle(false); pthread_cond_signal(&buf_pool.do_flush_list); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + if (buf_pool.get_oldest_modification(lsn) >= lsn) + break; } - my_cond_wait(&buf_pool.done_flush_list, - &buf_pool.flush_list_mutex.m_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_dblwr.wait_for_page_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); } } @@ -1849,6 +1877,9 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) { MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + #if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)) { @@ -1856,28 +1887,23 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) { mysql_mutex_unlock(&buf_pool.flush_list_mutex); ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(false); - mysql_mutex_unlock(&buf_pool.mutex); if (n_pages) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, MONITOR_FLUSH_SYNC_COUNT, MONITOR_FLUSH_SYNC_PAGES, n_pages); } + buf_dblwr.wait_for_page_writes(); mysql_mutex_lock(&buf_pool.flush_list_mutex); } while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn); } else #endif - { - thd_wait_begin(nullptr, THD_WAIT_DISKIO); - tpool::tpool_wait_begin(); buf_flush_wait(sync_lsn); - tpool::tpool_wait_end(); - thd_wait_end(nullptr); - } + + tpool::tpool_wait_end(); + thd_wait_end(nullptr); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -1930,11 +1956,10 @@ and try to initiate checkpoints until the target is met. 
ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) { ut_ad(!srv_read_only_mode); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); for (;;) { - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn)) { MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, @@ -1985,6 +2010,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn) /* wake up buf_flush_wait() */ pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); lsn= std::max(lsn, target); @@ -2179,8 +2205,6 @@ static void buf_flush_page_cleaner() timespec abstime; set_timespec(abstime, 1); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - lsn_t lsn_limit; ulint last_activity_count= srv_get_activity_count(); @@ -2188,45 +2212,34 @@ static void buf_flush_page_cleaner() { lsn_limit= buf_flush_sync_lsn; - if (UNIV_UNLIKELY(lsn_limit != 0)) + if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync)) { furious_flush: - if (UNIV_LIKELY(srv_flush_sync)) - { - buf_flush_sync_for_checkpoint(lsn_limit); - last_pages= 0; - set_timespec(abstime, 1); - continue; - } + buf_flush_sync_for_checkpoint(lsn_limit); + last_pages= 0; + set_timespec(abstime, 1); + continue; } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.ran_out()) + goto no_wait; else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) break; - /* If buf pager cleaner is idle and there is no work - (either dirty pages are all flushed or adaptive flushing - is not enabled) then opt for non-timed wait */ if (buf_pool.page_cleaner_idle() && (!UT_LIST_GET_LEN(buf_pool.flush_list) || srv_max_dirty_pages_pct_lwm == 0.0)) + /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex); else my_cond_timedwait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex, &abstime); - + no_wait: set_timespec(abstime, 1); - lsn_t soft_lsn_limit= buf_flush_async_lsn; lsn_limit= buf_flush_sync_lsn; - - if (UNIV_UNLIKELY(lsn_limit != 0)) - { - if (UNIV_LIKELY(srv_flush_sync)) - goto furious_flush; - } - else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) - break; - const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); if (!oldest_lsn) @@ -2241,6 +2254,8 @@ static void buf_flush_page_cleaner() buf_flush_async_lsn= 0; set_idle: buf_pool.page_cleaner_set_idle(true); + if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; mysql_mutex_unlock(&buf_pool.flush_list_mutex); end_of_batch: buf_dblwr.flush_buffered_writes(); @@ -2257,10 +2272,57 @@ static void buf_flush_page_cleaner() } while (false); + if (!buf_pool.ran_out()) + continue; mysql_mutex_lock(&buf_pool.flush_list_mutex); - continue; } + lsn_t soft_lsn_limit= buf_flush_async_lsn; + + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + if (srv_flush_sync) + goto do_furious_flush; + if (oldest_lsn >= lsn_limit) + { + buf_flush_sync_lsn= 0; + pthread_cond_broadcast(&buf_pool.done_flush_list); + } + else if (lsn_limit > soft_lsn_limit) + soft_lsn_limit= lsn_limit; + } + + bool idle_flush= false; + ulint n_flushed= 0, n; + + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) + { + if (oldest_lsn >= soft_lsn_limit) + buf_flush_async_lsn= soft_lsn_limit= 0; + } + else if (buf_pool.ran_out()) + { + buf_pool.page_cleaner_set_idle(false); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + n= srv_max_io_capacity; + mysql_mutex_lock(&buf_pool.mutex); + LRU_flush: + n= buf_flush_LRU(n, false); + 
mysql_mutex_unlock(&buf_pool.mutex); +    last_pages+= n; + +    if (!idle_flush) +      goto end_of_batch; + +    /* When idle flushing kicks in, page_cleaner is marked active. +    Reset it back to idle, since it was made active as part of the +    idle flushing stage. */ +    mysql_mutex_lock(&buf_pool.flush_list_mutex); +    goto set_idle; +  } +  else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) +    break; +  const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); ut_ad(dirty_blocks); /* We perform dirty reads of the LRU+free list lengths here. @@ -2268,60 +2330,53 @@ static void buf_flush_page_cleaner() guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ const double dirty_pct= double(dirty_blocks) * 100.0 / double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); - -    bool idle_flush= false; - -    if (lsn_limit || soft_lsn_limit); -    else if (af_needed_for_redo(oldest_lsn)); -    else if (srv_max_dirty_pages_pct_lwm != 0.0) +    if (srv_max_dirty_pages_pct_lwm != 0.0) { const ulint activity_count= srv_get_activity_count(); if (activity_count != last_activity_count) +      { last_activity_count= activity_count; +        goto maybe_unemployed; +      } else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0) { -        /* reaching here means 3 things: -        - last_activity_count == activity_count: suggesting server is idle -        (no trx_t::commit activity) -        - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) -        - there are no pending reads but there are dirty pages to flush */ -        idle_flush= true; +        /* reaching here means 3 things: +        - last_activity_count == activity_count: suggesting server is idle +        (no trx_t::commit() activity) +        - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) +        - there are no pending reads but there are dirty pages to flush */ buf_pool.update_last_activity_count(activity_count); +        mysql_mutex_unlock(&buf_pool.flush_list_mutex); +        idle_flush= true; +        goto idle_flush; } - -      if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm) -        goto unemployed; +      else +      maybe_unemployed: +      if (dirty_pct < srv_max_dirty_pages_pct_lwm) +        goto possibly_unemployed; } else if (dirty_pct < srv_max_buf_pool_modified_pct) -      goto unemployed; - -    if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit) -      lsn_limit= buf_flush_sync_lsn= 0; -    if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit) -      soft_lsn_limit= buf_flush_async_lsn= 0; +    possibly_unemployed: +    if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn)) +      goto unemployed; buf_pool.page_cleaner_set_idle(false); mysql_mutex_unlock(&buf_pool.flush_list_mutex); -    if (!lsn_limit) -      lsn_limit= soft_lsn_limit; - -    ulint n_flushed= 0, n; - -    if (UNIV_UNLIKELY(lsn_limit != 0)) +    if (UNIV_UNLIKELY(soft_lsn_limit != 0)) { n= srv_max_io_capacity; goto background_flush; } -    else if (idle_flush || !srv_adaptive_flushing) + +    if (!srv_adaptive_flushing) { +    idle_flush: n= srv_io_capacity; -      lsn_limit= LSN_MAX; +      soft_lsn_limit= LSN_MAX; background_flush: mysql_mutex_lock(&buf_pool.mutex); -      n_flushed= buf_flush_list_holding_mutex(n, lsn_limit); -      /* wake up buf_flush_wait() */ -      pthread_cond_broadcast(&buf_pool.done_flush_list); +      n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit); MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, MONITOR_FLUSH_BACKGROUND_COUNT, MONITOR_FLUSH_BACKGROUND_PAGES, @@ -2347,18 +2402,8 @@ static void buf_flush_page_cleaner() goto unemployed; } -    n= buf_flush_LRU(n >= n_flushed ?
n - n_flushed : 0, false); - mysql_mutex_unlock(&buf_pool.mutex); - last_pages+= n; - - if (!idle_flush) - goto end_of_batch; - - /* when idle flushing kicks in page_cleaner is marked active. - reset it back to idle since the it was made active as part of - idle flushing stage. */ - mysql_mutex_lock(&buf_pool.flush_list_mutex); - goto set_idle; + n= n >= n_flushed ? n - n_flushed : 0; + goto LRU_flush; } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2366,16 +2411,20 @@ static void buf_flush_page_cleaner() if (srv_fast_shutdown != 2) { buf_dblwr.flush_buffered_writes(); - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_wait_batch_end(true); - buf_flush_wait_batch_end(false); - mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_dblwr.wait_for_page_writes(); } mysql_mutex_lock(&buf_pool.flush_list_mutex); lsn_limit= buf_flush_sync_lsn; if (UNIV_UNLIKELY(lsn_limit != 0)) + { + do_furious_flush: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); goto furious_flush; + } buf_page_cleaner_is_active= false; pthread_cond_broadcast(&buf_pool.done_flush_list); mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2400,17 +2449,6 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init() std::thread(buf_flush_page_cleaner).detach(); } -#if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY) -/** @return the number of dirty pages in the buffer pool */ -static ulint buf_flush_list_length() -{ - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - return len; -} -#endif - /** Flush the buffer pool on shutdown. */ ATTRIBUTE_COLD void buf_flush_buffer_pool() { @@ -2425,24 +2463,20 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool() while (buf_pool.get_oldest_modification(0)) { mysql_mutex_unlock(&buf_pool.flush_list_mutex); - mysql_mutex_lock(&buf_pool.mutex); - buf_flush_list_holding_mutex(srv_max_io_capacity); - if (buf_pool.n_flush_list_) + buf_flush_list(srv_max_io_capacity); + if (const size_t pending= buf_dblwr.pending_writes()) { - mysql_mutex_unlock(&buf_pool.mutex); timespec abstime; service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, - "Waiting to flush " ULINTPF " pages", - buf_flush_list_length()); + "Waiting to write %zu pages", pending); set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); - buf_dblwr.flush_buffered_writes(); - mysql_mutex_lock(&buf_pool.mutex); - while (buf_pool.n_flush_list_) - my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, - &abstime); + buf_dblwr.wait_for_page_writes(abstime); } - mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + UT_LIST_GET_LEN(buf_pool.flush_list)); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2483,6 +2517,7 @@ void buf_flush_sync() if (lsn == log_sys.get_lsn()) break; } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); tpool::tpool_wait_end(); thd_wait_end(nullptr); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 9fa6492d525..1947dfaeeb4 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -136,7 +136,6 @@ static void buf_LRU_block_free_hashed_page(buf_block_t *block) @param[in] bpage control block */ static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage) { - /* FIXME: use 
atomics, not mutex */ mysql_mutex_assert_owner(&buf_pool.mutex); buf_pool.stat.LRU_bytes += bpage->physical_size(); @@ -400,6 +399,7 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex) DBUG_EXECUTE_IF("recv_ran_out_of_buffer", if (recv_recovery_is_on() && recv_sys.apply_log_recs) { + mysql_mutex_lock(&buf_pool.mutex); goto flush_lru; }); get_mutex: @@ -445,20 +445,32 @@ got_block: if ((block = buf_LRU_get_free_only()) != nullptr) { goto got_block; } - if (!buf_pool.n_flush_LRU_) { - break; + mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const auto n_flush = buf_pool.n_flush(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.mutex); + if (!n_flush) { + goto not_found; + } + if (!buf_pool.try_LRU_scan) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(true); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + my_cond_wait(&buf_pool.done_free, + &buf_pool.mutex.m_mutex); } - my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex); } -#ifndef DBUG_OFF not_found: -#endif - mysql_mutex_unlock(&buf_pool.mutex); + if (n_iterations > 1) { + MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); + } - if (n_iterations > 20 && !buf_lru_free_blocks_error_printed + if (n_iterations == 21 && !buf_lru_free_blocks_error_printed && srv_buf_pool_old_size == srv_buf_pool_size) { - + buf_lru_free_blocks_error_printed = true; + mysql_mutex_unlock(&buf_pool.mutex); ib::warn() << "Difficult to find free blocks in the buffer pool" " (" << n_iterations << " search iterations)! " << flush_failures << " failed attempts to" @@ -472,12 +484,7 @@ not_found: << os_n_file_writes << " OS file writes, " << os_n_fsyncs << " OS fsyncs."; - - buf_lru_free_blocks_error_printed = true; - } - - if (n_iterations > 1) { - MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); + mysql_mutex_lock(&buf_pool.mutex); } /* No free block was found: try to flush the LRU list. @@ -491,8 +498,6 @@ not_found: #ifndef DBUG_OFF flush_lru: #endif - mysql_mutex_lock(&buf_pool.mutex); - if (!buf_flush_LRU(innodb_lru_flush_size, true)) { MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); ++flush_failures; @@ -1039,7 +1044,8 @@ buf_LRU_block_free_non_file_page( } else { UT_LIST_ADD_FIRST(buf_pool.free, &block->page); ut_d(block->page.in_free_list = true); - pthread_cond_signal(&buf_pool.done_free); + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); } MEM_NOACCESS(block->page.frame, srv_page_size); diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index b20b105a4c4..b39a8f49133 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -226,6 +226,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, buf_LRU_add_block(bpage, true/* to old blocks */); } + buf_pool.stat.n_pages_read++; mysql_mutex_unlock(&buf_pool.mutex); buf_pool.n_pend_reads++; goto func_exit_no_mutex; @@ -245,20 +246,18 @@ buffer buf_pool if it is not already there, in which case does nothing. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by an i/o-handler thread. 
-@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED - if we are trying - to read from a non-existent tablespace @param[in,out] space tablespace @param[in] sync true if synchronous aio is desired @param[in] mode BUF_READ_IBUF_PAGES_ONLY, ..., @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] unzip true=request uncompressed page -@return whether a read request was queued */ +@return error code +@retval DB_SUCCESS if the page was read +@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */ static -bool +dberr_t buf_read_page_low( - dberr_t* err, fil_space_t* space, bool sync, ulint mode, @@ -268,15 +267,12 @@ buf_read_page_low( { buf_page_t* bpage; - *err = DB_SUCCESS; - if (buf_dblwr.is_inside(page_id)) { ib::error() << "Trying to read doublewrite buffer page " << page_id; ut_ad(0); -nothing_read: space->release(); - return false; + return DB_PAGE_CORRUPTED; } if (sync) { @@ -299,8 +295,9 @@ nothing_read: completed */ bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip); - if (bpage == NULL) { - goto nothing_read; + if (!bpage) { + space->release(); + return DB_SUCCESS_LOCKED_REC; } ut_ad(bpage->in_file()); @@ -320,7 +317,6 @@ nothing_read: ? IORequest::READ_SYNC : IORequest::READ_ASYNC), page_id.page_no() * len, len, dst, bpage); - *err = fio.err; if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { ut_d(auto n=) buf_pool.n_pend_reads--; @@ -329,14 +325,14 @@ nothing_read: } else if (sync) { thd_wait_end(NULL); /* The i/o was already completed in space->io() */ - *err = bpage->read_complete(*fio.node); + fio.err = bpage->read_complete(*fio.node); space->release(); - if (*err == DB_FAIL) { - *err = DB_PAGE_CORRUPTED; + if (fio.err == DB_FAIL) { + fio.err = DB_PAGE_CORRUPTED; } } - return true; + return fio.err; } /** Applies a random read-ahead in buf_pool if there are at least a threshold @@ -414,24 +410,26 @@ read_ahead: continue; if (space->is_stopping()) break; - dberr_t err; space->reacquire(); - if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false)) + if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) == + DB_SUCCESS) count++; } if (count) + { DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", count, space->chain.start->name, low.page_no())); - space->release(); - - /* Read ahead is considered one I/O operation for the purpose of - LRU policy decision. */ - buf_LRU_stat_inc_io(); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read_rnd+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } - buf_pool.stat.n_ra_pages_read_rnd+= count; - srv_stats.buf_pool_reads.add(count); + space->release(); return count; } @@ -441,8 +439,9 @@ on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@retval DB_SUCCESS if the page was read and is not corrupted, -@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted, +@retval DB_SUCCESS if the page was read and is not corrupted +@retval DB_SUCCESS_LOCKED_REC if the page was not read +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. 
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ @@ -456,13 +455,9 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) return DB_TABLESPACE_DELETED; } - dberr_t err; - if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE, - page_id, zip_size, false)) - srv_stats.buf_pool_reads.add(1); - - buf_LRU_stat_inc_io(); - return err; + buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */ + return buf_read_page_low(space, true, BUF_READ_ANY_PAGE, + page_id, zip_size, false); } /** High-level function which reads a page asynchronously from a file to the @@ -475,12 +470,8 @@ released by the i/o-handler thread. void buf_read_page_background(fil_space_t *space, const page_id_t page_id, ulint zip_size) { - dberr_t err; - - if (buf_read_page_low(&err, space, false, BUF_READ_ANY_PAGE, - page_id, zip_size, false)) { - srv_stats.buf_pool_reads.add(1); - } + buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, false); /* We do not increment number of I/O operations used for LRU policy here (buf_LRU_stat_inc_io()). We use this in heuristics to decide @@ -638,23 +629,26 @@ failed: continue; if (space->is_stopping()) break; - dberr_t err; space->reacquire(); - count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size, - false); + if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) == + DB_SUCCESS) + count++; } if (count) + { DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", count, space->chain.start->name, new_low.page_no())); - space->release(); - - /* Read ahead is considered one I/O operation for the purpose of - LRU policy decision. */ - buf_LRU_stat_inc_io(); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. 
*/ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } - buf_pool.stat.n_ra_pages_read+= count; + space->release(); return count; } @@ -709,13 +703,12 @@ void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n) } } - dberr_t err; space->reacquire(); - buf_read_page_low(&err, space, false, - BUF_READ_ANY_PAGE, cur_page_id, zip_size, - true); - - if (err != DB_SUCCESS) { + switch (buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + cur_page_id, zip_size, true)) { + case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC: + break; + default: sql_print_error("InnoDB: Recovery failed to read page " UINT32PF " from %s", cur_page_id.page_no(), diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index 59d77c9c5fc..83afd732b21 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -1209,8 +1209,6 @@ after_insert: ut_ad(!rec || rec_offs_validate(rec, cursor->index(), *offsets)); #endif - MONITOR_INC(MONITOR_INDEX_SPLIT); - return(rec); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index aa2fb7c38eb..cac20c70e02 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -915,43 +915,37 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR}, {"buffer_pool_load_incomplete", &export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL}, - {"buffer_pool_pages_data", - &export_vars.innodb_buffer_pool_pages_data, SHOW_SIZE_T}, + {"buffer_pool_pages_data", &UT_LIST_GET_LEN(buf_pool.LRU), SHOW_SIZE_T}, {"buffer_pool_bytes_data", &export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T}, {"buffer_pool_pages_dirty", - &export_vars.innodb_buffer_pool_pages_dirty, SHOW_SIZE_T}, - {"buffer_pool_bytes_dirty", - &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_SIZE_T}, - {"buffer_pool_pages_flushed", &buf_flush_page_count, SHOW_SIZE_T}, - {"buffer_pool_pages_free", - &export_vars.innodb_buffer_pool_pages_free, SHOW_SIZE_T}, + &UT_LIST_GET_LEN(buf_pool.flush_list), SHOW_SIZE_T}, + {"buffer_pool_bytes_dirty", &buf_pool.flush_list_bytes, SHOW_SIZE_T}, + {"buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, + {"buffer_pool_pages_free", &UT_LIST_GET_LEN(buf_pool.free), SHOW_SIZE_T}, #ifdef UNIV_DEBUG {"buffer_pool_pages_latched", &export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T}, #endif /* UNIV_DEBUG */ {"buffer_pool_pages_made_not_young", - &export_vars.innodb_buffer_pool_pages_made_not_young, SHOW_SIZE_T}, + &buf_pool.stat.n_pages_not_made_young, SHOW_SIZE_T}, {"buffer_pool_pages_made_young", - &export_vars.innodb_buffer_pool_pages_made_young, SHOW_SIZE_T}, + &buf_pool.stat.n_pages_made_young, SHOW_SIZE_T}, {"buffer_pool_pages_misc", &export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T}, - {"buffer_pool_pages_old", - &export_vars.innodb_buffer_pool_pages_old, SHOW_SIZE_T}, + {"buffer_pool_pages_old", &buf_pool.LRU_old_len, SHOW_SIZE_T}, {"buffer_pool_pages_total", &export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T}, {"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T}, {"buffer_pool_pages_LRU_freed", &buf_lru_freed_page_count, SHOW_SIZE_T}, + {"buffer_pool_pages_split", &buf_pool.pages_split, SHOW_SIZE_T}, {"buffer_pool_read_ahead_rnd", - &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_SIZE_T}, - {"buffer_pool_read_ahead", - &export_vars.innodb_buffer_pool_read_ahead, SHOW_SIZE_T}, + 
&buf_pool.stat.n_ra_pages_read_rnd, SHOW_SIZE_T}, + {"buffer_pool_read_ahead", &buf_pool.stat.n_ra_pages_read, SHOW_SIZE_T}, {"buffer_pool_read_ahead_evicted", - &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_SIZE_T}, - {"buffer_pool_read_requests", - &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T}, - {"buffer_pool_reads", - &export_vars.innodb_buffer_pool_reads, SHOW_SIZE_T}, + &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, + {"buffer_pool_read_requests", &buf_pool.stat.n_page_gets, SHOW_SIZE_T}, + {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, {"buffer_pool_write_requests", &export_vars.innodb_buffer_pool_write_requests, SHOW_SIZE_T}, diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index e79cbdadcd6..94f8dc2badb 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -782,11 +782,11 @@ public: it from buf_pool.flush_list */ inline void write_complete(bool temporary); - /** Write a flushable page to a file. buf_pool.mutex must be held. + /** Write a flushable page to a file or free a freeable block. @param evict whether to evict the page on write completion @param space tablespace - @return whether the page was flushed and buf_pool.mutex was released */ - inline bool flush(bool evict, fil_space_t *space); + @return whether a page write was initiated and buf_pool.mutex released */ + bool flush(bool evict, fil_space_t *space); /** Notify that a page in a temporary tablespace has been modified. */ void set_temp_modified() @@ -856,8 +856,6 @@ public: /** @return whether the block is mapped to a data file */ bool in_file() const { return state() >= FREED; } - /** @return whether the block is modified and ready for flushing */ - inline bool ready_for_flush() const; /** @return whether the block can be relocated in memory. The block can be dirty, but it must not be I/O-fixed or bufferfixed. */ inline bool can_relocate() const; @@ -1030,10 +1028,10 @@ Compute the hash fold value for blocks in buf_pool.zip_hash. */ #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ -/** A "Hazard Pointer" class used to iterate over page lists -inside the buffer pool. A hazard pointer is a buf_page_t pointer +/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or +buf_pool.flush_list. A hazard pointer is a buf_page_t pointer which we intend to iterate over next and we want it remain valid -even after we release the buffer pool mutex. */ +even after we release the mutex that protects the list. */ class HazardPointer { public: @@ -1148,7 +1146,8 @@ struct buf_buddy_free_t { /*!< Node of zip_free list */ }; -/** @brief The buffer pool statistics structure. */ +/** @brief The buffer pool statistics structure; +protected by buf_pool.mutex unless otherwise noted. 
 */
 struct buf_pool_stat_t{
 	/** Initialize the counters */
 	void init() { memset((void*) this, 0, sizeof *this); }
 
@@ -1157,9 +1156,8 @@ struct buf_pool_stat_t{
 				/*!< number of page gets performed;
 				also successful searches through
 				the adaptive hash index are
-				counted as page gets; this field
-				is NOT protected by the buffer
-				pool mutex */
+				counted as page gets;
+				NOT protected by buf_pool.mutex */
 	ulint	n_pages_read;	/*!< number read operations */
 	ulint	n_pages_written;/*!< number write operations */
 	ulint	n_pages_created;/*!< number of pages created
@@ -1177,10 +1175,9 @@ struct buf_pool_stat_t{
 				young because the first access
 				was not long enough ago, in
 				buf_page_peek_if_too_old() */
-	/** number of waits for eviction; writes protected by buf_pool.mutex */
+	/** number of waits for eviction */
 	ulint	LRU_waits;
 	ulint	LRU_bytes;	/*!< LRU size in bytes */
-	ulint	flush_list_bytes;/*!< flush_list size in bytes */
 };
 
 /** Statistics of buddy blocks of a given size. */
@@ -1501,6 +1498,11 @@ public:
       n_chunks_new / 4 * chunks->size;
   }
 
+  /** @return whether the buffer pool has run out */
+  TPOOL_SUPPRESS_TSAN
+  bool ran_out() const
+  { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
   /** @return whether the buffer pool is shrinking */
   inline bool is_shrinking() const
   {
@@ -1538,14 +1540,10 @@ public:
   /** Buffer pool mutex */
   alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
 
-  /** Number of pending LRU flush; protected by mutex. */
-  ulint n_flush_LRU_;
-  /** broadcast when n_flush_LRU reaches 0; protected by mutex */
-  pthread_cond_t done_flush_LRU;
-  /** Number of pending flush_list flush; protected by mutex */
-  ulint n_flush_list_;
-  /** broadcast when n_flush_list reaches 0; protected by mutex */
-  pthread_cond_t done_flush_list;
+  /** current statistics; protected by mutex */
+  buf_pool_stat_t stat;
+  /** old statistics; protected by mutex */
+  buf_pool_stat_t old_stat;
 
   /** @name General fields */
   /* @{ */
@@ -1706,11 +1704,12 @@ public:
 	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
 					/*!< Statistics of buddy system,
 					indexed by block size */
-	buf_pool_stat_t	stat;		/*!< current statistics */
-	buf_pool_stat_t	old_stat;	/*!< old statistics */
 
 	/* @} */
 
+  /** number of index page splits */
+  Atomic_counter<ulint> pages_split;
+
   /** @name Page flushing algorithm fields */
   /* @{ */
 
@@ -1719,31 +1718,76 @@ public:
   alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
   /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
   FlushHp flush_hp;
-  /** modified blocks (a subset of LRU) */
+  /** flush_list size in bytes; protected by flush_list_mutex */
+  ulint flush_list_bytes;
+  /** possibly modified persistent pages (a subset of LRU);
+  buf_dblwr.pending_writes() is approximately COUNT(is_write_fixed()) */
   UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
 private:
-  /** whether the page cleaner needs wakeup from indefinite sleep */
-  bool page_cleaner_is_idle;
+  static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+  static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+  static constexpr unsigned LRU_FLUSH= 4;
+
+  /** Number of pending LRU flush * LRU_FLUSH +
+  PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+  unsigned page_cleaner_status;
   /** track server activity count for signaling idle flushing */
   ulint last_activity_count;
 public:
   /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
   pthread_cond_t do_flush_list;
+  /** broadcast when !n_flush(); protected by flush_list_mutex */
+  pthread_cond_t done_flush_LRU;
+  /** broadcast when a batch completes; protected by flush_list_mutex */
+  pthread_cond_t done_flush_list;
+
+  /** @return number of pending LRU flush */
+  unsigned n_flush() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status / LRU_FLUSH;
+  }
+
+  /** Increment the number of pending LRU flush */
+  inline void n_flush_inc();
+
+  /** Decrement the number of pending LRU flush */
+  inline void n_flush_dec();
+
+  /** @return whether flush_list flushing is active */
+  bool flush_list_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status & FLUSH_LIST_ACTIVE;
+  }
+
+  void flush_list_set_active()
+  {
+    ut_ad(!flush_list_active());
+    page_cleaner_status+= FLUSH_LIST_ACTIVE;
+  }
+  void flush_list_set_inactive()
+  {
+    ut_ad(flush_list_active());
+    page_cleaner_status-= FLUSH_LIST_ACTIVE;
+  }
 
   /** @return whether the page cleaner must sleep due to being idle */
   bool page_cleaner_idle() const
   {
     mysql_mutex_assert_owner(&flush_list_mutex);
-    return page_cleaner_is_idle;
+    return page_cleaner_status & PAGE_CLEANER_IDLE;
   }
-  /** Wake up the page cleaner if needed */
-  void page_cleaner_wakeup();
+  /** Wake up the page cleaner if needed.
+  @param for_LRU  whether to wake up for LRU eviction */
+  void page_cleaner_wakeup(bool for_LRU= false);
 
   /** Register whether an explicit wakeup of the page cleaner is needed */
   void page_cleaner_set_idle(bool deep_sleep)
   {
     mysql_mutex_assert_owner(&flush_list_mutex);
-    page_cleaner_is_idle= deep_sleep;
+    page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+      (PAGE_CLEANER_IDLE * deep_sleep);
   }
 
   /** Update server last activity count */
@@ -1753,9 +1797,6 @@ public:
     last_activity_count= activity_count;
   }
 
-  // n_flush_LRU_ + n_flush_list_
-  // is approximately COUNT(is_write_fixed()) in flush_list
-
 	unsigned	freed_page_clock;/*!< a sequence number used
 					to count the number of buffer
 					blocks removed from the end of
@@ -1765,16 +1806,10 @@ public:
 					to read this for heuristic
 					purposes without holding any
 					mutex or latch */
-	bool		try_LRU_scan;	/*!< Cleared when an LRU
-					scan for free block fails. This
-					flag is used to avoid repeated
-					scans of LRU list when we know
-					that there is no free block
-					available in the scan depth for
-					eviction. Set whenever
-					we flush a batch from the
-					buffer pool. Protected by the
-					buf_pool.mutex */
+  /** Cleared when buf_LRU_get_free_block() fails.
+  Set whenever the free list grows, along with a broadcast of done_free.
+  Protected by buf_pool.mutex. */
+  Atomic_relaxed<bool> try_LRU_scan;
 	/* @} */
 
 	/** @name LRU replacement algorithm fields */
@@ -1783,8 +1818,8 @@ public:
 	UT_LIST_BASE_NODE_T(buf_page_t) free;
 					/*!< base node of the
 					free block list */
-  /** signaled each time when the free list grows and
-  broadcast each time try_LRU_scan is set; protected by mutex */
+  /** broadcast each time when the free list grows or try_LRU_scan is set;
+  protected by mutex */
   pthread_cond_t done_free;
 
 	UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
@@ -1844,29 +1879,20 @@ public:
   {
     if (n_pend_reads)
       return true;
-    mysql_mutex_lock(&mutex);
-    const bool any_pending{n_flush_LRU_ || n_flush_list_};
-    mysql_mutex_unlock(&mutex);
+    mysql_mutex_lock(&flush_list_mutex);
+    const bool any_pending= page_cleaner_status > PAGE_CLEANER_IDLE ||
+      buf_dblwr.pending_writes();
+    mysql_mutex_unlock(&flush_list_mutex);
     return any_pending;
   }
-  /** @return total amount of pending I/O */
-  TPOOL_SUPPRESS_TSAN ulint io_pending() const
-  {
-    return n_pend_reads + n_flush_LRU_ + n_flush_list_;
-  }
 
 private:
   /** Remove a block from the flush list. */
   inline void delete_from_flush_list_low(buf_page_t *bpage);
-  /** Remove a block from flush_list.
-  @param bpage   buffer pool page
-  @param clear   whether to invoke buf_page_t::clear_oldest_modification() */
-  void delete_from_flush_list(buf_page_t *bpage, bool clear);
 
 public:
   /** Remove a block from flush_list.
   @param bpage   buffer pool page */
-  void delete_from_flush_list(buf_page_t *bpage)
-  { delete_from_flush_list(bpage, true); }
+  void delete_from_flush_list(buf_page_t *bpage);
 
   /** Insert a modified block into the flush list.
   @param block   modified block
@@ -1874,7 +1900,7 @@ public:
   void insert_into_flush_list(buf_block_t *block, lsn_t lsn);
 
   /** Free a page whose underlying file page has been freed. */
-  inline void release_freed_page(buf_page_t *bpage);
+  ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage);
 
 private:
   /** Temporary memory for page_compressed and encrypted I/O */
@@ -1994,17 +2020,6 @@ inline void buf_page_t::clear_oldest_modification()
   oldest_modification_.store(0, std::memory_order_release);
 }
 
-/** @return whether the block is modified and ready for flushing */
-inline bool buf_page_t::ready_for_flush() const
-{
-  mysql_mutex_assert_owner(&buf_pool.mutex);
-  ut_ad(in_LRU_list);
-  const auto s= state();
-  ut_a(s >= FREED);
-  ut_ad(!fsp_is_system_temporary(id().space()) || oldest_modification() == 2);
-  return s < READ_FIX;
-}
-
 /** @return whether the block can be relocated in memory.
 The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
 inline bool buf_page_t::can_relocate() const
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index fb9df55504c..d9c9239c0b4 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -54,9 +54,9 @@ class buf_dblwr_t
   };
 
   /** the page number of the first doublewrite block (block_size() pages) */
-  page_id_t block1= page_id_t(0, 0);
+  page_id_t block1{0, 0};
   /** the page number of the second doublewrite block (block_size() pages) */
-  page_id_t block2= page_id_t(0, 0);
+  page_id_t block2{0, 0};
 
   /** mutex protecting the data members below */
   mysql_mutex_t mutex;
@@ -72,11 +72,15 @@ class buf_dblwr_t
   ulint writes_completed;
   /** number of pages written by flush_buffered_writes_completed() */
   ulint pages_written;
+  /** condition variable for !writes_pending */
+  pthread_cond_t write_cond;
+  /** number of pending page writes */
+  size_t writes_pending;
 
   slot slots[2];
-  slot *active_slot= &slots[0];
+  slot *active_slot;
 
-  /** Initialize the doublewrite buffer data structure.
+  /** Initialise the persistent storage of the doublewrite buffer.
  @param header   doublewrite page header in the TRX_SYS page */
   inline void init(const byte *header);
 
@@ -84,6 +88,8 @@ class buf_dblwr_t
   bool flush_buffered_writes(const ulint size);
 
 public:
+  /** Initialise the doublewrite buffer data structures. */
+  void init();
   /** Create or restore the doublewrite buffer in the TRX_SYS page.
   @return whether the operation succeeded */
   bool create();
@@ -118,7 +124,7 @@ public:
   void recover();
 
   /** Update the doublewrite buffer on data page write completion. */
-  void write_completed();
+  void write_completed(bool with_doublewrite);
   /** Flush possible buffered writes to persistent storage.
   It is very important to call this function after a batch of writes has been
   posted, and also when we may have to wait for a page latch!
@@ -137,14 +143,14 @@ public:
  @param size   payload size in bytes */
   void add_to_batch(const IORequest &request, size_t size);
 
-  /** Determine whether the doublewrite buffer is initialized */
-  bool is_initialised() const
+  /** Determine whether the doublewrite buffer has been created */
+  bool is_created() const
   { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
 
   /** @return whether a page identifier is part of the doublewrite buffer */
   bool is_inside(const page_id_t id) const
   {
-    if (!is_initialised())
+    if (!is_created())
       return false;
     ut_ad(block1 < block2);
     if (id < block1)
@@ -156,13 +162,44 @@ public:
   /** Wait for flush_buffered_writes() to be fully completed */
   void wait_flush_buffered_writes()
   {
-    if (is_initialised())
-    {
-      mysql_mutex_lock(&mutex);
-      while (batch_running)
-        my_cond_wait(&cond, &mutex.m_mutex);
-      mysql_mutex_unlock(&mutex);
-    }
+    mysql_mutex_lock(&mutex);
+    while (batch_running)
+      my_cond_wait(&cond, &mutex.m_mutex);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  /** Register an unbuffered page write */
+  void add_unbuffered()
+  {
+    mysql_mutex_lock(&mutex);
+    writes_pending++;
+    mysql_mutex_unlock(&mutex);
+  }
+
+  size_t pending_writes()
+  {
+    mysql_mutex_lock(&mutex);
+    const size_t pending{writes_pending};
+    mysql_mutex_unlock(&mutex);
+    return pending;
+  }
+
+  /** Wait for writes_pending to reach 0 */
+  void wait_for_page_writes()
+  {
+    mysql_mutex_lock(&mutex);
+    while (writes_pending)
+      my_cond_wait(&write_cond, &mutex.m_mutex);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  /** Wait for writes_pending to reach 0 */
+  void wait_for_page_writes(const timespec &abstime)
+  {
+    mysql_mutex_lock(&mutex);
+    while (writes_pending)
+      my_cond_timedwait(&write_cond, &mutex.m_mutex, &abstime);
+    mysql_mutex_unlock(&mutex);
   }
 };
 
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index d71a05c0ec9..13a9363922b 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -30,10 +30,8 @@ Created 11/5/1995 Heikki Tuuri
 #include "log0log.h"
 #include "buf0buf.h"
 
-/** Number of pages flushed. Protected by buf_pool.mutex. */
-extern ulint buf_flush_page_count;
 /** Number of pages flushed via LRU. Protected by buf_pool.mutex.
-Also included in buf_flush_page_count. */
+Also included in buf_pool.stat.n_pages_written. */
 extern ulint buf_lru_flush_page_count;
 /** Number of pages freed without flushing. Protected by buf_pool.mutex. */
 extern ulint buf_lru_freed_page_count;
@@ -96,9 +94,8 @@ after releasing buf_pool.mutex.
 @retval 0 if a buf_pool.LRU batch is already running */
 ulint buf_flush_LRU(ulint max_n, bool evict);
 
-/** Wait until a flush batch ends.
-@param lru   true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru);
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
 /** Wait until all persistent pages are flushed up to a limit.
 @param sync_lsn   buf_pool.get_oldest_modification(LSN_MAX) to wait for */
 ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index 8d6b28194dc..d898c5efc63 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -33,10 +33,11 @@ Created 11/5/1995 Heikki Tuuri
 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
 released by the i/o-handler thread.
-@param[in] page_id   page id
-@param[in] zip_size  ROW_FORMAT=COMPRESSED page size, or 0
-@retval DB_SUCCESS if the page was read and is not corrupted,
-@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@param page_id   page id
+@param zip_size  ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
 @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
 after decryption normal page checksum does not match.
 @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 533f595c852..ff6ece8a360 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -1170,7 +1170,7 @@ private:
 inline bool fil_space_t::use_doublewrite() const
 {
   return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf &&
-    buf_dblwr.is_initialised();
+    buf_dblwr.is_created();
 }
 
 inline void fil_space_t::set_imported()
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 9807d9cd9a4..90d3a21f761 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -108,10 +108,6 @@ struct srv_stats_t
	/** Store the number of write requests issued */
	ulint_ctr_1_t		buf_pool_write_requests;
 
-	/** Number of buffer pool reads that led to the reading of
-	a disk page */
-	ulint_ctr_1_t		buf_pool_reads;
-
	/** Number of bytes saved by page compression */
	ulint_ctr_n_t		page_compression_saved;
	/* Number of pages compressed with page compression */
@@ -670,24 +666,12 @@ struct export_var_t{
	char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
	my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
-	ulint innodb_buffer_pool_pages_data;	/*!< Data pages */
	ulint innodb_buffer_pool_bytes_data;	/*!< File bytes used */
-	ulint innodb_buffer_pool_pages_dirty;	/*!< Dirty data pages */
-	ulint innodb_buffer_pool_bytes_dirty;	/*!< File bytes modified */
	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellanous pages */
-	ulint innodb_buffer_pool_pages_free;	/*!< Free pages */
 #ifdef UNIV_DEBUG
	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
 #endif /* UNIV_DEBUG */
-	ulint innodb_buffer_pool_pages_made_not_young;
-	ulint innodb_buffer_pool_pages_made_young;
-	ulint innodb_buffer_pool_pages_old;
-	ulint innodb_buffer_pool_read_requests;	/*!< buf_pool.stat.n_page_gets */
-	ulint innodb_buffer_pool_reads;		/*!< srv_buf_pool_reads */
	ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */
-	ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
-	ulint innodb_buffer_pool_read_ahead;	/*!< srv_read_ahead */
-	ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
	ulint innodb_checkpoint_age;
	ulint innodb_checkpoint_max_age;
	ulint innodb_data_pending_reads;	/*!< Pending reads */
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 70f561280d9..c53e2fd5074 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -1173,14 +1173,6 @@ wait_suspend_loop:
 
	if (!buf_pool.is_initialised()) {
		ut_ad(!srv_was_started);
-	} else if (ulint pending_io = buf_pool.io_pending()) {
-		if (srv_print_verbose_log && count > 600) {
-			ib::info() << "Waiting for " << pending_io << " buffer"
-				" page I/Os to complete";
-			count = 0;
-		}
-
-		goto loop;
	} else {
		buf_flush_buffer_pool();
	}
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index 60fef24d183..b6496d03908 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -909,7 +909,7 @@ static monitor_info_t innodb_counter_info[] =
	 MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX},
 
	{"index_page_splits", "index", "Number of index page splits",
-	 MONITOR_NONE,
+	 MONITOR_EXISTING,
	 MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT},
 
	{"index_page_merge_attempts", "index",
@@ -1411,10 +1411,12 @@ srv_mon_process_existing_counter(
	/* Get the value from corresponding global variable */
	switch (monitor_id) {
 
-	/* export_vars.innodb_buffer_pool_reads. Num Reads from
-	disk (page not in buffer) */
+	case MONITOR_INDEX_SPLIT:
+		value = buf_pool.pages_split;
+		break;
+
	case MONITOR_OVLD_BUF_POOL_READS:
-		value = srv_stats.buf_pool_reads;
+		value = buf_pool.stat.n_pages_read;
		break;
 
	/* innodb_buffer_pool_read_requests, the number of logical
@@ -1475,7 +1477,7 @@ srv_mon_process_existing_counter(
 
	/* innodb_buffer_pool_bytes_dirty */
	case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY:
-		value = buf_pool.stat.flush_list_bytes;
+		value = buf_pool.flush_list_bytes;
		break;
 
	/* innodb_buffer_pool_pages_free */
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index c16868b5cf5..2e9f5a0eff8 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -675,6 +675,7 @@ void srv_boot()
	if (transactional_lock_enabled())
		sql_print_information("InnoDB: Using transactional memory");
 #endif
+	buf_dblwr.init();
	srv_thread_pool_init();
	trx_pool_init();
	srv_init();
@@ -1001,59 +1002,22 @@ srv_export_innodb_status(void)
 
	export_vars.innodb_data_writes = os_n_file_writes;
 
-	ulint dblwr = 0;
-
-	if (buf_dblwr.is_initialised()) {
-		buf_dblwr.lock();
-		dblwr = buf_dblwr.submitted();
-		export_vars.innodb_dblwr_pages_written = buf_dblwr.written();
-		export_vars.innodb_dblwr_writes = buf_dblwr.batches();
-		buf_dblwr.unlock();
-	}
+	buf_dblwr.lock();
+	ulint dblwr = buf_dblwr.submitted();
+	export_vars.innodb_dblwr_pages_written = buf_dblwr.written();
+	export_vars.innodb_dblwr_writes = buf_dblwr.batches();
+	buf_dblwr.unlock();
 
	export_vars.innodb_data_written = srv_stats.data_written + dblwr;
 
-	export_vars.innodb_buffer_pool_read_requests
-		= buf_pool.stat.n_page_gets;
-
	export_vars.innodb_buffer_pool_write_requests =
		srv_stats.buf_pool_write_requests;
 
-	export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
-
-	export_vars.innodb_buffer_pool_read_ahead_rnd =
-		buf_pool.stat.n_ra_pages_read_rnd;
-
-	export_vars.innodb_buffer_pool_read_ahead =
-		buf_pool.stat.n_ra_pages_read;
-
-	export_vars.innodb_buffer_pool_read_ahead_evicted =
-		buf_pool.stat.n_ra_pages_evicted;
-
-	export_vars.innodb_buffer_pool_pages_data =
-		UT_LIST_GET_LEN(buf_pool.LRU);
-
	export_vars.innodb_buffer_pool_bytes_data =
		buf_pool.stat.LRU_bytes
		+ (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
		   << srv_page_size_shift);
 
-	export_vars.innodb_buffer_pool_pages_dirty =
-		UT_LIST_GET_LEN(buf_pool.flush_list);
-
-	export_vars.innodb_buffer_pool_pages_made_young
-		= buf_pool.stat.n_pages_made_young;
-	export_vars.innodb_buffer_pool_pages_made_not_young
-		= buf_pool.stat.n_pages_not_made_young;
-
-	export_vars.innodb_buffer_pool_pages_old = buf_pool.LRU_old_len;
-
-	export_vars.innodb_buffer_pool_bytes_dirty =
-		buf_pool.stat.flush_list_bytes;
-
-	export_vars.innodb_buffer_pool_pages_free =
-		UT_LIST_GET_LEN(buf_pool.free);
-
 #ifdef UNIV_DEBUG
	export_vars.innodb_buffer_pool_pages_latched =
		buf_get_latched_pages_number();
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index b0adc15300c..a881ae0ad6a 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -1997,7 +1997,7 @@ void innodb_shutdown()
	ut_ad(dict_sys.is_initialised() || !srv_was_started);
	ut_ad(trx_sys.is_initialised() || !srv_was_started);
-	ut_ad(buf_dblwr.is_initialised() || !srv_was_started
+	ut_ad(buf_dblwr.is_created() || !srv_was_started
	      || srv_read_only_mode
	      || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
	ut_ad(lock_sys.is_initialised() || !srv_was_started);
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result
index 1b3b43c0304..d3f0ee3bcd9 100644
--- a/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result
+++ b/storage/rocksdb/mysql-test/rocksdb/r/innodb_i_s_tables_disabled.result
@@ -181,7 +181,7 @@ compress_pages_page_decompressed compression 0 NULL NULL NULL 0 NULL NULL NULL N
 compress_pages_page_compression_error compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of page compression errors
 compress_pages_encrypted compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages encrypted
 compress_pages_decrypted compression 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of pages decrypted
-index_page_splits index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of index page splits
+index_page_splits index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 status_counter Number of index page splits
 index_page_merge_attempts index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of index page merge attempts
 index_page_merge_successful index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of successful index page merges
 index_page_reorg_attempts index 0 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL 0 counter Number of index page reorganization attempts
--
cgit v1.2.1
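
Note on the encoding introduced above: buf_pool_t::page_cleaner_status packs the
number of pending LRU flushes together with the PAGE_CLEANER_IDLE and
FLUSH_LIST_ACTIVE flag bits, so n_flush() reduces to an integer division by
LRU_FLUSH. The following standalone C++ sketch is not InnoDB code (the class
name and the main() driver are invented for illustration, and the
flush_list_mutex that protects the real field is omitted); it only demonstrates
the same bit-packing arithmetic under those assumptions:

// Sketch of the page_cleaner_status encoding: bit 0 = "page cleaner idle",
// bit 1 = "flush_list batch active", remaining bits = pending LRU flush count.
#include <cassert>

class cleaner_status_sketch
{
  static constexpr unsigned PAGE_CLEANER_IDLE= 1;
  static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
  static constexpr unsigned LRU_FLUSH= 4;
  unsigned status= 0;
public:
  // pending LRU flushes: the flag bits are below LRU_FLUSH, so they vanish
  // in the integer division
  unsigned n_flush() const { return status / LRU_FLUSH; }
  bool page_cleaner_idle() const { return status & PAGE_CLEANER_IDLE; }
  bool flush_list_active() const { return status & FLUSH_LIST_ACTIVE; }
  void n_flush_inc() { status+= LRU_FLUSH; }
  void n_flush_dec() { assert(n_flush()); status-= LRU_FLUSH; }
  void set_idle(bool idle)
  { status= (status & ~PAGE_CLEANER_IDLE) | (PAGE_CLEANER_IDLE * idle); }
  void flush_list_set_active()
  { assert(!flush_list_active()); status+= FLUSH_LIST_ACTIVE; }
  void flush_list_set_inactive()
  { assert(flush_list_active()); status-= FLUSH_LIST_ACTIVE; }
};

int main()
{
  cleaner_status_sketch s;
  s.n_flush_inc();            // one pending LRU flush
  s.flush_list_set_active();  // a flush_list batch is running
  s.set_idle(true);           // flag bits do not disturb the counter
  assert(s.n_flush() == 1 && s.flush_list_active() && s.page_cleaner_idle());
  s.set_idle(false);
  s.flush_list_set_inactive();
  s.n_flush_dec();
  assert(s.n_flush() == 0);
  return 0;
}

This also shows why any_io_pending() can test page_cleaner_status >
PAGE_CLEANER_IDLE: any value above the idle bit implies either an active
flush_list batch or at least one pending LRU flush.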