Diffstat (limited to 'storage')
42 files changed, 1608 insertions, 1031 deletions
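The CONNECT engine change for MDEV-29687 (storage/connect/tabext.cpp and odbconn.cpp, further down in this diff) makes TDBEXT::MakeSQL() split a qualified TABNAME such as `schema1.space_in_column_name` at the first dot and wrap each component in the data source's identifier quote character. The following is a minimal standalone C++ sketch of that splitting-and-quoting idea, not the engine's actual code; the function name `quote_qualified_name` and the use of std::string are illustrative assumptions (the patch itself works on C buffers and the Query string object).

```cpp
// Illustrative sketch only: quote a possibly schema-qualified name the way the
// MDEV-29687 change quotes ODBC identifiers, e.g. schema1.space_in_column_name
// becomes "schema1"."space_in_column_name" when the quote character is '"'.
#include <iostream>
#include <string>

// quote_ch is assumed to come from the data source; PostgreSQL uses '"'.
static std::string quote_qualified_name(const std::string &name, char quote_ch)
{
  std::string out;
  std::string::size_type dot = name.find('.');
  if (dot != std::string::npos)
  {
    // Qualified name: quote the schema part and the table part separately,
    // keeping the dot outside the quotes.
    out += quote_ch;
    out += name.substr(0, dot);
    out += quote_ch;
    out += '.';
    out += quote_ch;
    out += name.substr(dot + 1);
    out += quote_ch;
  }
  else
  {
    // Plain table name: quote it in case it contains blanks.
    out += quote_ch;
    out += name;
    out += quote_ch;
  }
  return out;
}

int main()
{
  std::cout << quote_qualified_name("schema1.space_in_column_name", '"') << '\n';
  // Prints: "schema1"."space_in_column_name"
}
```

Without this per-component quoting, quoting the whole string as one identifier (`"schema1.space_in_column_name"`) would make the remote server look for a table literally named `schema1.space_in_column_name` instead of table `space_in_column_name` in schema `schema1`, which is what the new test with a space in the column name exercises.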
diff --git a/storage/columnstore/columnstore b/storage/columnstore/columnstore -Subproject ac0741e82d71ea8faf6865c4c6604f02449cfa2 +Subproject 1071ce954807104d25c0951f422e83d5e17406f diff --git a/storage/connect/CMakeLists.txt b/storage/connect/CMakeLists.txt index bad7d916043..4816086f7c8 100644 --- a/storage/connect/CMakeLists.txt +++ b/storage/connect/CMakeLists.txt @@ -69,6 +69,7 @@ IF(UNIX) DISABLE_WARNING("format-truncation") DISABLE_WARNING("implicit-fallthrough") DISABLE_WARNING("type-limits") + DISABLE_WARNING("deprecated-declarations") endif(NOT WITH_WARNINGS) add_definitions( -DUNIX -DLINUX -DUBUNTU ) diff --git a/storage/connect/mysql-test/connect/r/odbc_postgresql.result b/storage/connect/mysql-test/connect/r/odbc_postgresql.result index fd23197c37f..6bd8d75a601 100644 --- a/storage/connect/mysql-test/connect/r/odbc_postgresql.result +++ b/storage/connect/mysql-test/connect/r/odbc_postgresql.result @@ -2,7 +2,7 @@ Table Create Table t1 CREATE TABLE `t1` ( `Name` varchar(256) NOT NULL, `Description` varchar(256) DEFAULT NULL -) ENGINE=CONNECT DEFAULT CHARSET=latin1 `TABLE_TYPE`='ODBC' `CATFUNC`='Sources' +) ENGINE=CONNECT DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci `TABLE_TYPE`='ODBC' `CATFUNC`='Sources' SET NAMES utf8; # # Checking CATFUNC=Tables @@ -15,6 +15,7 @@ Table_Cat Table_Schema Table_Name Table_Type Remark mtr public t1 TABLE mtr public t2 TABLE mtr public v1 VIEW +mtr schema1 space_in_column_name TABLE mtr schema1 t1 TABLE mtr schema1 t2 TABLE mtr schema1 t3 TABLE @@ -27,6 +28,7 @@ Table_Cat Table_Schema Table_Name Table_Type Remark mtr public t1 TABLE mtr public t2 TABLE mtr public v1 VIEW +mtr schema1 space_in_column_name TABLE mtr schema1 t1 TABLE mtr schema1 t2 TABLE mtr schema1 t3 TABLE @@ -39,6 +41,7 @@ Table_Cat Table_Schema Table_Name Table_Type Remark mtr public t1 TABLE mtr public t2 TABLE mtr public v1 VIEW +mtr schema1 space_in_column_name TABLE mtr schema1 t1 TABLE mtr schema1 t2 TABLE mtr schema1 t3 TABLE @@ -102,6 +105,7 @@ Table_Cat Table_Schema Table_Name Column_Name Data_Type Type_Name Column_Size Bu mtr public t1 a 4 int4 10 4 0 10 0 mtr public t2 a 4 int4 10 4 0 10 0 mtr public v1 a 4 int4 10 4 0 10 1 +mtr schema1 space_in_column_name my space column 1 bpchar 20 80 NULL NULL 0 mtr schema1 t1 a 1 bpchar 10 40 NULL NULL 0 mtr schema1 t2 a 1 bpchar 10 40 NULL NULL 0 mtr schema1 t3 a 1 bpchar 10 40 NULL NULL 0 @@ -115,6 +119,7 @@ Table_Cat Table_Schema Table_Name Column_Name Data_Type Type_Name Column_Size Bu mtr public t1 a 4 int4 10 4 0 10 0 mtr public t2 a 4 int4 10 4 0 10 0 mtr public v1 a 4 int4 10 4 0 10 1 +mtr schema1 space_in_column_name my space column 1 bpchar 20 80 NULL NULL 0 mtr schema1 t1 a 1 bpchar 10 40 NULL NULL 0 mtr schema1 t2 a 1 bpchar 10 40 NULL NULL 0 mtr schema1 t3 a 1 bpchar 10 40 NULL NULL 0 @@ -157,7 +162,7 @@ SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( `a` int(10) NOT NULL -) ENGINE=CONNECT DEFAULT CHARSET=latin1 CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' +) ENGINE=CONNECT DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' SELECT * FROM t1; a 10 @@ -168,7 +173,7 @@ SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( `a` int(10) NOT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci SELECT * FROM t2; a 10 @@ -189,7 +194,7 @@ SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( `a` int(10) NOT NULL -) 
ENGINE=CONNECT DEFAULT CHARSET=latin1 CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='public.t1' +) ENGINE=CONNECT DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='public.t1' SELECT * FROM t1; a 10 @@ -202,7 +207,7 @@ SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( `a` char(10) NOT NULL -) ENGINE=CONNECT DEFAULT CHARSET=utf8mb3 CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='schema1.t1' `DATA_CHARSET`='utf8' +) ENGINE=CONNECT DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='schema1.t1' `DATA_CHARSET`='utf8' SELECT * FROM t1; a aaa @@ -213,8 +218,8 @@ CREATE TABLE t2 AS SELECT * FROM t1; SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( - `a` char(10) CHARACTER SET utf8mb3 NOT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 + `a` char(10) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci SELECT * FROM t2; a aaa @@ -237,7 +242,7 @@ SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( `a` char(10) DEFAULT NULL -) ENGINE=CONNECT DEFAULT CHARSET=utf8mb3 CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='schema1.v1' `DATA_CHARSET`='utf8' +) ENGINE=CONNECT DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='schema1.v1' `DATA_CHARSET`='utf8' SELECT * FROM t1; a aaa @@ -248,8 +253,8 @@ CREATE TABLE t2 AS SELECT * FROM t1; SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( - `a` char(10) CHARACTER SET utf8mb3 DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 + `a` char(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci SELECT * FROM t2; a aaa @@ -272,7 +277,7 @@ SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( `a` char(10) NOT NULL -) ENGINE=CONNECT DEFAULT CHARSET=utf8mb3 CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='schema1.t2' `DATA_CHARSET`='utf8' +) ENGINE=CONNECT DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' `TABLE_TYPE`='ODBC' `TABNAME`='schema1.t2' `DATA_CHARSET`='utf8' SELECT * FROM t1; a xxx @@ -283,8 +288,8 @@ CREATE TABLE t2 AS SELECT * FROM t1; SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( - `a` char(10) CHARACTER SET utf8mb3 NOT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 + `a` char(10) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci SELECT * FROM t2; a xxx @@ -306,3 +311,11 @@ DELETE FROM t1 WHERE a='20'; Warnings: Note 1105 schema1.t3: 0 affected rows DROP TABLE t1; +# +# MDEV-29687 ODBC tables do not quote identifier names correctly +# +CREATE TABLE pg_in_maria ENGINE=CONNECT TABNAME='schema1.space_in_column_name' CHARSET=utf8 DATA_CHARSET=utf8 TABLE_TYPE=ODBC CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' quoted=1; +SELECT * from pg_in_maria; +my space column +My value +DROP TABLE pg_in_maria; diff --git a/storage/connect/mysql-test/connect/t/odbc_postgresql.sql b/storage/connect/mysql-test/connect/t/odbc_postgresql.sql index a795817a4d3..9b22c69af13 100644 --- 
a/storage/connect/mysql-test/connect/t/odbc_postgresql.sql +++ b/storage/connect/mysql-test/connect/t/odbc_postgresql.sql @@ -2,7 +2,7 @@ -- The SQL script to create PostgreSQL data for odbc_postgresql.test -- -- Run this script as a admin user: --- psql -U postgres < odbc_postgresql.sql +-- sudo -u postgres psql < storage/connect/mysql-test/connect/t/odbc_postgresql.sql SET NAMES 'UTF8'; @@ -11,7 +11,7 @@ DROP USER IF EXISTS mtr; CREATE USER mtr WITH PASSWORD 'mtr'; CREATE DATABASE mtr OWNER=mtr ENCODING='UTF8'; -GRANT ALL ON DATABASE mtr TO mtr; +GRANT ALL PRIVILEGES ON DATABASE mtr TO mtr; \c mtr SET role mtr; CREATE TABLE t1 (a INT NOT NULL); @@ -27,4 +27,6 @@ CREATE TABLE schema1.t2 (a CHAR(10) NOT NULL); INSERT INTO schema1.t2 VALUES ('xxx'),('yyy'),('zzz'),('ÄÖÜ'); CREATE TABLE schema1.t3 (a CHAR(10) NOT NULL, b CHAR(10) NOT NULL); INSERT INTO schema1.t3 VALUES ('xxx', 'aaa'),('yyy', 'bbb'),('zzz', 'ccc'),('ÄÖÜ', 'яяя'); - +CREATE TABLE schema1.space_in_column_name ("my space column" CHAR(20) NOT NULL); +INSERT INTO schema1.space_in_column_name VALUES ('My value'); +\dt schema1.* diff --git a/storage/connect/mysql-test/connect/t/odbc_postgresql.test b/storage/connect/mysql-test/connect/t/odbc_postgresql.test index 86597423d04..ec98453d630 100644 --- a/storage/connect/mysql-test/connect/t/odbc_postgresql.test +++ b/storage/connect/mysql-test/connect/t/odbc_postgresql.test @@ -5,10 +5,10 @@ # To configure your system to be able to run this test, # follow through the following steps: # -# 1. Install and configure PostgreSQL database to stat on the system startup +# 1. Install and configure PostgreSQL database to start on the system startup # # 2. Create user, database, schema and tables to be used by mtr: -# psql -U postgres < odbc_postgresql.sql +# sudo -u postgres psql < storage/connect/mysql-test/connect/t/odbc_postgresql.sql # # 3. Install PostgreSQL ODBC Driver. # - On CentOS, Fedora: @@ -18,18 +18,23 @@ # # 4. Create a data source with the name "ConnectEnginePostgresql" # - On Windows: use odbcadm.exe -# - On Linux: put these lines into /etc/odbc.ini +# - On Linux: put these lines into /etc/odbc.ini or in ~/.odbc.ini # #[ConnectEnginePostgresql] #Description=PostgreSQL DSN for ConnectSE -#Driver=PostgreSQL (should the path to the driver so file) +#Driver=PostgreSQL (should the path to the driver so file, on linux: /usr/lib/x86_64-linux-gnu/odbc/psqlodbca.so) #Database=mtr #Servername=localhost #Port=5432 # # 5. Allow user "mtr" to connect to the database "mtr" -# Add this line into the begginning of pg_hba.conf -# (usually /var/lib/pgsql/data/pg_hba.conf on Linux): +# Find `pg_hba.conf` file: +# Run `SHOW hba_file;` or `locate pg_hba.conf` to find right location +# (usually /var/lib/pgsql/data/pg_hba.conf or /etc/postgresql/[version]/main/pg_hba.conf on Linux) +# Add this line into the beginning of pg_hba.conf: +# For unix socket connection (connect with `psql -U mtr`) +#local mtr mtr password +# For TCP/IP connection (connect with `psql -U mtr -h 127.0.0.1`) #host mtr mtr 127.0.0.1/32 password # # 6. 
Restart the server: @@ -211,3 +216,10 @@ DROP TABLE t1; CREATE TABLE t1 (a VARCHAR(6), b VARCHAR(6), PRIMARY KEY(a, b)) ENGINE=CONNECT TABNAME='schema1.t3' CHARSET=utf8 DATA_CHARSET=utf8 TABLE_TYPE=ODBC CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr'; DELETE FROM t1 WHERE a='20'; DROP TABLE t1; + +--echo # +--echo # MDEV-29687 ODBC tables do not quote identifier names correctly +--echo # +CREATE TABLE pg_in_maria ENGINE=CONNECT TABNAME='schema1.space_in_column_name' CHARSET=utf8 DATA_CHARSET=utf8 TABLE_TYPE=ODBC CONNECTION='DSN=ConnectEnginePostgresql;UID=mtr;PWD=mtr' quoted=1; +SELECT * from pg_in_maria; +DROP TABLE pg_in_maria; diff --git a/storage/connect/odbconn.cpp b/storage/connect/odbconn.cpp index e93802bfc38..09140712de8 100644 --- a/storage/connect/odbconn.cpp +++ b/storage/connect/odbconn.cpp @@ -1000,6 +1000,11 @@ ODBConn::ODBConn(PGLOBAL g, TDBODBC *tdbp) m_Full = false; m_UseCnc = false; m_IDQuoteChar[0] = '"'; + if (tdbp) + { + if (tdbp->Quoted && tdbp->Quote) + m_IDQuoteChar[0] = *tdbp->Quote; + } m_IDQuoteChar[1] = 0; //*m_ErrMsg = '\0'; } // end of ODBConn @@ -1182,6 +1187,7 @@ int ODBConn::Open(PCSZ ConnectString, POPARM sop, DWORD options) // Verify support for required functionality and cache info // VerifyConnect(); Deprecated GetConnectInfo(); + // Still we want to use the set QChar } catch(DBX *xp) { snprintf(g->Message, sizeof(g->Message), "%s: %s", xp->m_Msg, xp->GetErrorMessage(0)); Close(); diff --git a/storage/connect/tabext.cpp b/storage/connect/tabext.cpp index 44a996243db..dcd93539f19 100644 --- a/storage/connect/tabext.cpp +++ b/storage/connect/tabext.cpp @@ -159,6 +159,9 @@ bool EXTDEF::DefineAM(PGLOBAL g, LPCSTR am, int poff) Maxerr = GetIntCatInfo("Maxerr", 0); Maxres = GetIntCatInfo("Maxres", 0); Quoted = GetIntCatInfo("Quoted", 0); + Qchar = GetStringCatInfo(g,"Qchar", NULL); + if (Qchar && !Quoted) + Quoted = 1; Options = 0; Cto = 0; Qto = 0; @@ -198,6 +201,7 @@ TDBEXT::TDBEXT(EXTDEF *tdp) : TDB(tdp) Cto = tdp->Cto; Qto = tdp->Qto; Quoted = MY_MAX(0, tdp->GetQuoted()); + Quote = tdp->GetQchar(); Rows = tdp->GetElemt(); Memory = tdp->Memory; Scrollable = tdp->Scrollable; @@ -214,12 +218,12 @@ TDBEXT::TDBEXT(EXTDEF *tdp) : TDB(tdp) Cto = 0; Qto = 0; Quoted = 0; + Quote = NULL; Rows = 0; Memory = 0; Scrollable = false; } // endif tdp - Quote = NULL; Query = NULL; Count = NULL; //Where = NULL; @@ -252,6 +256,7 @@ TDBEXT::TDBEXT(PTDBEXT tdbp) : TDB(tdbp) Cto = tdbp->Cto; Qto = tdbp->Qto; Quoted = tdbp->Quoted; + Quote = tdbp->Quote; Rows = tdbp->Rows; Memory = tdbp->Memory; Scrollable = tdbp->Scrollable; @@ -389,6 +394,8 @@ bool TDBEXT::MakeSQL(PGLOBAL g, bool cnt) int len; bool first = true; PCOL colp; + char *res= NULL, *my_schema_table= NULL; + size_t my_len= 0; if (Srcdef) return MakeSrcdef(g); @@ -458,10 +465,37 @@ bool TDBEXT::MakeSQL(PGLOBAL g, bool cnt) Decode(TableName, buf, sizeof(buf)); if (Quote) { - // Put table name between identifier quotes in case in contains blanks - Query->Append(Quote); - Query->Append(buf); - Query->Append(Quote); + // Tabname can have both database and table identifiers, we need to parse + if (res= strstr(buf, ".")) + { + // Parse schema + my_len= res - buf + 1; + my_schema_table= (char *) malloc(my_len); + memcpy(my_schema_table, buf, my_len - 1); + my_schema_table[my_len] = 0; + Query->Append(Quote); + Query->Append(my_schema_table); + Query->Append(Quote); + free(my_schema_table); + Query->Append("."); + // Parse table + my_len= strlen(buf) - my_len + 1; + my_schema_table= (char *) malloc(my_len); + 
memcpy(my_schema_table, ++res, my_len); + my_schema_table[my_len] = 0; + Query->Append(Quote); + Query->Append(my_schema_table); + Query->Append(Quote); + free(my_schema_table); + } + else + { + // Put table name between identifier quotes in case in contains blanks + Query->Append(Quote); + Query->Append(buf); + Query->Append(Quote); + } + } else Query->Append(buf); diff --git a/storage/connect/tabext.h b/storage/connect/tabext.h index 5fef1b9ece0..8a0d6c784a5 100644 --- a/storage/connect/tabext.h +++ b/storage/connect/tabext.h @@ -68,6 +68,7 @@ public: inline PSZ GetSrcdef(void) { return Srcdef; } inline char GetSep(void) { return (Sep) ? *Sep : 0; } inline int GetQuoted(void) { return Quoted; } + inline PSZ GetQchar(void) { return Qchar; } inline int GetOptions(void) { return Options; } // Methods diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index d0194ed49a2..7e938010860 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -326,7 +326,6 @@ SET(INNOBASE_SOURCES include/row0row.h include/row0row.inl include/row0sel.h - include/row0sel.inl include/row0types.h include/row0uins.h include/row0umod.h diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 22d063fe68f..8f9984acb98 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -1102,6 +1102,7 @@ dberr_t dict_index_t::clear(que_thr_t *thr) mtr.set_log_mode(MTR_LOG_NO_REDO); else set_modified(mtr); + mtr_sx_lock_index(this, &mtr); dberr_t err; if (buf_block_t *root_block= @@ -5278,11 +5279,6 @@ btr_validate_index( dict_index_t* index, /*!< in: index */ const trx_t* trx) /*!< in: transaction or NULL */ { - /* Full Text index are implemented by auxiliary tables, not the B-tree */ - if (index->online_status != ONLINE_INDEX_COMPLETE || - (index->type & (DICT_FTS | DICT_CORRUPT))) - return DB_SUCCESS; - const bool lockout= index->is_spatial(); mtr_t mtr; diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc index e63ba450ea1..9a7eff2142f 100644 --- a/storage/innobase/data/data0data.cc +++ b/storage/innobase/data/data0data.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -547,7 +547,6 @@ dtuple_convert_big_rec( big_rec_t* vector; dfield_t* dfield; ulint size; - ulint n_fields; ulint local_prefix_len; if (!dict_index_is_clust(index)) { @@ -583,7 +582,7 @@ dtuple_convert_big_rec( a variable-length field that yields the biggest savings when stored externally */ - n_fields = 0; + ut_d(ulint n_fields = 0); uint16_t longest_i; ulint longest; @@ -699,9 +698,8 @@ ext_write: dfield_set_data(dfield, data, local_len); dfield_set_ext(dfield); - n_fields++; (*n_ext)++; - ut_ad(n_fields < dtuple_get_n_fields(entry)); + ut_ad(++n_fields < dtuple_get_n_fields(entry)); if (upd && !upd->is_modified(longest_i)) { diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc index 9abe6a20589..ea2914e52dc 100644 --- a/storage/innobase/dict/dict0defrag_bg.cc +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -297,7 +297,7 @@ btr_get_size_and_reserved( { ulint dummy; - ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK)); ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); if (index->page == FIL_NULL @@ -314,7 +314,7 @@ btr_get_size_and_reserved( return ULINT_UNDEFINED; } - mtr->x_lock_space(index->table->space); + mtr->s_lock_space(index->table->space); ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + root->page.frame, used, mtr); @@ -345,7 +345,7 @@ dict_stats_save_defrag_stats( mtr_t mtr; ulint n_leaf_pages; mtr.start(); - mtr_s_lock_index(index, &mtr); + mtr_sx_lock_index(index, &mtr); ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, &n_leaf_pages, &mtr); mtr.commit(); diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 46ca661b156..84fac55a304 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -33,6 +33,7 @@ Created Jan 06, 2010 Vasil Dimov #include <mysql_com.h> #include "log.h" #include "btr0btr.h" +#include "que0que.h" #include <algorithm> #include <map> @@ -1035,6 +1036,12 @@ struct index_field_stats_t n_non_null_key_vals(n_non_null_key_vals) { } + + bool is_bulk_operation() const + { + return n_diff_key_vals == UINT64_MAX && + n_sample_sizes == UINT64_MAX && n_non_null_key_vals == UINT64_MAX; + } }; /*******************************************************************//** @@ -1376,13 +1383,16 @@ relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics before the Persistent Statistics feature was introduced. This function doesn't update the defragmentation related stats. -Only persistent statistics supports defragmentation stats. */ +Only persistent statistics supports defragmentation stats. 
+@return error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ static -void +dberr_t dict_stats_update_transient_for_index( /*==================================*/ dict_index_t* index) /*!< in/out: index */ { + dberr_t err = DB_SUCCESS; if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO || !dict_index_is_clust(index))) { @@ -1396,6 +1406,7 @@ dummy_empty: index->table->stats_mutex_lock(); dict_stats_empty_index(index, false); index->table->stats_mutex_unlock(); + return err; #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG } else if (ibuf_debug && !dict_index_is_clust(index)) { goto dummy_empty; @@ -1407,7 +1418,8 @@ dummy_empty: mtr_t mtr; mtr.start(); - mtr_s_lock_index(index, &mtr); + mtr_sx_lock_index(index, &mtr); + dberr_t err; buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); @@ -1419,10 +1431,11 @@ invalid: const auto bulk_trx_id = index->table->bulk_trx_id; if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) { + err= DB_SUCCESS_LOCKED_REC; goto invalid; } - mtr.x_lock_space(index->table->space); + mtr.s_lock_space(index->table->space); ulint dummy, size; index->stat_index_size @@ -1460,6 +1473,8 @@ invalid: } } } + + return err; } /*********************************************************************//** @@ -1467,9 +1482,11 @@ Calculates new estimates for table and index statistics. This function is relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics before the -Persistent Statistics feature was introduced. */ +Persistent Statistics feature was introduced. +@return error code +@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */ static -void +dberr_t dict_stats_update_transient( /*========================*/ dict_table_t* table) /*!< in/out: table */ @@ -1478,6 +1495,7 @@ dict_stats_update_transient( dict_index_t* index; ulint sum_of_index_sizes = 0; + dberr_t err = DB_SUCCESS; /* Find out the sizes of the indexes and how many different values for the key they approximately have */ @@ -1486,15 +1504,15 @@ dict_stats_update_transient( if (!table->space) { /* Nothing to do. */ +empty_table: dict_stats_empty_table(table, true); - return; + return err; } else if (index == NULL) { /* Table definition is corrupt */ ib::warn() << "Table " << table->name << " has no indexes. 
Cannot calculate statistics."; - dict_stats_empty_table(table, true); - return; + goto empty_table; } for (; index != NULL; index = dict_table_get_next_index(index)) { @@ -1506,14 +1524,15 @@ dict_stats_update_transient( } if (dict_stats_should_ignore_index(index) - || !index->is_readable()) { + || !index->is_readable() + || err == DB_SUCCESS_LOCKED_REC) { index->table->stats_mutex_lock(); dict_stats_empty_index(index, false); index->table->stats_mutex_unlock(); continue; } - dict_stats_update_transient_for_index(index); + err = dict_stats_update_transient_for_index(index); sum_of_index_sizes += index->stat_index_size; } @@ -1537,6 +1556,8 @@ dict_stats_update_transient( table->stat_initialized = TRUE; table->stats_mutex_unlock(); + + return err; } /* @{ Pseudo code about the relation between the following functions @@ -2419,6 +2440,19 @@ struct index_stats_t for (ulint i= 0; i < n_uniq; ++i) stats.push_back(index_field_stats_t{0, 1, 0}); } + + void set_bulk_operation() + { + memset((void*) &stats[0], 0xff, stats.size() * sizeof stats[0]); + } + + bool is_bulk_operation() const + { + for (auto &s : stats) + if (!s.is_bulk_operation()) + return false; + return true; + } }; /** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[]. @@ -2525,9 +2559,9 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index) DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name()); mtr.start(); - mtr_s_lock_index(index, &mtr); + mtr_sx_lock_index(index, &mtr); dberr_t err; - buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); if (!root) { empty_index: mtr.commit(); @@ -2536,7 +2570,7 @@ empty_index: } uint16_t root_level = btr_page_get_level(root->page.frame); - mtr.x_lock_space(index->table->space); + mtr.s_lock_space(index->table->space); ulint dummy, size; result.index_size = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF @@ -2547,8 +2581,7 @@ empty_index: const auto bulk_trx_id = index->table->bulk_trx_id; if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) { - result.index_size = 1; - result.n_leaf_pages = 1; + result.set_bulk_operation(); goto empty_index; } @@ -2810,7 +2843,8 @@ found_level: Calculates new estimates for table and index statistics. This function is relatively slow and is used to calculate persistent statistics that will be saved on disk. -@return DB_SUCCESS or error code */ +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ static dberr_t dict_stats_update_persistent( @@ -2844,6 +2878,10 @@ dict_stats_update_persistent( index_stats_t stats = dict_stats_analyze_index(index); + if (stats.is_bulk_operation()) { + return DB_SUCCESS_LOCKED_REC; + } + table->stats_mutex_lock(); index->stat_index_size = stats.index_size; index->stat_n_leaf_pages = stats.n_leaf_pages; @@ -3842,7 +3880,8 @@ dict_stats_update_for_index( /*********************************************************************//** Calculates new estimates for table and index statistics. The statistics are used in query optimization. 
-@return DB_SUCCESS or error code */ +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ dberr_t dict_stats_update( /*==============*/ @@ -3868,7 +3907,7 @@ dict_stats_update( if (trx_id_t bulk_trx_id = table->bulk_trx_id) { if (trx_sys.find(nullptr, bulk_trx_id, false)) { dict_stats_empty_table(table, false); - return DB_SUCCESS; + return DB_SUCCESS_LOCKED_REC; } } @@ -4062,9 +4101,7 @@ dict_stats_update( } transient: - dict_stats_update_transient(table); - - return(DB_SUCCESS); + return dict_stats_update_transient(table); } /** Execute DELETE FROM mysql.innodb_table_stats diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index 833d99cdaee..a66aac226a3 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -339,8 +339,9 @@ invalid_table_id: const bool update_now= difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL; - if (update_now) - dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); + const dberr_t err= update_now + ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT) + : DB_SUCCESS_LOCKED_REC; dict_table_close(table, false, thd, mdl); @@ -361,7 +362,7 @@ done: ut_ad(i->state == recalc::IN_PROGRESS); recalc_pool.erase(i); const bool reschedule= !update_now && recalc_pool.empty(); - if (!update_now) + if (err == DB_SUCCESS_LOCKED_REC) recalc_pool.emplace_back(recalc{table_id, recalc::IDLE}); mysql_mutex_unlock(&recalc_pool_mutex); if (reschedule) diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc index bb42f7c9f54..74d02d63817 100644 --- a/storage/innobase/fts/fts0ast.cc +++ b/storage/innobase/fts/fts0ast.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. +Copyright (c) 2018, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -28,6 +28,7 @@ Created 2007/3/16 Sunny Bains. #include "fts0ast.h" #include "fts0pars.h" #include "fts0fts.h" +#include "trx0trx.h" /* The FTS ast visit pass. */ enum fts_ast_visit_pass_t { diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index 59ad44ec093..88723ce5e8a 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -896,7 +896,7 @@ rtr_page_split_and_insert( lock_prdt_t new_prdt; rec_t* first_rec = NULL; int first_rec_group = 1; - ulint n_iterations = 0; + IF_DBUG(bool iterated = false,); if (!*heap) { *heap = mem_heap_create(1024); @@ -1128,7 +1128,7 @@ corrupted: the page, and it'll need the second round split in this case. We test this scenario here*/ DBUG_EXECUTE_IF("rtr_page_need_second_split", - if (n_iterations == 0) { + if (!iterated) { rec = NULL; goto after_insert; } ); @@ -1198,7 +1198,7 @@ after_insert: parent. 
*/ rtr_clean_rtr_info(cursor->rtr_info, true); cursor->rtr_info = NULL; - n_iterations++; + IF_DBUG(iterated=true,); rec_t* i_rec = page_rec_get_next(page_get_infimum_rec( buf_block_get_frame(block))); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 371636a435a..e40547777b2 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1772,15 +1772,6 @@ MYSQL_THD innobase_create_background_thd(const char* name) return thd; } -extern "C" void thd_increment_pending_ops(MYSQL_THD); - -THD *innodb_thd_increment_pending_ops(THD *thd) -{ - if (!thd || THDVAR(thd, background_thread)) - return nullptr; - thd_increment_pending_ops(thd); - return thd; -} /** Close opened tables, free memory, delete items for a MYSQL_THD. @param[in] thd MYSQL_THD to reset */ @@ -3611,7 +3602,7 @@ ha_innobase::init_table_handle_for_HANDLER(void) innobase_register_trx(ht, m_user_thd, m_prebuilt->trx); /* We did the necessary inits in this function, no need to repeat them - in row_search_for_mysql */ + in row_search_mvcc() */ m_prebuilt->sql_stat_start = FALSE; @@ -7349,7 +7340,7 @@ ha_innobase::build_template( /* We must at least fetch all primary key cols. Note that if the clustered index was internally generated by InnoDB on the row id (no primary key was - defined), then row_search_for_mysql() will always + defined), then row_search_mvcc() will always retrieve the row id to a special buffer in the m_prebuilt struct. */ @@ -7364,6 +7355,11 @@ ha_innobase::build_template( m_prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID); m_prebuilt->need_to_access_clustered = (index == clust_index); + if (m_prebuilt->in_fts_query) { + /* Do clustered index lookup to fetch the FTS_DOC_ID */ + m_prebuilt->need_to_access_clustered = true; + } + /* Either m_prebuilt->index should be a secondary index, or it should be the clustered index. */ ut_ad(dict_index_is_clust(index) == (index == clust_index)); @@ -8881,7 +8877,7 @@ statement issued by the user. We also increment trx->n_mysql_tables_in_use. instructions to m_prebuilt->template of the table handle instance in ::index_read. The template is used to save CPU time in large joins. - 3) In row_search_for_mysql, if m_prebuilt->sql_stat_start is true, we + 3) In row_search_mvcc(), if m_prebuilt->sql_stat_start is true, we allocate a new consistent read view for the trx if it does not yet have one, or in the case of a locking read, set an InnoDB 'intention' table level lock on the table. @@ -9183,7 +9179,7 @@ ha_innobase::change_active_index( } /* The caller seems to ignore this. Thus, we must check - this again in row_search_for_mysql(). */ + this again in row_search_mvcc(). 
*/ DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY, 0, NULL)); } @@ -9783,9 +9779,9 @@ next_record: int error; - switch (dberr_t ret = row_search_for_mysql(buf, PAGE_CUR_GE, - m_prebuilt, - ROW_SEL_EXACT, 0)) { + switch (dberr_t ret = row_search_mvcc(buf, PAGE_CUR_GE, + m_prebuilt, + ROW_SEL_EXACT, 0)) { case DB_SUCCESS: error = 0; table->status = 0; @@ -15127,8 +15123,10 @@ ha_innobase::check( DBUG_ENTER("ha_innobase::check"); DBUG_ASSERT(thd == ha_thd()); + DBUG_ASSERT(thd == m_user_thd); ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); ut_a(m_prebuilt->trx == thd_to_trx(thd)); + ut_ad(m_prebuilt->trx->mysql_thd == thd); if (m_prebuilt->mysql_template == NULL) { /* Build the template; we will use a dummy template @@ -15138,7 +15136,6 @@ ha_innobase::check( } if (!m_prebuilt->table->space) { - ib_senderrf( thd, IB_LOG_LEVEL_ERROR, @@ -15146,10 +15143,7 @@ ha_innobase::check( table->s->table_name.str); DBUG_RETURN(HA_ADMIN_CORRUPT); - - } else if (!m_prebuilt->table->is_readable() && - !m_prebuilt->table->space) { - + } else if (!m_prebuilt->table->is_readable()) { ib_senderrf( thd, IB_LOG_LEVEL_ERROR, ER_TABLESPACE_MISSING, @@ -15167,9 +15161,13 @@ ha_innobase::check( of records in some index; to play safe, we normally use REPEATABLE READ here */ m_prebuilt->trx->isolation_level = high_level_read_only + && !m_prebuilt->table->is_temporary() ? TRX_ISO_READ_UNCOMMITTED : TRX_ISO_REPEATABLE_READ; + trx_start_if_not_started(m_prebuilt->trx, false); + m_prebuilt->trx->read_view.open(m_prebuilt->trx); + for (dict_index_t* index = dict_table_get_first_index(m_prebuilt->table); index; @@ -15178,25 +15176,22 @@ ha_innobase::check( if (!index->is_committed()) { continue; } + if (index->type & DICT_FTS) { + /* We do not check any FULLTEXT INDEX. */ + continue; + } - if (!(check_opt->flags & T_QUICK) - && !index->is_corrupted()) { - - dberr_t err = btr_validate_index( - index, m_prebuilt->trx); - - if (err != DB_SUCCESS) { - is_ok = false; - - push_warning_printf( - thd, - Sql_condition::WARN_LEVEL_WARN, - ER_NOT_KEYFILE, - "InnoDB: The B-tree of" - " index %s is corrupted.", - index->name()); - continue; - } + if ((check_opt->flags & T_QUICK) || index->is_corrupted()) { + } else if (btr_validate_index(index, m_prebuilt->trx) + != DB_SUCCESS) { + is_ok = false; + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + continue; } /* Instead of invoking change_active_index(), set up @@ -15218,7 +15213,7 @@ ha_innobase::check( if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) { if (index->is_corrupted()) { push_warning_printf( - m_user_thd, + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_INDEX_CORRUPT, "InnoDB: Index %s is marked as" @@ -15227,7 +15222,7 @@ ha_innobase::check( is_ok = false; } else { push_warning_printf( - m_user_thd, + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TABLE_DEF_CHANGED, "InnoDB: Insufficient history for" @@ -15240,18 +15235,22 @@ ha_innobase::check( m_prebuilt->sql_stat_start = TRUE; m_prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; m_prebuilt->n_template = 0; - m_prebuilt->need_to_access_clustered = FALSE; + m_prebuilt->read_just_key = 0; + m_prebuilt->autoinc_error = DB_SUCCESS; + m_prebuilt->need_to_access_clustered = + !!(check_opt->flags & T_EXTEND); dtuple_set_n_fields(m_prebuilt->search_tuple, 0); m_prebuilt->select_lock_type = LOCK_NONE; /* Scan this index. 
*/ - if (dict_index_is_spatial(index)) { + if (index->is_spatial()) { ret = row_count_rtree_recs(m_prebuilt, &n_rows); + } else if (index->type & DICT_FTS) { + ret = DB_SUCCESS; } else { - ret = row_scan_index_for_mysql( - m_prebuilt, index, &n_rows); + ret = row_check_index(m_prebuilt, &n_rows); } DBUG_EXECUTE_IF( @@ -15260,11 +15259,18 @@ ha_innobase::check( ret = DB_CORRUPTION; }); - if (ret == DB_INTERRUPTED || thd_killed(m_user_thd)) { + if (ret == DB_INTERRUPTED || thd_killed(thd)) { /* Do not report error since this could happen during shutdown */ break; } + + if (ret == DB_SUCCESS + && m_prebuilt->autoinc_error != DB_MISSING_HISTORY) { + /* See if any non-fatal errors were reported. */ + ret = m_prebuilt->autoinc_error; + } + if (ret != DB_SUCCESS) { /* Assume some kind of corruption. */ push_warning_printf( diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index d9a26226e98..da9dec05827 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -199,13 +199,6 @@ but can be used for comparison. extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd); -/** - Indicate the start of an async operation in a foreground thread. -@param thd current_thd -@return thd -@retval nullptr if this is not a foreground thread */ -THD *innodb_thd_increment_pending_ops(THD *thd); - /** Determines the current SQL statement. Thread unsafe, can only be called from the thread owning the THD. @param[in] thd MySQL thread handle diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h index bc02fc065f5..e002f1b77e1 100644 --- a/storage/innobase/include/read0types.h +++ b/storage/innobase/include/read0types.h @@ -122,19 +122,6 @@ loop: /** - Check whether transaction id is valid. - @param[in] id transaction id to check - @param[in] name table name - - @todo changes_visible() was an unfortunate choice for this check. - It should be moved towards the functions that load trx id like - trx_read_trx_id(). No need to issue a warning, error log message should - be enough. Although statement should ideally fail if it sees corrupt - data. - */ - static void check_trx_id_sanity(trx_id_t id, const table_name_t &name); - - /** Check whether the changes by id are visible. @param[in] id transaction id to check against the view @return whether the view sees the modifications of id. @@ -150,26 +137,6 @@ loop: } /** - Check whether the changes by id are visible. - @param[in] id transaction id to check against the view - @param[in] name table name - @return whether the view sees the modifications of id. - */ - bool changes_visible(trx_id_t id, const table_name_t &name) const - MY_ATTRIBUTE((warn_unused_result)) - { - if (id >= m_low_limit_id) - { - check_trx_id_sanity(id, name); - return false; - } - return id < m_up_limit_id || - m_ids.empty() || - !std::binary_search(m_ids.begin(), m_ids.end(), id); - } - - - /** @param id transaction to check @return true if view sees transaction id */ @@ -180,6 +147,13 @@ loop: /** @return the low limit id */ trx_id_t low_limit_id() const { return m_low_limit_id; } + + /** Clamp the low limit id for purge_sys.end_view */ + void clamp_low_limit_id(trx_id_t limit) + { + if (m_low_limit_id > limit) + m_low_limit_id= limit; + } }; @@ -250,7 +224,6 @@ public: */ void set_creator_trx_id(trx_id_t id) { - ut_ad(id > 0); ut_ad(m_creator_trx_id == 0); m_creator_trx_id= id; } @@ -275,8 +248,6 @@ public: A wrapper around ReadViewBase::changes_visible(). 
Intended to be called by the ReadView owner thread. */ - bool changes_visible(trx_id_t id, const table_name_t &name) const - { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); } bool changes_visible(trx_id_t id) const { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); } diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 3c624621b1d..a49e2c3f441 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -263,7 +263,7 @@ row_update_for_mysql( /** This can only be used when the current transaction is at READ COMMITTED or READ UNCOMMITTED isolation level. -Before calling this function row_search_for_mysql() must have +Before calling this function row_search_mvcc() must have initialized prebuilt->new_rec_locks to store the information which new record locks really were set. This function removes a newly set clustered index record lock under prebuilt->pcur or @@ -382,22 +382,6 @@ row_rename_table_for_mysql( FOREIGN KEY constraints */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/*********************************************************************//** -Scans an index for either COOUNT(*) or CHECK TABLE. -If CHECK TABLE; Checks that the index contains entries in an ascending order, -unique constraint is not broken, and calculates the number of index entries -in the read view of the current transaction. -@return DB_SUCCESS or other error */ -dberr_t -row_scan_index_for_mysql( -/*=====================*/ - row_prebuilt_t* prebuilt, /*!< in: prebuilt struct - in MySQL handle */ - const dict_index_t* index, /*!< in: index */ - ulint* n_rows) /*!< out: number of entries - seen in the consistent read */ - MY_ATTRIBUTE((warn_unused_result)); - /* A struct describing a place for an individual column in the MySQL row format which is presented to the table handler in ha_innobase. This template struct is used to speed up row transformations between @@ -606,7 +590,7 @@ struct row_prebuilt_t { ROW_READ_TRY_SEMI_CONSISTENT and to simply skip the row. If the row matches, the next call to - row_search_for_mysql() will lock + row_search_mvcc() will lock the row. 
This eliminates lock waits in some cases; note that this breaks @@ -615,7 +599,7 @@ struct row_prebuilt_t { the session is using READ COMMITTED or READ UNCOMMITTED isolation level, set in - row_search_for_mysql() if we set a new + row_search_mvcc() if we set a new record lock on the secondary or clustered index; this is used in row_unlock_for_mysql() @@ -847,7 +831,7 @@ innobase_rename_vc_templ( #define ROW_MYSQL_REC_FIELDS 1 #define ROW_MYSQL_NO_TEMPLATE 2 #define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in - row_scan_and_check_index */ + row_check_index() */ /* Values for hint_need_to_fetch_extra_cols */ #define ROW_RETRIEVE_PRIMARY_KEY 1 diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h index eb83a4bcad6..8134c60fe72 100644 --- a/storage/innobase/include/row0sel.h +++ b/storage/innobase/include/row0sel.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,8 +24,7 @@ Select Created 12/19/1997 Heikki Tuuri *******************************************************/ -#ifndef row0sel_h -#define row0sel_h +#pragma once #include "data0data.h" #include "que0types.h" @@ -58,15 +57,6 @@ void sel_col_prefetch_buf_free( /*======================*/ sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */ -/*********************************************************************//** -Gets the plan node for the nth table in a join. -@return plan node */ -UNIV_INLINE -plan_t* -sel_node_get_nth_plan( -/*==================*/ - sel_node_t* node, /*!< in: select node */ - ulint i); /*!< in: get ith plan node */ /**********************************************************************//** Performs a select step. This is a high-level function used in SQL execution graphs. @@ -76,14 +66,6 @@ row_sel_step( /*=========*/ que_thr_t* thr); /*!< in: query thread */ /**********************************************************************//** -Performs an execution step of an open or close cursor statement node. -@return query thread to run next or NULL */ -UNIV_INLINE -que_thr_t* -open_step( -/*======*/ - que_thr_t* thr); /*!< in: query thread */ -/**********************************************************************//** Performs a fetch for a cursor. @return query thread to run next or NULL */ que_thr_t* @@ -136,37 +118,7 @@ row_sel_convert_mysql_key_to_innobase( ulint key_len); /*!< in: MySQL key value length */ -/** Searches for rows in the database. This is used in the interface to -MySQL. This function opens a cursor, and also implements fetch next -and fetch prev. NOTE that if we do a search with a full key value -from a unique index (ROW_SEL_EXACT), then we will not store the cursor -position and fetch next or fetch prev must not be tried to the cursor! 
- -@param[out] buf buffer for the fetched row in MySQL format -@param[in] mode search mode PAGE_CUR_L -@param[in,out] prebuilt prebuilt struct for the table handler; - this contains the info to search_tuple, - index; if search tuple contains 0 field then - we position the cursor at start or the end of - index, depending on 'mode' -@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX -@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; - Note: if this is != 0, then prebuilt must has a - pcur with stored position! In opening of a - cursor 'direction' should be 0. -@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, -DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ -UNIV_INLINE -dberr_t -row_search_for_mysql( - byte* buf, - page_cur_mode_t mode, - row_prebuilt_t* prebuilt, - ulint match_mode, - ulint direction) - MY_ATTRIBUTE((warn_unused_result)); - -/** Searches for rows in the database using cursor. +/** Search for rows in the database using cursor. Function is mainly used for tables that are shared across connections and so it employs technique that can help re-construct the rows that transaction is suppose to see. @@ -184,7 +136,8 @@ It also has optimization such as pre-caching the rows, using AHI, etc. Note: if this is != 0, then prebuilt must has a pcur with stored position! In opening of a cursor 'direction' should be 0. -@return DB_SUCCESS or error code */ +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ dberr_t row_search_mvcc( byte* buf, @@ -210,6 +163,21 @@ row_count_rtree_recs( ulint* n_rows); /*!< out: number of entries seen in the consistent read */ +/** +Check the index records in CHECK TABLE. +The index must contain entries in an ascending order, +unique constraint must not be violated by duplicated keys, +and the number of index entries is counted in according to the +current read view. + +@param prebuilt index and transaction +@param n_rows number of records counted + +@return error code +@retval DB_SUCCESS if no error was found */ +dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + /** Read the max AUTOINC value from an index. @param[in] index index starting with an AUTO_INCREMENT column @return the largest AUTO_INCREMENT value @@ -382,6 +350,17 @@ struct sel_node_t{ fetches */ }; +/** +Get the plan node for a table in a join. +@param node query graph node for SELECT +@param i plan node element +@return ith plan node */ +inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i) +{ + ut_ad(i < node->n_tables); + return &node->plans[i]; +} + /** Fetch statement node */ struct fetch_node_t{ que_common_t common; /*!< type: QUE_NODE_FETCH */ @@ -476,7 +455,3 @@ row_sel_field_store_in_mysql_format_func( #endif /* UNIV_DEBUG */ const byte* data, /*!< in: data to store */ ulint len); /*!< in: length of the data */ - -#include "row0sel.inl" - -#endif diff --git a/storage/innobase/include/row0sel.inl b/storage/innobase/include/row0sel.inl deleted file mode 100644 index 7880605ca8f..00000000000 --- a/storage/innobase/include/row0sel.inl +++ /dev/null @@ -1,138 +0,0 @@ -/***************************************************************************** - -Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. 
- -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/row0sel.ic -Select - -Created 12/19/1997 Heikki Tuuri -*******************************************************/ - -#include "que0que.h" - -/*********************************************************************//** -Gets the plan node for the nth table in a join. -@return plan node */ -UNIV_INLINE -plan_t* -sel_node_get_nth_plan( -/*==================*/ - sel_node_t* node, /*!< in: select node */ - ulint i) /*!< in: get ith plan node */ -{ - ut_ad(i < node->n_tables); - - return(node->plans + i); -} - -/*********************************************************************//** -Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means -that it will start fetching from the start of the result set again, regardless -of where it was before, and it will set intention locks on the tables. */ -UNIV_INLINE -void -sel_node_reset_cursor( -/*==================*/ - sel_node_t* node) /*!< in: select node */ -{ - node->state = SEL_NODE_OPEN; -} - -/**********************************************************************//** -Performs an execution step of an open or close cursor statement node. -@return query thread to run next or NULL */ -UNIV_INLINE -que_thr_t* -open_step( -/*======*/ - que_thr_t* thr) /*!< in: query thread */ -{ - sel_node_t* sel_node; - open_node_t* node; - ulint err; - - ut_ad(thr); - - node = (open_node_t*) thr->run_node; - ut_ad(que_node_get_type(node) == QUE_NODE_OPEN); - - sel_node = node->cursor_def; - - err = DB_SUCCESS; - - if (node->op_type == ROW_SEL_OPEN_CURSOR) { - - /* if (sel_node->state == SEL_NODE_CLOSED) { */ - - sel_node_reset_cursor(sel_node); - /* } else { - err = DB_ERROR; - } */ - } else { - if (sel_node->state != SEL_NODE_CLOSED) { - - sel_node->state = SEL_NODE_CLOSED; - } else { - err = DB_ERROR; - } - } - - if (err != DB_SUCCESS) { - /* SQL error detected */ - fprintf(stderr, "SQL error %lu\n", (ulong) err); - - ut_error; - } - - thr->run_node = que_node_get_parent(node); - - return(thr); -} - - -/** Searches for rows in the database. This is used in the interface to -MySQL. This function opens a cursor, and also implements fetch next -and fetch prev. NOTE that if we do a search with a full key value -from a unique index (ROW_SEL_EXACT), then we will not store the cursor -position and fetch next or fetch prev must not be tried to the cursor! 
- -@param[out] buf buffer for the fetched row in MySQL format -@param[in] mode search mode PAGE_CUR_L -@param[in,out] prebuilt prebuilt struct for the table handler; - this contains the info to search_tuple, - index; if search tuple contains 0 field then - we position the cursor at start or the end of - index, depending on 'mode' -@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX -@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; - Note: if this is != 0, then prebuilt must has a - pcur with stored position! In opening of a - cursor 'direction' should be 0. -@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, -DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ -UNIV_INLINE -dberr_t -row_search_for_mysql( - byte* buf, - page_cur_mode_t mode, - row_prebuilt_t* prebuilt, - ulint match_mode, - ulint direction) -{ - return(row_search_mvcc(buf, mode, prebuilt, match_mode, direction)); -} diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h index d47ec793f89..cc05df395ea 100644 --- a/storage/innobase/include/row0upd.h +++ b/storage/innobase/include/row0upd.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2020, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -118,14 +118,6 @@ row_upd_changes_field_size_or_external( dict_index_t* index, /*!< in: index */ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ const upd_t* update);/*!< in: update vector */ -/***********************************************************//** -Returns true if row update contains disowned external fields. -@return true if the update contains disowned external fields. */ -bool -row_upd_changes_disowned_external( -/*==============================*/ - const upd_t* update) /*!< in: update vector */ - MY_ATTRIBUTE((nonnull, warn_unused_result)); /***************************************************************//** Builds an update vector from those fields which in a secondary index entry diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h index e05b18a8ccc..60f310e1b0f 100644 --- a/storage/innobase/include/row0vers.h +++ b/storage/innobase/include/row0vers.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -55,7 +55,7 @@ row_vers_impl_x_locked( const rec_offs* offsets); /** Finds out if a version of the record, where the version >= the current -purge view, should have ientry as its secondary index entry. We check +purge_sys.view, should have ientry as its secondary index entry. We check if there is any not delete marked version of the record where the trx id >= purge view, and the secondary index entry == ientry; exactly in this case we return TRUE. @@ -85,7 +85,9 @@ row_vers_old_has_index_entry( Constructs the version of a clustered index record which a consistent read should see. 
We assume that the trx id stored in rec is such that the consistent read should not see rec in its present version. -@return DB_SUCCESS or DB_MISSING_HISTORY */ +@return error code +@retval DB_SUCCESS if a previous version was fetched +@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */ dberr_t row_vers_build_for_consistent_read( /*===============================*/ diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h index 6109f7fb358..3711599bd8c 100644 --- a/storage/innobase/include/trx0purge.h +++ b/storage/innobase/include/trx0purge.h @@ -24,8 +24,7 @@ Purge old versions Created 3/26/1996 Heikki Tuuri *******************************************************/ -#ifndef trx0purge_h -#define trx0purge_h +#pragma once #include "trx0sys.h" #include "que0types.h" @@ -123,7 +122,8 @@ public: /** latch protecting view, m_enabled */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch; private: - /** The purge will not remove undo logs which are >= this view */ + /** Read view at the start of a purge batch. Any encountered index records + that are older than view will be removed. */ ReadViewBase view; /** whether purge is enabled; protected by latch and std::atomic */ std::atomic<bool> m_enabled; @@ -133,6 +133,12 @@ private: Atomic_counter<uint32_t> m_SYS_paused; /** number of stop_FTS() calls without resume_FTS() */ Atomic_counter<uint32_t> m_FTS_paused; + + /** latch protecting end_view */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch; + /** Read view at the end of a purge batch (copied from view). Any undo pages + containing records older than end_view may be freed. */ + ReadViewBase end_view; public: que_t* query; /*!< The query graph which will do the parallelized purge operation */ @@ -261,28 +267,56 @@ public: /** check stop_SYS() */ void check_stop_FTS() { if (must_wait_FTS()) wait_FTS(); } - /** A wrapper around ReadView::changes_visible(). */ - bool changes_visible(trx_id_t id, const table_name_t &name) const - { - return view.changes_visible(id, name); - } + /** Determine if the history of a transaction is purgeable. + @param trx_id transaction identifier + @return whether the history is purgeable */ + TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const; + /** A wrapper around ReadView::low_limit_no(). */ trx_id_t low_limit_no() const { - /* Other callers than purge_coordinator_callback() must be holding - purge_sys.latch here. The purge coordinator task may call this - without holding any latch, because it is the only thread that may - modify purge_sys.view. */ + /* This function may only be called by purge_coordinator_callback(). + + The purge coordinator task may call this without holding any latch, + because it is the only thread that may modify purge_sys.view. + + Any other threads that access purge_sys.view must hold purge_sys.latch, + typically via purge_sys_t::view_guard. */ return view.low_limit_no(); } /** A wrapper around trx_sys_t::clone_oldest_view(). */ + template<bool also_end_view= false> void clone_oldest_view() { latch.wr_lock(SRW_LOCK_CALL); trx_sys.clone_oldest_view(&view); + if (also_end_view) + (end_view= view). + clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no); latch.wr_unlock(); } + /** Update end_view at the end of a purge batch. 
*/ + inline void clone_end_view(); + + struct view_guard + { + inline view_guard(); + inline ~view_guard(); + + /** @return purge_sys.view */ + inline const ReadViewBase &view() const; + }; + + struct end_view_guard + { + inline end_view_guard(); + inline ~end_view_guard(); + + /** @return purge_sys.end_view */ + inline const ReadViewBase &view() const; + }; + /** Stop the purge thread and check n_ref_count of all auxiliary and common table associated with the fts table. @param table parent FTS table @@ -294,4 +328,20 @@ public: /** The global data structure coordinating a purge */ extern purge_sys_t purge_sys; -#endif /* trx0purge_h */ +purge_sys_t::view_guard::view_guard() +{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); } + +purge_sys_t::view_guard::~view_guard() +{ purge_sys.latch.rd_unlock(); } + +const ReadViewBase &purge_sys_t::view_guard::view() const +{ return purge_sys.view; } + +purge_sys_t::end_view_guard::end_view_guard() +{ purge_sys.end_latch.rd_lock(); } + +purge_sys_t::end_view_guard::~end_view_guard() +{ purge_sys.end_latch.rd_unlock(); } + +const ReadViewBase &purge_sys_t::end_view_guard::view() const +{ return purge_sys.end_view; } diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h index 708af690e2b..bb348d7ef8b 100644 --- a/storage/innobase/include/trx0rec.h +++ b/storage/innobase/include/trx0rec.h @@ -181,17 +181,17 @@ trx_undo_report_row_operation( is being called purge view and we would like to get the purge record even it is in the purge view (in normal case, it will return without fetching the purge record */ -#define TRX_UNDO_PREV_IN_PURGE 0x1 +static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1; /** This tells trx_undo_prev_version_build() to fetch the old value in the undo log (which is the after image for an update) */ -#define TRX_UNDO_GET_OLD_V_VALUE 0x2 +static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2; + +/** indicate a call from row_vers_old_has_index_entry() */ +static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4; /** Build a previous version of a clustered index record. The caller must hold a latch on the index page of the clustered index record. -@param index_rec clustered index record in the index tree -@param index_mtr mtr which contains the latch to index_rec page - and purge_view @param rec version of a clustered index record @param index clustered index @param offsets rec_get_offsets(rec, index) @@ -210,14 +210,12 @@ must hold a latch on the index page of the clustered index record. @param v_status status determine if it is going into this function by purge thread or not. 
And if we read "after image" of undo log -@retval true if previous version was built, or if it was an insert -or the table has been rebuilt -@retval false if the previous version is earlier than purge_view, -or being purged, which means that it may have been removed */ -bool +@return error code +@retval DB_SUCCESS if previous version was successfully built, +or if it was an insert or the undo record refers to the table before rebuild +@retval DB_MISSING_HISTORY if the history is missing */ +dberr_t trx_undo_prev_version_build( - const rec_t *index_rec, - mtr_t *index_mtr, const rec_t *rec, dict_index_t *index, rec_offs *offsets, diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 57cf7041295..7933e9bc883 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -496,6 +496,13 @@ public: return false; } + /** @return the first undo record that modified the table */ + undo_no_t get_first() const + { + ut_ad(valid()); + return LIMIT & first; + } + /** Add the tuple to the transaction bulk buffer for the given index. @param entry tuple to be inserted @param index bulk insert for the index @@ -610,15 +617,20 @@ struct trx_t : ilist_node<> { private: /** - Count of references. + Least significant 31 bits is count of references. We can't release the locks nor commit the transaction until this reference is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify that it is no longer "active". - */ + If the most significant bit is set this transaction should stop inheriting + (GAP)locks. Generally set to true during transaction prepare for RC or lower + isolation, if requested. Needed for replication replay where + we don't want to get blocked on GAP locks taken for protecting + concurrent unique insert or replace operation. 
+ */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) - Atomic_counter<int32_t> n_ref; + Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref; public: @@ -1023,26 +1035,48 @@ public: void savepoints_discard(trx_named_savept_t *savept); - bool is_referenced() const { return n_ref > 0; } + bool is_referenced() const + { + return (skip_lock_inheritance_and_n_ref & ~(1U << 31)) > 0; + } void reference() { -#ifdef UNIV_DEBUG - auto old_n_ref= -#endif - n_ref++; - ut_ad(old_n_ref >= 0); + ut_d(auto old_n_ref =) + skip_lock_inheritance_and_n_ref.fetch_add(1); + ut_ad(int32_t(old_n_ref << 1) >= 0); } - void release_reference() { -#ifdef UNIV_DEBUG - auto old_n_ref= + ut_d(auto old_n_ref =) + skip_lock_inheritance_and_n_ref.fetch_sub(1); + ut_ad(int32_t(old_n_ref << 1) > 0); + } + + bool is_not_inheriting_locks() const + { + return skip_lock_inheritance_and_n_ref >> 31; + } + + void set_skip_lock_inheritance() + { + ut_d(auto old_n_ref=) skip_lock_inheritance_and_n_ref.fetch_add(1U << 31); + ut_ad(!(old_n_ref >> 31)); + } + + void reset_skip_lock_inheritance() + { +#if defined __GNUC__ && (defined __i386__ || defined __x86_64__) + __asm__("lock btrl $31, %0" : : "m"(skip_lock_inheritance_and_n_ref)); +#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + _interlockedbittestandreset( + reinterpret_cast<volatile long *>(&skip_lock_inheritance_and_n_ref), + 31); +#else + skip_lock_inheritance_and_n_ref.fetch_and(~1U << 31); #endif - n_ref--; - ut_ad(old_n_ref > 0); } /** @return whether the table has lock on @@ -1072,6 +1106,7 @@ public: ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0); ut_ad(!dict_operation); ut_ad(!apply_online_log); + ut_ad(!is_not_inheriting_locks()); } /** This has to be invoked on SAVEPOINT or at the end of a statement. @@ -1126,6 +1161,22 @@ public: return &it->second; } + /** Rollback all bulk insert operations */ + void bulk_rollback() + { + undo_no_t low_limit= UINT64_MAX; + for (auto& t : mod_tables) + { + if (!t.second.is_bulk_insert()) + continue; + if (t.second.get_first() < low_limit) + low_limit= t.second.get_first(); + } + + trx_savept_t bulk_save{low_limit}; + rollback(&bulk_save); + } + /** Do the bulk insert for the buffered insert operation for the transaction. @return DB_SUCCESS or error code */ @@ -1138,7 +1189,10 @@ public: for (auto& t : mod_tables) if (t.second.is_bulk_insert()) if (dberr_t err= t.second.write_bulk(t.first, this)) + { + bulk_rollback(); return err; + } return DB_SUCCESS; } diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h index 9a5f3059826..7b7ed7b8e80 100644 --- a/storage/innobase/include/ut0lst.h +++ b/storage/innobase/include/ut0lst.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -25,8 +25,7 @@ Created 9/10/1995 Heikki Tuuri Rewritten by Sunny Bains Dec 2011. ***********************************************************************/ -#ifndef ut0lst_h -#define ut0lst_h +#pragma once /* Do not include univ.i because univ.i includes this. */ @@ -474,17 +473,17 @@ template <typename List, class Functor> void ut_list_validate(const List& list, Functor& functor) { ut_list_map(list, functor); - +#ifdef UNIV_DEBUG /* Validate the list backwards. 
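The trx_t change above folds the old n_ref counter and a new skip-lock-inheritance flag into one 32-bit atomic word: the low 31 bits count references and bit 31 carries the flag. The sketch below is a rough standalone model of that packing; it uses plain fetch_or/fetch_and for the flag bit instead of the hand-written bit-test instructions in the patch, and ref_and_flag with its members are hypothetical names, not trx_t.

// Standalone sketch: a 31-bit reference count and a one-bit flag packed into
// a single atomic word, modelled on skip_lock_inheritance_and_n_ref.
#include <atomic>
#include <cassert>
#include <cstdint>

class ref_and_flag
{
  static constexpr uint32_t FLAG= 1U << 31;
  std::atomic<uint32_t> word{0};
public:
  bool is_referenced() const
  { return (word.load(std::memory_order_relaxed) & ~FLAG) != 0; }
  void reference() { word.fetch_add(1, std::memory_order_relaxed); }
  void release_reference()
  {
    uint32_t old= word.fetch_sub(1, std::memory_order_relaxed);
    assert((old & ~FLAG) > 0);          // must have been referenced
  }
  bool flag_set() const { return word.load(std::memory_order_relaxed) >> 31; }
  void set_flag()   { word.fetch_or(FLAG, std::memory_order_relaxed); }
  void reset_flag() { word.fetch_and(~FLAG, std::memory_order_relaxed); }
};

int main()
{
  ref_and_flag r;
  r.reference();
  r.set_flag();
  assert(r.is_referenced() && r.flag_set());
  r.release_reference();
  r.reset_flag();
  assert(!r.is_referenced() && !r.flag_set());
}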
*/ - ulint count = 0; + ulint count = list.count; for (typename List::elem_type* elem = list.end; elem != 0; elem = (elem->*list.node).prev) { - ++count; + --count; } - - ut_a(count == list.count); + ut_ad(!count); +#endif } /** Check the consistency of a doubly linked list. @@ -494,23 +493,24 @@ template <typename List, class Functor> inline void ut_list_validate(const List& list, const Functor& functor) { ut_list_map(list, functor); - +#ifdef UNIV_DEBUG /* Validate the list backwards. */ - ulint count = 0; + ulint count = list.count; for (typename List::elem_type* elem = list.end; elem != 0; elem = (elem->*list.node).prev) { - ++count; + --count; } - ut_a(count == list.count); + ut_ad(!count); +#endif } template <typename List> inline void ut_list_validate(const List& list) { - ut_list_validate(list, NullValidate()); + ut_d(ut_list_validate(list, NullValidate())); } #ifdef UNIV_DEBUG @@ -561,8 +561,3 @@ ut_list_move_to_front( ut_list_prepend(list, elem); } } - -#ifdef UNIV_DEBUG -#endif - -#endif /* ut0lst.h */ diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 8490773dc68..448021d840a 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -44,6 +44,8 @@ Created 5/7/1996 Heikki Tuuri #include "row0vers.h" #include "pars0pars.h" #include "srv0mon.h" +#include "que0que.h" +#include "scope.h" #include <set> @@ -1723,6 +1725,12 @@ dberr_t lock_wait(que_thr_t *thr) if (trx->mysql_thd) DEBUG_SYNC_C("lock_wait_start"); + /* Create the sync point for any quit from the function. */ + ut_d(SCOPE_EXIT([trx]() { + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_wait_end"); + })); + /* InnoDB system transactions may use the global value of innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */ const ulong innodb_lock_wait_timeout= trx_lock_wait_timeout_get(trx); @@ -1788,8 +1796,8 @@ dberr_t lock_wait(que_thr_t *thr) wait_lock->un_member.tab_lock.table->id <= DICT_FIELDS_ID); thd_wait_begin(trx->mysql_thd, (type_mode & LOCK_TABLE) ? 
THD_WAIT_TABLE_LOCK : THD_WAIT_ROW_LOCK); - trx->error_state= DB_SUCCESS; + int err= 0; mysql_mutex_lock(&lock_sys.wait_mutex); if (trx->lock.wait_lock) { @@ -1811,25 +1819,24 @@ dberr_t lock_wait(que_thr_t *thr) if (row_lock_wait) lock_sys.wait_start(); - trx->error_state= DB_SUCCESS; - #ifdef HAVE_REPLICATION if (rpl) lock_wait_rpl_report(trx); #endif + if (trx->error_state != DB_SUCCESS) + goto check_trx_error; + while (trx->lock.wait_lock) { - int err; + DEBUG_SYNC_C("lock_wait_before_suspend"); if (no_timeout) - { my_cond_wait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex); - err= 0; - } else err= my_cond_timedwait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex, &abstime); +check_trx_error: switch (trx->error_state) { case DB_DEADLOCK: case DB_INTERRUPTED: @@ -1875,17 +1882,19 @@ end_wait: /** Resume a lock wait */ -static void lock_wait_end(trx_t *trx) +template <bool from_deadlock= false> +void lock_wait_end(trx_t *trx) { mysql_mutex_assert_owner(&lock_sys.wait_mutex); ut_ad(trx->mutex_is_owner()); ut_d(const auto state= trx->state); - ut_ad(state == TRX_STATE_ACTIVE || state == TRX_STATE_PREPARED); - ut_ad(trx->lock.wait_thr); + ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE || + state == TRX_STATE_PREPARED); + ut_ad(from_deadlock || trx->lock.wait_thr); if (trx->lock.was_chosen_as_deadlock_victim) { - ut_ad(state == TRX_STATE_ACTIVE); + ut_ad(from_deadlock || state == TRX_STATE_ACTIVE); trx->error_state= DB_DEADLOCK; } @@ -2111,49 +2120,58 @@ lock_rec_reset_and_release_wait(const hash_cell_t &cell, const page_id_t id, } } -/*************************************************************//** -Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type) +/** Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type) of another record as gap type locks, but does not reset the lock bits of the other record. Also waiting lock requests on rec are inherited as -GRANTED gap locks. */ -static -void -lock_rec_inherit_to_gap( -/*====================*/ - hash_cell_t& heir_cell, /*!< heir hash table cell */ - const page_id_t heir, /*!< in: page containing the - record which inherits */ - const hash_cell_t& donor_cell, /*!< donor hash table cell */ - const page_id_t donor, /*!< in: page containing the - record from which inherited; - does NOT reset the locks on - this record */ - const page_t* heir_page, /*!< in: heir page frame */ - ulint heir_heap_no, /*!< in: heap_no of the - inheriting record */ - ulint heap_no) /*!< in: heap_no of the - donating record */ +GRANTED gap locks. +@param heir_cell heir hash table cell +@param heir page containing the record which inherits +@param donor_cell donor hash table cell +@param donor page containing the record from which inherited; does NOT + reset the locks on this record +@param heir_page heir page frame +@param heir_heap_no heap_no of the inheriting record +@param heap_no heap_no of the donating record +@tparam from_split true if the function is invoked from + lock_update_split_(left|right)(), in this case not-gap + locks are not inherited to supremum if transaction + isolation level less or equal to READ COMMITTED */ +template <bool from_split= false> +static void +lock_rec_inherit_to_gap(hash_cell_t &heir_cell, const page_id_t heir, + const hash_cell_t &donor_cell, const page_id_t donor, + const page_t *heir_page, ulint heir_heap_no, + ulint heap_no) { - /* At READ UNCOMMITTED or READ COMMITTED isolation level, - we do not want locks set - by an UPDATE or a DELETE to be inherited as gap type locks. 
But we - DO want S-locks/X-locks(taken for replace) set by a consistency - constraint to be inherited also then. */ + ut_ad(!from_split || heir_heap_no == PAGE_HEAP_NO_SUPREMUM); - for (lock_t* lock= lock_sys_t::get_first(donor_cell, donor, heap_no); - lock; - lock = lock_rec_get_next(heap_no, lock)) { - trx_t* lock_trx = lock->trx; - if (!lock->is_insert_intention() - && (lock_trx->isolation_level > TRX_ISO_READ_COMMITTED - || lock->mode() != - (lock_trx->duplicates ? LOCK_S : LOCK_X))) { - lock_rec_add_to_queue(LOCK_GAP | lock->mode(), - heir_cell, heir, heir_page, - heir_heap_no, - lock->index, lock_trx, false); - } - } + /* At READ UNCOMMITTED or READ COMMITTED isolation level, + we do not want locks set + by an UPDATE or a DELETE to be inherited as gap type locks. But we + DO want S-locks/X-locks(taken for replace) set by a consistency + constraint to be inherited also then. */ + + for (lock_t *lock= lock_sys_t::get_first(donor_cell, donor, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + { + trx_t *lock_trx= lock->trx; + if (!lock->trx->is_not_inheriting_locks() && + !lock->is_insert_intention() && + (lock_trx->isolation_level > TRX_ISO_READ_COMMITTED || + /* When we are in a page split (not purge), then we don't set a lock + on supremum if the donor lock type is LOCK_REC_NOT_GAP. That is, do + not create bogus gap locks for non-gap locks for READ UNCOMMITTED and + READ COMMITTED isolation levels. LOCK_ORDINARY and + LOCK_GAP require a gap before the record to be locked, that is why + setting a lock on the supremum is necessary. */ + ((!from_split || !lock->is_record_not_gap()) && + lock->mode() != (lock_trx->duplicates ? LOCK_S : LOCK_X)))) + { + lock_rec_add_to_queue(LOCK_GAP | lock->mode(), heir_cell, heir, + heir_page, heir_heap_no, lock->index, lock_trx, + false); + } + } } /*************************************************************//** @@ -2177,7 +2195,8 @@ lock_rec_inherit_to_gap_if_gap_lock( for (lock_t *lock= lock_sys_t::get_first(g.cell(), id, heap_no); lock; lock= lock_rec_get_next(heap_no, lock)) - if (!lock->is_insert_intention() && (heap_no == PAGE_HEAP_NO_SUPREMUM || + if (!lock->trx->is_not_inheriting_locks() && + !lock->is_insert_intention() && (heap_no == PAGE_HEAP_NO_SUPREMUM || !lock->is_record_not_gap()) && !lock_table_has(lock->trx, lock->index->table, LOCK_X)) lock_rec_add_to_queue(LOCK_GAP | lock->mode(), @@ -2794,8 +2813,9 @@ lock_update_split_right( /* Inherit the locks to the supremum of left page from the successor of the infimum on right page */ - lock_rec_inherit_to_gap(g.cell1(), l, g.cell2(), r, left_block->page.frame, - PAGE_HEAP_NO_SUPREMUM, h); + lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r, + left_block->page.frame, PAGE_HEAP_NO_SUPREMUM, + h); } void lock_update_node_pointer(const buf_block_t *left_block, @@ -2910,8 +2930,9 @@ lock_update_split_left( LockMultiGuard g{lock_sys.rec_hash, l, r}; /* Inherit the locks to the supremum of the left page from the successor of the infimum on the right page */ - lock_rec_inherit_to_gap(g.cell1(), l, g.cell2(), r, left_block->page.frame, - PAGE_HEAP_NO_SUPREMUM, h); + lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r, + left_block->page.frame, PAGE_HEAP_NO_SUPREMUM, + h); } /** Update the lock table when a page is merged to the left.
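Restating the inheritance rule from the lock_rec_inherit_to_gap() rewrite above as a single predicate may make it easier to follow: a donor lock becomes a GAP lock on the heir record only if its transaction still inherits locks, it is not an insert-intention lock, and either the isolation level is above READ COMMITTED or the RC-specific exceptions do not apply. The sketch below is a standalone restatement under those assumptions; FakeLock and Mode are hypothetical stand-ins, not the real lock_t/trx_t.

// Simplified predicate: should this donor lock be inherited as a GAP lock?
#include <cstdint>

enum class Mode { S, X };

struct FakeLock
{
  Mode mode;
  bool insert_intention;          // LOCK_INSERT_INTENTION
  bool record_not_gap;            // LOCK_REC_NOT_GAP
  bool trx_skips_inheritance;     // trx->is_not_inheriting_locks()
  int  trx_isolation;             // 0=RU, 1=RC, 2=RR, 3=SERIALIZABLE
  bool trx_duplicates;            // REPLACE / INSERT ... ON DUPLICATE KEY
};

// from_split is true when invoked from a page split rather than from purge.
bool inherits_as_gap_lock(const FakeLock &l, bool from_split)
{
  if (l.trx_skips_inheritance || l.insert_intention)
    return false;
  if (l.trx_isolation > 1)        // above READ COMMITTED: always inherit
    return true;
  // At RU/RC, skip not-gap locks during a split, and skip the lock mode
  // that UPDATE/DELETE (or REPLACE) takes on the current row.
  return (!from_split || !l.record_not_gap) &&
         l.mode != (l.trx_duplicates ? Mode::S : Mode::X);
}

Here from_split corresponds to the new template parameter: during a page split at RU/RC, LOCK_REC_NOT_GAP locks are not turned into bogus gap locks on the supremum.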
@@ -4049,8 +4070,14 @@ static bool lock_release_on_prepare_try(trx_t *trx) if (!lock->is_table()) { ut_ad(!lock->index->table->is_temporary()); - if (lock->mode() == LOCK_X && !lock->is_gap()) + if (lock->mode() == LOCK_X && !lock->is_gap()) { + ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED || + /* Insert-intention lock is valid for supremum for isolation + level > TRX_ISO_READ_COMMITTED */ + lock->mode() == LOCK_X || + !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM)); continue; + } auto &lock_hash= lock_sys.hash_get(lock->type_mode); auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold()); auto latch= lock_sys_t::hash_table::latch(cell); @@ -4096,6 +4123,8 @@ static bool lock_release_on_prepare_try(trx_t *trx) and release possible other transactions waiting because of these locks. */ void lock_release_on_prepare(trx_t *trx) { + auto _ = make_scope_exit([trx]() { trx->set_skip_lock_inheritance(); }); + for (ulint count= 5; count--; ) if (lock_release_on_prepare_try(trx)) return; @@ -4113,6 +4142,12 @@ void lock_release_on_prepare(trx_t *trx) ut_ad(!lock->index->table->is_temporary()); if (lock->mode() != LOCK_X || lock->is_gap()) lock_rec_dequeue_from_page(lock, false); + else + ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED || + /* Insert-intention lock is valid for supremum for isolation + level > TRX_ISO_READ_COMMITTED */ + lock->mode() == LOCK_X || + !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM)); } else { @@ -5672,13 +5707,16 @@ static void lock_release_autoinc_locks(trx_t *trx) } /** Cancel a waiting lock request and release possibly waiting transactions */ -static void lock_cancel_waiting_and_release(lock_t *lock) +template <bool from_deadlock= false> +void lock_cancel_waiting_and_release(lock_t *lock) { lock_sys.assert_locked(*lock); mysql_mutex_assert_owner(&lock_sys.wait_mutex); trx_t *trx= lock->trx; trx->mutex_lock(); - ut_ad(trx->state == TRX_STATE_ACTIVE); + ut_d(const auto trx_state= trx->state); + ut_ad(trx_state == TRX_STATE_COMMITTED_IN_MEMORY || + trx_state == TRX_STATE_ACTIVE); if (!lock->is_table()) lock_rec_dequeue_from_page(lock, true); @@ -5697,7 +5735,8 @@ static void lock_cancel_waiting_and_release(lock_t *lock) /* Reset the wait flag and the back pointer to lock in trx. */ lock_reset_lock_and_trx_wait(lock); - lock_wait_end(trx); + lock_wait_end<from_deadlock>(trx); + trx->mutex_unlock(); } @@ -5868,6 +5907,7 @@ lock_unlock_table_autoinc( /** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read while holding a clustered index leaf page latch. + @param trx transaction that is or was waiting for a lock @retval DB_SUCCESS if the lock was granted @retval DB_DEADLOCK if the transaction must be aborted due to a deadlock @@ -5878,8 +5918,13 @@ dberr_t lock_trx_handle_wait(trx_t *trx) DEBUG_SYNC_C("lock_trx_handle_wait_enter"); if (trx->lock.was_chosen_as_deadlock_victim) return DB_DEADLOCK; + DEBUG_SYNC_C("lock_trx_handle_wait_before_unlocked_wait_lock_check"); + /* trx->lock.was_chosen_as_deadlock_victim must always be set before + trx->lock.wait_lock if the transaction was chosen as deadlock victim, + the function must not return DB_SUCCESS if + trx->lock.was_chosen_as_deadlock_victim is set. */ if (!trx->lock.wait_lock) - return DB_SUCCESS; + return trx->lock.was_chosen_as_deadlock_victim ? 
DB_DEADLOCK : DB_SUCCESS; dberr_t err= DB_SUCCESS; mysql_mutex_lock(&lock_sys.wait_mutex); if (trx->lock.was_chosen_as_deadlock_victim) @@ -6282,8 +6327,11 @@ namespace Deadlock ut_ad(victim->state == TRX_STATE_ACTIVE); + /* victim->lock.was_chosen_as_deadlock_victim must always be set before + releasing waiting locks and resetting trx->lock.wait_lock */ victim->lock.was_chosen_as_deadlock_victim= true; - lock_cancel_waiting_and_release(victim->lock.wait_lock); + DEBUG_SYNC_C("deadlock_report_before_lock_releasing"); + lock_cancel_waiting_and_release<true>(victim->lock.wait_lock); #ifdef WITH_WSREP if (victim->is_wsrep() && wsrep_thd_is_SR(victim->mysql_thd)) wsrep_handle_SR_rollback(trx->mysql_thd, victim->mysql_thd); diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc index 3ea5c15bccc..80c34af2790 100644 --- a/storage/innobase/que/que0que.cc +++ b/storage/innobase/que/que0que.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -566,6 +566,27 @@ que_node_type_string( } #endif /* DBUG_TRACE */ + +/**********************************************************************//** +Performs an execution step of an open or close cursor statement node. +@param thr query thread */ +static void open_step(que_thr_t *thr) +{ + open_node_t *node= static_cast<open_node_t*>(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_OPEN); + sel_node_t *sel_node= node->cursor_def; + + if (node->op_type == ROW_SEL_OPEN_CURSOR) + sel_node->state= SEL_NODE_OPEN; + else + { + ut_ad(sel_node->state != SEL_NODE_CLOSED); + sel_node->state= SEL_NODE_CLOSED; + } + + thr->run_node= que_node_get_parent(node); +} + /**********************************************************************//** Performs an execution step on a query thread.
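The comments added around the deadlock path above describe an ordering contract: was_chosen_as_deadlock_victim must be published before wait_lock is cleared, and a waiter that observes wait_lock == nullptr must still re-check the victim flag before reporting success. The following is a simplified standalone illustration of why that order matters, using std::atomic; the real code serializes these steps under lock_sys.wait_mutex rather than the fences shown here, and waiting_trx, choose_victim and handle_wait are hypothetical names.

// Standalone model of the publication order in the deadlock path.
#include <atomic>

static int dummy_lock;   // stands in for the waiting lock object

struct waiting_trx
{
  std::atomic<bool> was_chosen_as_deadlock_victim{false};
  std::atomic<int*> wait_lock{&dummy_lock};
};

// Resolver thread: the flag is published first, then the wait is cancelled.
void choose_victim(waiting_trx &trx)
{
  trx.was_chosen_as_deadlock_victim.store(true, std::memory_order_release);
  trx.wait_lock.store(nullptr, std::memory_order_release);
}

// Waiting thread: returns true if it must abort with DB_DEADLOCK.
bool handle_wait(const waiting_trx &trx)
{
  if (trx.wait_lock.load(std::memory_order_acquire))
    return false;   // still waiting; caller keeps waiting
  // wait_lock was cleared: the victim flag must be consulted now, otherwise
  // a deadlock abort could be mistaken for a granted lock.
  return trx.was_chosen_as_deadlock_victim.load(std::memory_order_acquire);
}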
@return query thread to run next: it may differ from the input @@ -636,7 +657,7 @@ que_thr_step( } else if (type == QUE_NODE_FETCH) { thr = fetch_step(thr); } else if (type == QUE_NODE_OPEN) { - thr = open_step(thr); + open_step(thr); } else if (type == QUE_NODE_FUNC) { proc_eval_step(thr); diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index eca894c4fd4..ff190bd0779 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -3842,9 +3842,8 @@ UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index, ut_ad(len == DATA_ROLL_PTR_LEN); if (is_same(roll_ptr)) return version; - trx_undo_prev_version_build(*clust_rec, &mtr, version, index, - *offsets, heap, &prev_version, nullptr, - nullptr, 0); + trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version, + nullptr, nullptr, 0); version= prev_version; } while (version); @@ -4014,9 +4013,8 @@ void UndorecApplier::log_update(const dtuple_t &tuple, if (match_rec == rec) copy_rec= rec_copy(mem_heap_alloc( heap, rec_offs_size(offsets)), match_rec, offsets); - trx_undo_prev_version_build(rec, &mtr, match_rec, clust_index, - offsets, heap, &prev_version, nullptr, - nullptr, 0); + trx_undo_prev_version_build(match_rec, clust_index, offsets, heap, + &prev_version, nullptr, nullptr, 0); prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets, clust_index->n_core_fields, diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 4a9d75e9adb..ec13e73b9ed 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -2294,8 +2294,14 @@ end_of_index: ut_ad(trx->read_view.is_open()); ut_ad(rec_trx_id != trx->id); - if (!trx->read_view.changes_visible( - rec_trx_id, old_table->name)) { + if (!trx->read_view.changes_visible(rec_trx_id)) { + if (rec_trx_id + >= trx->read_view.low_limit_id() + && rec_trx_id + >= trx_sys.get_max_trx_id()) { + goto corrupted_rec; + } + rec_t* old_vers; row_vers_build_for_consistent_read( @@ -4617,9 +4623,7 @@ row_merge_is_index_usable( && (index->table->is_temporary() || index->table->no_rollback() || index->trx_id == 0 || !trx->read_view.is_open() - || trx->read_view.changes_visible( - index->trx_id, - index->table->name))); + || trx->read_view.changes_visible(index->trx_id))); } /** Build indexes on a table by reading a clustered index, creating a temporary diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 66d937f0c27..de469c5b088 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -1766,7 +1766,7 @@ error: /** This can only be used when the current transaction is at READ COMMITTED or READ UNCOMMITTED isolation level. -Before calling this function row_search_for_mysql() must have +Before calling this function row_search_mvcc() must have initialized prebuilt->new_rec_locks to store the information which new record locks really were set. This function removes a newly set clustered index record lock under prebuilt->pcur or @@ -2937,183 +2937,3 @@ funct_exit: return(err); } - -/*********************************************************************//** -Scans an index for either COUNT(*) or CHECK TABLE. -If CHECK TABLE; Checks that the index contains entries in an ascending order, -unique constraint is not broken, and calculates the number of index entries -in the read view of the current transaction. 
-@return DB_SUCCESS or other error */ -dberr_t -row_scan_index_for_mysql( -/*=====================*/ - row_prebuilt_t* prebuilt, /*!< in: prebuilt struct - in MySQL handle */ - const dict_index_t* index, /*!< in: index */ - ulint* n_rows) /*!< out: number of entries - seen in the consistent read */ -{ - dtuple_t* prev_entry = NULL; - ulint matched_fields; - byte* buf; - dberr_t ret; - rec_t* rec; - int cmp; - ibool contains_null; - ulint i; - ulint cnt; - mem_heap_t* heap = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets; - rec_offs_init(offsets_); - - *n_rows = 0; - - /* Don't support RTree Leaf level scan */ - ut_ad(!dict_index_is_spatial(index)); - - if (dict_index_is_clust(index)) { - /* The clustered index of a table is always available. - During online ALTER TABLE that rebuilds the table, the - clustered index in the old table will have - index->online_log pointing to the new table. All - indexes of the old table will remain valid and the new - table will be unaccessible to MySQL until the - completion of the ALTER TABLE. */ - } else if (dict_index_is_online_ddl(index) - || (index->type & DICT_FTS)) { - /* Full Text index are implemented by auxiliary tables, - not the B-tree. We also skip secondary indexes that are - being created online. */ - return(DB_SUCCESS); - } - - ulint bufsize = std::max<ulint>(srv_page_size, - prebuilt->mysql_row_len); - buf = static_cast<byte*>(ut_malloc_nokey(bufsize)); - heap = mem_heap_create(100); - - cnt = 1000; - - ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0); -loop: - /* Check thd->killed every 1,000 scanned rows */ - if (--cnt == 0) { - if (trx_is_interrupted(prebuilt->trx)) { - ret = DB_INTERRUPTED; - goto func_exit; - } - cnt = 1000; - } - - switch (ret) { - case DB_SUCCESS: - break; - case DB_DEADLOCK: - case DB_LOCK_TABLE_FULL: - case DB_LOCK_WAIT_TIMEOUT: - case DB_INTERRUPTED: - goto func_exit; - default: - ib::warn() << "CHECK TABLE on index " << index->name << " of" - " table " << index->table->name << " returned " << ret; - /* (this error is ignored by CHECK TABLE) */ - /* fall through */ - case DB_END_OF_INDEX: - ret = DB_SUCCESS; -func_exit: - ut_free(buf); - mem_heap_free(heap); - - return(ret); - } - - *n_rows = *n_rows + 1; - - /* else this code is doing handler::check() for CHECK TABLE */ - - /* row_search... 
returns the index record in buf, record origin offset - within buf stored in the first 4 bytes, because we have built a dummy - template */ - - rec = buf + mach_read_from_4(buf); - - offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields, - ULINT_UNDEFINED, &heap); - - if (prev_entry != NULL) { - matched_fields = 0; - - cmp = cmp_dtuple_rec_with_match(prev_entry, - rec, index, offsets, - &matched_fields); - contains_null = FALSE; - - /* In a unique secondary index we allow equal key values if - they contain SQL NULLs */ - - for (i = 0; - i < dict_index_get_n_ordering_defined_by_user(index); - i++) { - if (UNIV_SQL_NULL == dfield_get_len( - dtuple_get_nth_field(prev_entry, i))) { - - contains_null = TRUE; - break; - } - } - - const char* msg; - - if (cmp > 0) { - ret = DB_INDEX_CORRUPT; - msg = "index records in a wrong order in "; -not_ok: - ib::error() - << msg << index->name - << " of table " << index->table->name - << ": " << *prev_entry << ", " - << rec_offsets_print(rec, offsets); - /* Continue reading */ - } else if (dict_index_is_unique(index) - && !contains_null - && matched_fields - >= dict_index_get_n_ordering_defined_by_user( - index)) { - ret = DB_DUPLICATE_KEY; - msg = "duplicate key in "; - goto not_ok; - } - } - - { - mem_heap_t* tmp_heap = NULL; - - /* Empty the heap on each round. But preserve offsets[] - for the row_rec_to_index_entry() call, by copying them - into a separate memory heap when needed. */ - if (UNIV_UNLIKELY(offsets != offsets_)) { - ulint size = rec_offs_get_n_alloc(offsets) - * sizeof *offsets; - - tmp_heap = mem_heap_create(size); - - offsets = static_cast<rec_offs*>( - mem_heap_dup(tmp_heap, offsets, size)); - } - - mem_heap_empty(heap); - - prev_entry = row_rec_to_index_entry( - rec, index, offsets, heap); - - if (UNIV_LIKELY_NULL(tmp_heap)) { - mem_heap_free(tmp_heap); - } - } - - ret = row_search_for_mysql( - buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT); - - goto loop; -} diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index f60403ec4eb..cf1aaedf240 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -36,6 +36,8 @@ Created 12/19/1997 Heikki Tuuri #include "dict0boot.h" #include "trx0undo.h" #include "trx0trx.h" +#include "trx0purge.h" +#include "trx0rec.h" #include "btr0btr.h" #include "btr0cur.h" #include "btr0sea.h" @@ -54,6 +56,7 @@ Created 12/19/1997 Heikki Tuuri #include "buf0lru.h" #include "srv0srv.h" #include "srv0mon.h" +#include "sql_error.h" #ifdef WITH_WSREP #include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */ #endif @@ -282,7 +285,6 @@ row_sel_sec_rec_is_for_clust_rec( rec_offs_init(clust_offsets_); rec_offs_init(sec_offsets_); - ib_vcol_row vc(heap); clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs, @@ -951,9 +953,12 @@ row_sel_test_other_conds( @param index clustered index @param offsets rec_get_offsets(rec, index) @param view consistent read view -@return whether rec is visible in view */ -static bool row_sel_clust_sees(const rec_t *rec, const dict_index_t &index, - const rec_offs *offsets, const ReadView &view) +@retval DB_SUCCESS if rec is visible in view +@retval DB_SUCCESS_LOCKED_REC if rec is not visible in view +@retval DB_CORRUPTION if the DB_TRX_ID is corrupted */ +static dberr_t row_sel_clust_sees(const rec_t *rec, const dict_index_t &index, + const rec_offs *offsets, + const ReadView &view) { ut_ad(index.is_primary()); ut_ad(page_rec_is_user_rec(rec)); @@ -961,8 +966,16 @@ static bool row_sel_clust_sees(const rec_t *rec, 
const dict_index_t &index, ut_ad(!rec_is_metadata(rec, index)); ut_ad(!index.table->is_temporary()); - return view.changes_visible(row_get_rec_trx_id(rec, &index, offsets), - index.table->name); + const trx_id_t id= row_get_rec_trx_id(rec, &index, offsets); + + if (view.changes_visible(id)) + return DB_SUCCESS; + if (UNIV_LIKELY(id < view.low_limit_id() || id < trx_sys.get_max_trx_id())) + return DB_SUCCESS_LOCKED_REC; + + ib::warn() << "A transaction id in a record of table " << index.table->name + << " is newer than the system-wide maximum."; + return DB_CORRUPTION; } /*********************************************************************//** @@ -1073,9 +1086,15 @@ row_sel_get_clust_rec( old_vers = NULL; - if (!row_sel_clust_sees(clust_rec, *index, offsets, - *node->read_view)) { + err = row_sel_clust_sees(clust_rec, *index, offsets, + *node->read_view); + switch (err) { + default: + goto err_exit; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: err = row_sel_build_prev_vers( node->read_view, index, clust_rec, &offsets, &heap, &plan->old_vers_heap, @@ -1593,8 +1612,8 @@ row_sel_try_search_shortcut( ULINT_UNDEFINED, &heap); if (dict_index_is_clust(index)) { - if (!row_sel_clust_sees(rec, *index, offsets, - *node->read_view)) { + if (row_sel_clust_sees(rec, *index, offsets, *node->read_view) + != DB_SUCCESS) { return SEL_RETRY; } } else if (!srv_read_only_mode) { @@ -1961,9 +1980,16 @@ skip_lock: a previous version of the record */ if (dict_index_is_clust(index)) { - if (!node->read_view->changes_visible( - row_get_rec_trx_id(rec, index, offsets), - index->table->name)) { + const trx_id_t id = row_get_rec_trx_id( + rec, index, offsets); + + if (!node->read_view->changes_visible(id)) { + if (id >= node->read_view->low_limit_id() + && id >= trx_sys.get_max_trx_id()) { + err = DB_CORRUPTION; + goto lock_wait_or_error; + } + err = row_sel_build_prev_vers( node->read_view, index, rec, &offsets, &heap, &plan->old_vers_heap, @@ -3229,6 +3255,14 @@ static bool row_sel_store_mysql_rec( DBUG_RETURN(true); } +static void row_sel_reset_old_vers_heap(row_prebuilt_t *prebuilt) +{ + if (prebuilt->old_vers_heap) + mem_heap_empty(prebuilt->old_vers_heap); + else + prebuilt->old_vers_heap= mem_heap_create(200); +} + /*********************************************************************//** Builds a previous version of a clustered index record for a consistent read @return DB_SUCCESS or error code */ @@ -3236,9 +3270,8 @@ static MY_ATTRIBUTE((warn_unused_result)) dberr_t row_sel_build_prev_vers_for_mysql( /*==============================*/ - ReadView* read_view, /*!< in: read view */ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */ dict_index_t* clust_index, /*!< in: clustered index */ - row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */ const rec_t* rec, /*!< in: record in a clustered index */ rec_offs** offsets, /*!< in/out: offsets returned by rec_get_offsets(rec, clust_index) */ @@ -3252,18 +3285,12 @@ row_sel_build_prev_vers_for_mysql( column data */ mtr_t* mtr) /*!< in: mtr */ { - dberr_t err; - - if (prebuilt->old_vers_heap) { - mem_heap_empty(prebuilt->old_vers_heap); - } else { - prebuilt->old_vers_heap = mem_heap_create(200); - } + row_sel_reset_old_vers_heap(prebuilt); - err = row_vers_build_for_consistent_read( - rec, mtr, clust_index, offsets, read_view, offset_heap, + return row_vers_build_for_consistent_read( + rec, mtr, clust_index, offsets, + &prebuilt->trx->read_view, offset_heap, prebuilt->old_vers_heap, old_vers, vrow); - return(err); } /** Helper class to cache 
clust_rec and old_vers */ @@ -3340,7 +3367,6 @@ Row_sel_get_clust_rec_for_mysql::operator()( access the clustered index */ { dict_index_t* clust_index; - const rec_t* clust_rec; rec_t* old_vers; trx_t* trx; @@ -3363,7 +3389,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( return err; } - clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + const rec_t* clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); prebuilt->clust_pcur->trx_if_known = trx; @@ -3391,8 +3417,6 @@ Row_sel_get_clust_rec_for_mysql::operator()( if (!rtr_info->matches->valid) { mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); clust_rec = NULL; - - err = DB_SUCCESS; goto func_exit; } mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); @@ -3402,15 +3426,11 @@ Row_sel_get_clust_rec_for_mysql::operator()( && prebuilt->select_lock_type == LOCK_NONE) { clust_rec = NULL; - - err = DB_SUCCESS; goto func_exit; } if (rec != btr_pcur_get_rec(prebuilt->pcur)) { clust_rec = NULL; - - err = DB_SUCCESS; goto func_exit; } @@ -3436,18 +3456,14 @@ Row_sel_get_clust_rec_for_mysql::operator()( nullptr)); ut_ad(low_match < dtuple_get_n_fields_cmp(tuple)); mem_heap_free(heap); - clust_rec = NULL; - err = DB_SUCCESS; - goto func_exit; #endif /* UNIV_DEBUG */ } else if (!rec_get_deleted_flag(rec, dict_table_is_comp(sec_index->table)) || prebuilt->select_lock_type != LOCK_NONE) { /* In a rare case it is possible that no clust rec is found for a delete-marked secondary index - record: if in row0umod.cc in - row_undo_mod_remove_clust_low() we have already removed + record: if row_undo_mod_clust() has already removed the clust rec, while purge is still cleaning and removing secondary index records associated with earlier versions of the clustered index record. @@ -3463,11 +3479,8 @@ Row_sel_get_clust_rec_for_mysql::operator()( "InnoDB: clust index record ", stderr); rec_print(stderr, clust_rec, clust_index); err = DB_CORRUPTION; - clust_rec = NULL; - goto func_exit; } - err = DB_SUCCESS; clust_rec = NULL; goto func_exit; } @@ -3503,11 +3516,20 @@ Row_sel_get_clust_rec_for_mysql::operator()( if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED || clust_index->table->is_temporary()) { + } else { /* If the isolation level allows reading of uncommitted data, then we never look for an earlier version */ - } else if (!row_sel_clust_sees(clust_rec, *clust_index, - *offsets, trx->read_view)) { + err = row_sel_clust_sees(clust_rec, *clust_index, + *offsets, trx->read_view); + } + + switch (err) { + default: + return err; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: const buf_page_t& bpage = btr_pcur_get_block( prebuilt->clust_pcur)->page; @@ -3520,7 +3542,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( /* The following call returns 'offsets' associated with 'old_vers' */ err = row_sel_build_prev_vers_for_mysql( - &trx->read_view, clust_index, prebuilt, + prebuilt, clust_index, clust_rec, offsets, offset_heap, &old_vers, vrow, mtr); @@ -3977,7 +3999,8 @@ row_sel_try_search_shortcut_for_mysql( *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, ULINT_UNDEFINED, heap); - if (!row_sel_clust_sees(rec, *index, *offsets, trx->read_view)) { + if (row_sel_clust_sees(rec, *index, *offsets, trx->read_view) + != DB_SUCCESS) { return SEL_RETRY; } @@ -4375,8 +4398,8 @@ row_search_mvcc( /* We need to get the virtual column values stored in secondary index key, if this is covered index scan or virtual key read is requested. 
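Several hunks in this file replace the old boolean visibility test with a three-way outcome: the record version is visible, an older version must be built from the undo log, or the stored DB_TRX_ID is implausibly large and is treated as corruption. A standalone sketch of that decision follows; FakeReadView is a simplified stand-in for the InnoDB read view and its visibility rule, not the real class.

// Three-way visibility check, modelled on the patched row_sel_clust_sees().
#include <cstdint>

enum class Visibility { VISIBLE, NEED_OLDER_VERSION, CORRUPT };

struct FakeReadView
{
  uint64_t up_limit_id;    // ids below this are visible (simplification)
  uint64_t low_limit_id;   // ids at or above this were not yet committed
  bool changes_visible(uint64_t id) const { return id < up_limit_id; }
};

Visibility check_rec_trx_id(uint64_t rec_trx_id, const FakeReadView &view,
                            uint64_t max_trx_id /* trx_sys.get_max_trx_id() */)
{
  if (view.changes_visible(rec_trx_id))
    return Visibility::VISIBLE;
  // A DB_TRX_ID that is not visible must still be a plausible identifier:
  // anything at or above the system-wide maximum can only be corruption.
  if (rec_trx_id >= view.low_limit_id && rec_trx_id >= max_trx_id)
    return Visibility::CORRUPT;
  return Visibility::NEED_OLDER_VERSION;   // build an older version from undo
}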
*/ - bool need_vrow = dict_index_has_virtual(prebuilt->index) - && prebuilt->read_just_key; + bool need_vrow = prebuilt->read_just_key + && prebuilt->index->has_virtual(); /* Reset the new record lock info if READ UNCOMMITTED or READ COMMITED isolation level is used. Then @@ -4880,11 +4903,6 @@ rec_loop: rec = btr_pcur_get_rec(pcur); - if (!index->table->is_readable()) { - err = DB_DECRYPTION_FAILED; - goto page_read_error; - } - ut_ad(!!page_rec_is_comp(rec) == comp); ut_ad(page_rec_is_leaf(rec)); @@ -5252,6 +5270,8 @@ no_gap_lock: switch (err) { case DB_SUCCESS: + ut_ad( + !trx->lock.was_chosen_as_deadlock_victim); /* The lock was granted while we were searching for the last committed version. Do a normal locking read. */ @@ -5324,18 +5344,24 @@ no_gap_lock: high force recovery level set, we try to avoid crashes by skipping this lookup */ - if (!row_sel_clust_sees(rec, *index, offsets, - trx->read_view)) { + err = row_sel_clust_sees(rec, *index, offsets, + trx->read_view); + + switch (err) { + default: + goto lock_wait_or_error; + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); rec_t* old_vers; /* The following call returns 'offsets' associated with 'old_vers' */ err = row_sel_build_prev_vers_for_mysql( - &trx->read_view, clust_index, - prebuilt, rec, &offsets, &heap, - &old_vers, need_vrow ? &vrow : NULL, - &mtr); + prebuilt, clust_index, + rec, &offsets, &heap, &old_vers, + need_vrow ? &vrow : nullptr, &mtr); if (err != DB_SUCCESS) { @@ -5476,8 +5502,7 @@ requires_clust_rec: &offsets, &heap, need_vrow ? &vrow : NULL, &mtr); - if (prebuilt->skip_locked && - err == DB_LOCK_WAIT) { + if (err == DB_LOCK_WAIT && prebuilt->skip_locked) { err = lock_trx_handle_wait(trx); } switch (err) { @@ -5486,7 +5511,6 @@ requires_clust_rec: /* The record did not exist in the read view */ ut_ad(prebuilt->select_lock_type == LOCK_NONE || dict_index_is_spatial(index)); - goto next_rec; } break; @@ -5581,9 +5605,7 @@ use_covering_index: && !prebuilt->templ_contains_blob && !prebuilt->clust_index_was_generated && !prebuilt->used_in_HANDLER - && prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE && !prebuilt->in_fts_query) { - /* Inside an update, for example, we do not cache rows, since we may use the cursor position to do the actual update, that is why we require ...lock_type == LOCK_NONE. @@ -5648,29 +5670,8 @@ use_covering_index: if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) { goto next_rec; } - } else { - if (UNIV_UNLIKELY - (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) { - /* CHECK TABLE: fetch the row */ - - if (result_rec != rec - && !prebuilt->need_to_access_clustered) { - /* We used 'offsets' for the clust - rec, recalculate them for 'rec' */ - offsets = rec_get_offsets(rec, index, offsets, - index->n_core_fields, - ULINT_UNDEFINED, - &heap); - result_rec = rec; - } - - memcpy(buf + 4, result_rec - - rec_offs_extra_size(offsets), - rec_offs_size(offsets)); - mach_write_to_4(buf, - rec_offs_extra_size(offsets) + 4); - } else if (!prebuilt->pk_filter && !prebuilt->idx_cond) { + if (!prebuilt->pk_filter && !prebuilt->idx_cond) { /* The record was not yet converted to MySQL format. 
*/ if (!row_sel_store_mysql_rec( buf, prebuilt, result_rec, vrow, @@ -6026,18 +6027,11 @@ row_count_rtree_recs( prebuilt->mysql_row_len); buf = static_cast<byte*>(ut_malloc_nokey(bufsize)); - ulint cnt = 1000; + ulint direction = 0; - ret = row_search_for_mysql(buf, PAGE_CUR_WITHIN, prebuilt, 0, 0); loop: - /* Check thd->killed every 1,000 scanned rows */ - if (--cnt == 0) { - if (trx_is_interrupted(prebuilt->trx)) { - ret = DB_INTERRUPTED; - goto func_exit; - } - cnt = 1000; - } + ret = row_search_mvcc(buf, PAGE_CUR_WITHIN, prebuilt, 0, direction); + direction = ROW_SEL_NEXT; switch (ret) { case DB_SUCCESS: @@ -6059,12 +6053,778 @@ func_exit: return(ret); } - *n_rows = *n_rows + 1; + ++*n_rows; + goto loop; +} + +/** Check if a version of a clustered index record and a secondary +index record match. + +@param prebuilt index and transaction +@param clust_rec a version of a clustered index record +@param clust_index clustered index +@param clust_offsets rec_get_offsets(clust_rec, clust_index) +@param rec secondary index leaf page record +@param offsets rec_get_offsets(rec, index) +@return an error code +@retval DB_SUCCESS if rec matches clust_rec +@retval DB_SUCCESS_LOCKED_REC if rec does not match clust_rec +*/ +static dberr_t row_check_index_match(row_prebuilt_t *prebuilt, + const rec_t *clust_rec, + const dict_index_t *clust_index, + const rec_offs *clust_offsets, + const rec_t *rec, + const dict_index_t *index, + const rec_offs *offsets) +{ + ut_ad(index == prebuilt->index); - ret = row_search_for_mysql( - buf, PAGE_CUR_WITHIN, prebuilt, 0, ROW_SEL_NEXT); + ib_vcol_row vc(index->has_virtual() ? mem_heap_create(256) : nullptr); - goto loop; + const uint16_t n= index->n_user_defined_cols; + + for (uint16_t i= 0; i < n; i++) + { + ulint pos= 0; + ulint len, sec_len; + + const dict_field_t &ifield= index->fields[i]; + const byte *sec_field= rec_get_nth_field(rec, offsets, i, &sec_len); + const byte *field; + + if (ifield.col->is_virtual()) + { + /* Virtual column values must be reconstructed from the base columns. 
*/ + row_ext_t *ext; + byte *record= vc.record(prebuilt->trx->mysql_thd, clust_index, + &prebuilt->m_mysql_table); + const dict_v_col_t *v_col= reinterpret_cast<const dict_v_col_t*> + (ifield.col); + dtuple_t *row= row_build(ROW_COPY_POINTERS, + clust_index, clust_rec, clust_offsets, + nullptr, nullptr, nullptr, &ext, vc.heap); + if (dfield_t *vfield= + innobase_get_computed_value(row, v_col, clust_index, &vc.heap, + nullptr, nullptr, + prebuilt->trx->mysql_thd, + prebuilt->m_mysql_table, + record, nullptr, nullptr)) + { + len= vfield->len; + field= static_cast<byte*>(vfield->data); + } + else + { + innobase_report_computed_value_failed(row); + return DB_COMPUTE_VALUE_FAILED; + } + } + else + { + pos= dict_col_get_clust_pos(ifield.col, clust_index); + field= rec_get_nth_cfield(clust_rec, clust_index, clust_offsets, pos, + &len); + if (len == UNIV_SQL_NULL) + { + if (sec_len == UNIV_SQL_NULL) + continue; + return DB_SUCCESS_LOCKED_REC; + } + if (sec_len == UNIV_SQL_NULL) + return DB_SUCCESS_LOCKED_REC; + + if (rec_offs_nth_extern(clust_offsets, pos)) + { + if (len == BTR_EXTERN_FIELD_REF_SIZE) + goto compare_blobs; + len-= BTR_EXTERN_FIELD_REF_SIZE; + } + + if (ifield.prefix_len) + { + len= + dtype_get_at_most_n_mbchars(ifield.col->prtype, ifield.col->mbminlen, + ifield.col->mbmaxlen, + ifield.prefix_len, len, + reinterpret_cast<const char*>(field)); + if (len < sec_len) + goto check_for_blob; + } + else + { +check_for_blob: + if (rec_offs_nth_extern(clust_offsets, pos)) + { +compare_blobs: + if (!row_sel_sec_rec_is_for_blob(ifield.col->mtype, + ifield.col->prtype, + ifield.col->mbminlen, + ifield.col->mbmaxlen, + field, len, sec_field, sec_len, + ifield.prefix_len, + clust_index->table)) + return DB_SUCCESS_LOCKED_REC; + continue; + } + } + } + + if (cmp_data(ifield.col->mtype, ifield.col->prtype, false, + field, len, sec_field, sec_len)) + return DB_SUCCESS_LOCKED_REC; + } + + return DB_SUCCESS; +} + +/** +Check the index records in CHECK TABLE. +The index must contain entries in an ascending order, +unique constraint must not be violated by duplicated keys, +and the number of index entries is counted in according to the +current read view. + +@param prebuilt index and transaction +@param n_rows number of records counted + +@return error code +@retval DB_SUCCESS if no error was found */ +dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows) +{ + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + *n_rows= 0; + dict_index_t *const index= prebuilt->index; + + prebuilt->fetch_direction= ROW_SEL_NEXT; + + if (!index->is_btree()) + return DB_CORRUPTION; + + mem_heap_t *heap= mem_heap_create(100); + + dtuple_t *prev_entry= nullptr; + mtr_t mtr; + mtr.start(); + + dict_index_t *clust_index= dict_table_get_first_index(prebuilt->table); + + dberr_t err= btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, + prebuilt->pcur, false, 0, &mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + { +func_exit: + mtr.commit(); + mem_heap_free(heap); + return err; + } + + if (const trx_id_t bulk_trx_id= index->table->bulk_trx_id) + if (!prebuilt->trx->read_view.changes_visible(bulk_trx_id)) + goto func_exit; + + ReadView check_table_extended_view; + ReadView &view= + prebuilt->need_to_access_clustered && + !prebuilt->table->is_temporary() && + prebuilt->trx->isolation_level != TRX_ISO_READ_UNCOMMITTED + ? 
check_table_extended_view : prebuilt->trx->read_view; + if (&view == &check_table_extended_view) + check_table_extended_view.set_creator_trx_id(prebuilt->trx->id); + +page_loop: + if (&view == &check_table_extended_view) + /* In CHECK TABLE...EXTENDED, we make a copy of purge_sys.end_view + while holding a shared latch on the index leaf page. + Should a currently active purge batch desire to remove any further + records from this page, it would be blocked by our page latch. + + We will consult check_table_extended_view to determine if a + clustered index record corresponding to a secondary index record + is visible to the current purge batch. Right after we have made our + copy, purge_sys.end_view is free to be changed again. + + If we have an orphan secondary index record, we may attempt to + request a clustered index record version that cannot be retrieved + any more because the undo log records may have been freed + (according to the purge_sys.end_view). In such a case, + trx_undo_get_undo_rec() would cause + trx_undo_prev_version_build() and trx_undo_prev_version_build() + to return DB_MISSING_HISTORY. */ + static_cast<ReadViewBase&>(check_table_extended_view)= + purge_sys_t::end_view_guard{}.view(); + +rec_loop: + ut_ad(err == DB_SUCCESS); + + if (!btr_pcur_move_to_next_on_page(prebuilt->pcur)) + { + err= DB_CORRUPTION; + goto func_exit; + } + + const rec_t *rec= btr_pcur_get_rec(prebuilt->pcur); + rec_offs *offsets= offsets_; + + if (page_rec_is_supremum(rec)) + { + next_page: + if (btr_pcur_is_after_last_in_tree(prebuilt->pcur)) + goto func_exit; + err= btr_pcur_move_to_next_page(prebuilt->pcur, &mtr); + if (err == DB_SUCCESS && trx_is_interrupted(prebuilt->trx)) + err= DB_INTERRUPTED; + if (UNIV_UNLIKELY(err != DB_SUCCESS)) + goto func_exit; + goto page_loop; + } + + offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + const auto info_bits= + rec_get_info_bits(rec, prebuilt->table->not_redundant()); + const bool rec_deleted= info_bits & REC_INFO_DELETED_FLAG; + + if (UNIV_UNLIKELY(info_bits & REC_INFO_MIN_REC_FLAG)) + { + if (*n_rows || !index->is_instant()) + { + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: invalid record encountered"); + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + } + goto next_rec; + } + + if (index->is_clust()) + { + if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) + { + if (!rec_deleted) + goto count_row; + goto next_rec; + } + + trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets); + + if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + { + invalid_trx_id: + if (prebuilt->autoinc_error == DB_SUCCESS) + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: DB_TRX_ID=" TRX_ID_FMT + " exceeds the system-wide maximum", + rec_trx_id); + prebuilt->autoinc_error= DB_CORRUPTION; + goto next_rec; + } + + if (!prebuilt->trx->read_view.changes_visible(rec_trx_id)) + { + ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN); + rec_t *old_vers; + /* The following call returns 'offsets' associated with 'old_vers' */ + err= row_sel_build_prev_vers_for_mysql(prebuilt, index, rec, &offsets, + &heap, &old_vers, nullptr, &mtr); + + if (err != DB_SUCCESS) + goto func_exit; + + if (old_vers) + { + rec= old_vers; + rec_trx_id= row_get_rec_trx_id(rec, index, offsets); + + if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() 
&& + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + goto invalid_trx_id; + + if (!rec_get_deleted_flag(rec, prebuilt->table->not_redundant())) + goto count_row; + } + else + offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields, + ULINT_UNDEFINED, &heap); + goto next_rec; + } + else if (!rec_deleted && !rec_trx_id); + else if (!check_table_extended_view.changes_visible(rec_trx_id)); + else if (prebuilt->autoinc_error == DB_SUCCESS) + { + const char *msg= rec_deleted + ? "Unpurged clustered index record" + : "Clustered index record with stale history"; + + ib::warn w; + w << msg << " in table " << index->table->name << ": " + << rec_offsets_print(rec, offsets); + prebuilt->autoinc_error= DB_MISSING_HISTORY; + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, "InnoDB: %s", w.m_oss.str().c_str()); + } + + if (!rec_deleted) + goto count_row; + + goto next_rec; + } + else if (const trx_id_t page_trx_id= page_get_max_trx_id(page_align(rec))) + { + if (page_trx_id >= trx_sys.get_max_trx_id()) + goto invalid_PAGE_MAX_TRX_ID; + if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED); + else if (&view == &check_table_extended_view || rec_deleted || + !view.sees(page_trx_id)) + { + bool got_extended_match= &view == &check_table_extended_view; + const auto savepoint= mtr.get_savepoint(); + + row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, index, offsets); + err= btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, &mtr); + if (err != DB_SUCCESS) + goto func_exit; + + const rec_t *clust_rec= btr_pcur_get_rec(prebuilt->clust_pcur); + + /* Note: only if the search ends up on a non-infimum record is the + low_match value the real match to the search tuple */ + + if (!page_rec_is_user_rec(clust_rec) || + btr_pcur_get_low_match(prebuilt->clust_pcur) < clust_index->n_uniq) + { + if (!rec_deleted) + { + not_found: + /* MDEV-29823 FIXME: There is a race condition between + rollback, purge, and possibly other SQL connections that + are creating and releasing read views. At the time + row_undo_mod_del_mark_or_remove_sec_low() is executing + rollback on a secondary index record, purge_sys.view + may not allow it to delete the record, and it will be + delete-marked. Eventually purge_sys.view would advance, + but the delete-marked record could never be removed, + because no undo log record was ever added to + the purge queue by trx_purge_add_undo_to_history(). + + For now, we will not flag an error about orphan secondary index + records that are delete-marked; we will only warn about them. */ + + if (!rec_deleted || prebuilt->autoinc_error == DB_SUCCESS) + { + ib::error_or_warn w(!rec_deleted); + w << "Clustered index record not found for index " + << index->name << " of table " << index->table->name + << ": " << rec_offsets_print(rec, offsets); + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, "InnoDB: %s", + w.m_oss.str().c_str()); + } + + if (prebuilt->autoinc_error == DB_SUCCESS) + prebuilt->autoinc_error= rec_deleted + ? 
DB_MISSING_HISTORY + : DB_CORRUPTION; + } + else if (&view == &check_table_extended_view) + extended_not_found: + if (view.changes_visible(page_trx_id)) + goto not_found; + did_not_find: + mtr.rollback_to_savepoint(savepoint); + goto next_rec; + } + + rec_offs *clust_offsets; + trx_id_t rec_trx_id; + rec_t *old_vers= nullptr; + + bool found_in_view= false; + trx_id_t visible_trx_id= ~0ULL; + + if (ulint trx_id_offset= clust_index->trx_id_offset) + { + clust_offsets= nullptr; + read_trx_id: + rec_trx_id= trx_read_trx_id(clust_rec + trx_id_offset); + + if (clust_rec[trx_id_offset + DATA_TRX_ID_LEN] & 0x80) + { + if (UNIV_UNLIKELY + (rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()))) + { + err= DB_CORRUPTION; + goto func_exit; + } + + /* This is the oldest available record version (fresh insert). */ + if (!view.changes_visible(rec_trx_id)) + { + if (rec_trx_id >= view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + goto invalid_rec_trx_id; + if (got_extended_match) + goto check_latest_version; + goto did_not_find; + } + } + } + else + { + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + ulint trx_id_pos= clust_index->n_uniq ? clust_index->n_uniq : 1; + ulint len; + trx_id_offset= rec_get_nth_field_offs(clust_offsets, trx_id_pos, &len); + ut_ad(len == DATA_TRX_ID_LEN); + goto read_trx_id; + } + + if (got_extended_match) + { + check_latest_version: + /* In CHECK TABLE...EXTENDED, always check if the secondary + index record matches the latest clustered index record + version, no matter if it is visible in our own read view. + + If the latest clustered index version is delete-marked and + purgeable, it is not safe to fetch any BLOBs for column prefix + indexes because they may already have been freed. */ + if (rec_trx_id && + rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()) && + purge_sys.is_purgeable(rec_trx_id)) + goto did_not_find; + + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + err= row_check_index_match(prebuilt, + clust_rec, clust_index, clust_offsets, + rec, index, offsets); + + switch (err) { + default: + goto func_exit; + case DB_SUCCESS_LOCKED_REC: + case DB_SUCCESS: + break; + } + + got_extended_match= err == DB_SUCCESS; + err= DB_SUCCESS; + + if (!prebuilt->trx->read_view.changes_visible(rec_trx_id)) + /* While CHECK TABLE ... EXTENDED checks for a matching + clustered index record version for each secondary index + record, it must count only those records that belong to its + own read view. + + If the latest version of clust_rec matches rec but is not + in our read view, there may still be an older version of + clust_rec that not only matches rec but is in our view. + We must evaluate old versions before deciding whether rec + should be counted. */ + goto check_old_vers; + + /* Remember that this is the visible clust_rec for rec, + and whether it matches rec. 
*/ + visible_trx_id= rec_trx_id; + found_in_view= got_extended_match && + !rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant()); + + if (!got_extended_match) + goto check_old_vers; + + if (!found_in_view) + goto did_not_find; + + found_match: + mtr.rollback_to_savepoint(savepoint); + goto count_row; + } + else if (!view.changes_visible(rec_trx_id)) + { + check_old_vers: + if (rec_trx_id >= view.low_limit_id() && + UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id())) + { + invalid_rec_trx_id: + if (prebuilt->autoinc_error == DB_SUCCESS) + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: DB_TRX_ID=" TRX_ID_FMT + " exceeds the system-wide maximum", + rec_trx_id); + goto not_found; + } + + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + row_sel_reset_old_vers_heap(prebuilt); + /* The following is adapted from row_vers_build_for_consistent_read() + because when using check_table_extended_view, we must + consider every available version of the clustered index record. */ + mem_heap_t *vers_heap= nullptr; + + for (;;) + { + mem_heap_t *prev_heap= vers_heap; + vers_heap= mem_heap_create(1024); + err= trx_undo_prev_version_build(clust_rec, + clust_index, clust_offsets, + vers_heap, &old_vers, + nullptr, nullptr, 0); + if (prev_heap) + mem_heap_free(prev_heap); + if (err != DB_SUCCESS) + { + old_vers_err: + mem_heap_free(vers_heap); + if (err == DB_MISSING_HISTORY) + { + err= DB_SUCCESS; + if (got_extended_match) + goto did_not_find; + goto not_found; + } + goto func_exit; + } + + if (UNIV_UNLIKELY(!old_vers)) + { + mem_heap_free(vers_heap); + /* We did not find a matching clustered index record version + for the secondary index record. Normal CHECK TABLE will simply + not count the secondary index record; CHECK TABLE ... EXTENDED + will flag such orphan records if appropriate. + + A secondary index record may may be "temporarily orphan" + if purge is in progress. We will only flag them if + everything up to PAGE_MAX_TRX_ID has been fully purged. + + "Temporary orphans" may be produced when + row_undo_mod_clust() resets the DB_TRX_ID of the latest + clust_rec version or when trx_undo_prev_version_build() + encounters a BLOB that may have been freed according to + purge_sys.view (not purge_sys.end_view). */ + if (&view == &check_table_extended_view && !got_extended_match) + goto extended_not_found; + goto did_not_find; + } + + clust_rec= old_vers; + clust_offsets= rec_get_offsets(clust_rec, clust_index, clust_offsets, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + rec_trx_id= row_get_rec_trx_id(clust_rec, clust_index, + clust_offsets); + + if (UNIV_UNLIKELY(rec_trx_id >= + prebuilt->trx->read_view.low_limit_id() && + rec_trx_id >= trx_sys.get_max_trx_id())) + { + mem_heap_free(vers_heap); + goto invalid_rec_trx_id; + } + + const bool rec_visible= + prebuilt->trx->read_view.changes_visible(rec_trx_id); + const bool clust_rec_deleted= + rec_get_deleted_flag(clust_rec, prebuilt->table->not_redundant()); + + if (&view != &prebuilt->trx->read_view) + { + /* It is not safe to fetch BLOBs of committed delete-marked + records that may have been freed in purge. */ + err= clust_rec_deleted && rec_trx_id && + purge_sys.is_purgeable(rec_trx_id) + ? 
DB_SUCCESS_LOCKED_REC + : row_check_index_match(prebuilt, + clust_rec, clust_index, clust_offsets, + rec, index, offsets); + + switch (err) { + default: + goto old_vers_err; + case DB_SUCCESS_LOCKED_REC: + if (rec_visible && !~visible_trx_id) + visible_trx_id= rec_trx_id; + continue; + case DB_SUCCESS: + got_extended_match= true; + if (!rec_visible) + continue; + if (!~visible_trx_id) + { + visible_trx_id= rec_trx_id; + found_in_view= !clust_rec_deleted; + } + mem_heap_free(vers_heap); + if (!found_in_view) + goto did_not_find; + goto found_match; + } + } + else if (rec_visible) + { + if (!clust_rec_deleted) + { + clust_rec= rec_copy(mem_heap_alloc(heap, + rec_offs_size(clust_offsets)), + clust_rec, clust_offsets); + rec_offs_make_valid(clust_rec, clust_index, true, clust_offsets); + } + mem_heap_free(vers_heap); + if (clust_rec_deleted) + goto did_not_find; + goto check_match; + } + } + } + else if (rec_get_deleted_flag(clust_rec, + prebuilt->table->not_redundant())) + goto did_not_find; + + ut_ad(clust_rec); + ut_ad(&view != &check_table_extended_view); + + /* If we had to go to an earlier version of row or the secondary + index record is delete marked, then it may be that the secondary + index record corresponding to clust_rec (or old_vers) is not + rec; in that case we must ignore such row because in our + snapshot rec would not have existed. Remember that from rec we + cannot see directly which transaction id corresponds to it: we + have to go to the clustered index record. A query where we want + to fetch all rows where the secondary index value is in some + interval would return a wrong result if we would not drop rows + which we come to visit through secondary index records that + would not really exist in our snapshot. */ + + if (rec_deleted) + { + if (!clust_offsets) + clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + check_match: + /* This clustered index record version exists in + prebuilt->trx->read_view and is not delete-marked. + By design, any BLOBs in it are not allowed to be + freed in the purge of committed transaction history. 
*/ + err= row_check_index_match(prebuilt, clust_rec, clust_index, + clust_offsets, rec, index, offsets); + switch (err) { + case DB_SUCCESS: + break; + case DB_SUCCESS_LOCKED_REC: + err= DB_SUCCESS; + goto did_not_find; + default: + goto func_exit; + } + } + + mtr.rollback_to_savepoint(savepoint); + } + } + else + { + invalid_PAGE_MAX_TRX_ID: + if (UNIV_LIKELY(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN)) + { + push_warning_printf(prebuilt->trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: Invalid PAGE_MAX_TRX_ID=%llu" + " in index '%-.200s'", + page_trx_id, index->name()); + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + } + goto next_rec; + } + +count_row: + ++*n_rows; + + if (prev_entry) + { + ulint matched_fields= 0; + int cmp= cmp_dtuple_rec_with_match(prev_entry, rec, index, offsets, + &matched_fields); + const char* msg; + + if (UNIV_LIKELY(cmp < 0)); + else if (cmp > 0) + { + prebuilt->autoinc_error= DB_INDEX_CORRUPT; + msg= "index records in a wrong order in "; +not_ok: + ib::error() << msg << index->name << " of table " << index->table->name + << ": " << *prev_entry << ", " + << rec_offsets_print(rec, offsets); + } + else if (index->is_unique() && matched_fields >= + dict_index_get_n_ordering_defined_by_user(index)) + { + /* NULL values in unique indexes are considered not to be duplicates */ + for (ulint i= 0; i < dict_index_get_n_ordering_defined_by_user(index); + i++) + if (dfield_is_null(dtuple_get_nth_field(prev_entry, i))) + goto next_rec; + + if (prebuilt->autoinc_error == DB_SUCCESS) + prebuilt->autoinc_error= DB_DUPLICATE_KEY; + msg= "duplicate key in "; + goto not_ok; + } + } + +next_rec: + ut_ad(err == DB_SUCCESS); + + { + mem_heap_t *tmp_heap= nullptr; + + /* Empty the heap on each round. But preserve offsets[] + for the row_rec_to_index_entry() call, by copying them + into a separate memory heap when needed. */ + if (UNIV_UNLIKELY(offsets != offsets_)) + { + ulint size= rec_offs_get_n_alloc(offsets) * sizeof *offsets; + tmp_heap= mem_heap_create(size); + offsets= static_cast<rec_offs*>(mem_heap_dup(tmp_heap, offsets, size)); + } + + mem_heap_empty(heap); + prev_entry= row_rec_to_index_entry(rec, index, offsets, heap); + + if (UNIV_LIKELY_NULL(tmp_heap)) + mem_heap_free(tmp_heap); + } + + if (btr_pcur_is_after_last_on_page(prebuilt->pcur)) + goto next_page; + + goto rec_loop; } /*******************************************************************//** diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc index 91925219ea8..cca44f01920 100644 --- a/storage/innobase/row/row0umod.cc +++ b/storage/innobase/row/row0umod.cc @@ -216,26 +216,23 @@ static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index) } /** Determine if rollback must execute a purge-like operation. 
-@param[in,out] node row undo -@param[in,out] mtr mini-transaction +@param node row undo @return whether the record should be purged */ -static bool row_undo_mod_must_purge(undo_node_t* node, mtr_t* mtr) +static bool row_undo_mod_must_purge(const undo_node_t &node) { - ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC); - ut_ad(!node->table->is_temporary()); + ut_ad(node.rec_type == TRX_UNDO_UPD_DEL_REC); + ut_ad(!node.table->is_temporary()); - btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&node->pcur); - ut_ad(btr_cur->index->is_primary()); - DEBUG_SYNC_C("rollback_purge_clust"); + const btr_cur_t &btr_cur= node.pcur.btr_cur; + ut_ad(btr_cur.index->is_primary()); + DEBUG_SYNC_C("rollback_purge_clust"); - if (!purge_sys.changes_visible(node->new_trx_id, node->table->name)) { - return false; - } + if (!purge_sys.is_purgeable(node.new_trx_id)) + return false; - const rec_t* rec = btr_cur_get_rec(btr_cur); - - return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur->index)) - == node->new_trx_id; + const rec_t *rec= btr_cur_get_rec(&btr_cur); + return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur.index)) == + node.new_trx_id; } /***********************************************************//** @@ -251,7 +248,6 @@ row_undo_mod_clust( { btr_pcur_t* pcur; mtr_t mtr; - bool have_latch = false; dberr_t err; dict_index_t* index; @@ -347,9 +343,7 @@ row_undo_mod_clust( btr_pcur_commit_specify_mtr(pcur, &mtr); } else { index->set_modified(mtr); - have_latch = true; - purge_sys.latch.rd_lock(SRW_LOCK_CALL); - if (!row_undo_mod_must_purge(node, &mtr)) { + if (!row_undo_mod_must_purge(*node)) { goto mtr_commit_exit; } err = btr_cur_optimistic_delete(&pcur->btr_cur, 0, @@ -358,9 +352,7 @@ row_undo_mod_clust( goto mtr_commit_exit; } err = DB_SUCCESS; - purge_sys.latch.rd_unlock(); btr_pcur_commit_specify_mtr(pcur, &mtr); - have_latch = false; } mtr.start(); @@ -376,9 +368,7 @@ row_undo_mod_clust( if (index->table->is_temporary()) { mtr.set_log_mode(MTR_LOG_NO_REDO); } else { - have_latch = true; - purge_sys.latch.rd_lock(SRW_LOCK_CALL); - if (!row_undo_mod_must_purge(node, &mtr)) { + if (!row_undo_mod_must_purge(*node)) { goto mtr_commit_exit; } index->set_modified(mtr); @@ -400,17 +390,12 @@ row_undo_mod_clust( mtr.start(); if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) - != btr_pcur_t::SAME_ALL) { - goto mtr_commit_exit; - } - rec_t* rec = btr_pcur_get_rec(pcur); - have_latch = true; - purge_sys.latch.rd_lock(SRW_LOCK_CALL); - if (!purge_sys.changes_visible(node->new_trx_id, - node->table->name)) { + != btr_pcur_t::SAME_ALL + || !purge_sys.is_purgeable(node->new_trx_id)) { goto mtr_commit_exit; } + rec_t* rec = btr_pcur_get_rec(pcur); ulint trx_id_offset = index->trx_id_offset; ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1; /* Reserve enough offsets for the PRIMARY KEY and @@ -477,10 +462,6 @@ row_undo_mod_clust( } mtr_commit_exit: - if (have_latch) { - purge_sys.latch.rd_unlock(); - } - btr_pcur_commit_specify_mtr(pcur, &mtr); func_exit: diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 61ac22ca27a..26c434ca474 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -469,46 +469,6 @@ row_upd_changes_field_size_or_external( return(FALSE); } -/***********************************************************//** -Returns true if row update contains disowned external fields. -@return true if the update contains disowned external fields. 
*/ -bool -row_upd_changes_disowned_external( -/*==============================*/ - const upd_t* update) /*!< in: update vector */ -{ - const upd_field_t* upd_field; - const dfield_t* new_val; - ulint new_len; - ulint n_fields; - ulint i; - - n_fields = upd_get_n_fields(update); - - for (i = 0; i < n_fields; i++) { - const byte* field_ref; - - upd_field = upd_get_nth_field(update, i); - new_val = &(upd_field->new_val); - new_len = dfield_get_len(new_val); - - if (!dfield_is_ext(new_val)) { - continue; - } - - ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE); - - field_ref = static_cast<const byte*>(dfield_get_data(new_val)) - + new_len - BTR_EXTERN_FIELD_REF_SIZE; - - if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) { - return(true); - } - } - - return(false); -} - /***************************************************************//** Builds an update vector from those fields which in a secondary index entry differ from a record that has the equal ordering fields. NOTE: we compare diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc index acba98ceb05..a4b52fd2a2f 100644 --- a/storage/innobase/row/row0vers.cc +++ b/storage/innobase/row/row0vers.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. +Copyright (c) 2017, 2022, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -104,6 +104,9 @@ row_vers_impl_x_locked_low( DBUG_ENTER("row_vers_impl_x_locked_low"); ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(clust_rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); if (ulint trx_id_offset = clust_index->trx_id_offset) { trx_id = mach_read_from_6(clust_rec + trx_id_offset); @@ -190,7 +193,7 @@ row_vers_impl_x_locked_low( heap = mem_heap_create(1024); trx_undo_prev_version_build( - clust_rec, mtr, version, clust_index, clust_offsets, + version, clust_index, clust_offsets, heap, &prev_version, NULL, dict_index_has_virtual(index) ? 
&vrow : NULL, 0); @@ -527,6 +530,10 @@ row_vers_build_cur_vrow_low( = DATA_MISSING; } + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + version = rec; /* If this is called by purge thread, set TRX_UNDO_PREV_IN_PURGE @@ -543,7 +550,7 @@ row_vers_build_cur_vrow_low( version, clust_index, clust_offsets); trx_undo_prev_version_build( - rec, mtr, version, clust_index, clust_offsets, + version, clust_index, clust_offsets, heap, &prev_version, NULL, vrow, status); if (heap2) { @@ -643,6 +650,10 @@ row_vers_vc_matches_cluster( /* First compare non-virtual columns (primary keys) */ ut_ad(index->n_fields == n_fields); ut_ad(n_fields == dtuple_get_n_fields(icentry)); + ut_ad(mtr->memo_contains_page_flagged(rec, + MTR_MEMO_PAGE_S_FIX + | MTR_MEMO_PAGE_X_FIX)); + { const dfield_t* a = ientry->fields; const dfield_t* b = icentry->fields; @@ -684,7 +695,7 @@ row_vers_vc_matches_cluster( ut_ad(roll_ptr != 0); trx_undo_prev_version_build( - rec, mtr, version, clust_index, clust_offsets, + version, clust_index, clust_offsets, heap, &prev_version, NULL, vrow, TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE); @@ -849,7 +860,7 @@ static bool dtuple_coll_eq(const dtuple_t &tuple1, const dtuple_t &tuple2) } /** Finds out if a version of the record, where the version >= the current -purge view, should have ientry as its secondary index entry. We check +purge_sys.view, should have ientry as its secondary index entry. We check if there is any not delete marked version of the record where the trx id >= purge view, and the secondary index entry == ientry; exactly in this case we return TRUE. @@ -1031,11 +1042,12 @@ unsafe_to_purge: heap = mem_heap_create(1024); vrow = NULL; - trx_undo_prev_version_build(rec, mtr, version, + trx_undo_prev_version_build(version, clust_index, clust_offsets, - heap, &prev_version, NULL, + heap, &prev_version, nullptr, dict_index_has_virtual(index) - ? &vrow : NULL, 0); + ? &vrow : nullptr, + TRX_UNDO_CHECK_PURGEABILITY); mem_heap_free(heap2); /* free version and clust_offsets */ if (!prev_version) { @@ -1114,7 +1126,9 @@ unsafe_to_purge: Constructs the version of a clustered index record which a consistent read should see. We assume that the trx id stored in rec is such that the consistent read should not see rec in its present version. -@return DB_SUCCESS or DB_MISSING_HISTORY */ +@return error code +@retval DB_SUCCESS if a previous version was fetched +@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */ dberr_t row_vers_build_for_consistent_read( /*===============================*/ @@ -1154,7 +1168,7 @@ row_vers_build_for_consistent_read( trx_id = row_get_rec_trx_id(rec, index, *offsets); - ut_ad(!view->changes_visible(trx_id, index->table->name)); + ut_ad(!view->changes_visible(trx_id)); ut_ad(!vrow || !(*vrow)); @@ -1172,12 +1186,10 @@ row_vers_build_for_consistent_read( /* If purge can't see the record then we can't rely on the UNDO log record. */ - bool purge_sees = trx_undo_prev_version_build( - rec, mtr, version, index, *offsets, heap, + err = trx_undo_prev_version_build( + version, index, *offsets, heap, &prev_version, NULL, vrow, 0); - err = (purge_sees) ? 
DB_SUCCESS : DB_MISSING_HISTORY; - if (prev_heap != NULL) { mem_heap_free(prev_heap); } @@ -1199,7 +1211,7 @@ row_vers_build_for_consistent_read( trx_id = row_get_rec_trx_id(prev_version, index, *offsets); - if (view->changes_visible(trx_id, index->table->name)) { + if (view->changes_visible(trx_id)) { /* The view already sees this version: we can copy it to in_heap and return */ @@ -1216,8 +1228,11 @@ row_vers_build_for_consistent_read( dtuple_dup_v_fld(*vrow, in_heap); } break; + } else if (trx_id >= view->low_limit_id() + && trx_id >= trx_sys.get_max_trx_id()) { + err = DB_CORRUPTION; + break; } - version = prev_version; } @@ -1334,10 +1349,9 @@ committed_version_trx: heap2 = heap; heap = mem_heap_create(1024); - if (!trx_undo_prev_version_build(rec, mtr, version, index, - *offsets, heap, - &prev_version, - in_heap, vrow, 0)) { + if (trx_undo_prev_version_build(version, index, *offsets, heap, + &prev_version, in_heap, vrow, + 0) != DB_SUCCESS) { mem_heap_free(heap); heap = heap2; heap2 = NULL; diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index 1707f2885ba..7d52894051d 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -42,10 +42,6 @@ Created 3/26/1996 Heikki Tuuri #include <unordered_map> -#ifdef UNIV_PFS_RWLOCK -extern mysql_pfs_key_t trx_purge_latch_key; -#endif /* UNIV_PFS_RWLOCK */ - /** Maximum allowable purge history length. <=0 means 'infinite'. */ ulong srv_max_purge_lag = 0; @@ -184,6 +180,7 @@ void purge_sys_t::create() hdr_page_no= 0; hdr_offset= 0; latch.SRW_LOCK_INIT(trx_purge_latch_key); + end_latch.init(); mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr); truncate.current= NULL; truncate.last= NULL; @@ -205,11 +202,40 @@ void purge_sys_t::close() trx->state= TRX_STATE_NOT_STARTED; trx->free(); latch.destroy(); + end_latch.destroy(); mysql_mutex_destroy(&pq_mutex); mem_heap_free(heap); heap= nullptr; } +/** Determine if the history of a transaction is purgeable. +@param trx_id transaction identifier +@return whether the history is purgeable */ +TRANSACTIONAL_TARGET bool purge_sys_t::is_purgeable(trx_id_t trx_id) const +{ + bool purgeable; +#if !defined SUX_LOCK_GENERIC && !defined NO_ELISION + purgeable= false; + if (xbegin()) + { + if (!latch.is_write_locked()) + { + purgeable= view.changes_visible(trx_id); + xend(); + } + else + xabort(); + } + else +#endif + { + latch.rd_lock(SRW_LOCK_CALL); + purgeable= view.changes_visible(trx_id); + latch.rd_unlock(); + } + return purgeable; +} + /*================ UNDO LOG HISTORY LIST =============================*/ /** Prepend the history list with an undo log. @@ -1199,7 +1225,6 @@ trx_purge_attach_undo_recs(ulint n_purge_threads) i = 0; - const ulint batch_size = srv_purge_batch_size; std::unordered_map<table_id_t, purge_node_t*> table_id_map; mem_heap_empty(purge_sys.heap); @@ -1251,7 +1276,7 @@ trx_purge_attach_undo_recs(ulint n_purge_threads) node->undo_recs.push(purge_rec); - if (n_pages_handled >= batch_size) { + if (n_pages_handled >= srv_purge_batch_size) { break; } } @@ -1303,14 +1328,14 @@ extern tpool::waitable_task purge_worker_task; /** Wait for pending purge jobs to complete. 
*/ static void trx_purge_wait_for_workers_to_complete() { - bool notify_wait = purge_worker_task.is_running(); + const bool notify_wait{purge_worker_task.is_running()}; if (notify_wait) - tpool::tpool_wait_begin(); + tpool::tpool_wait_begin(); purge_worker_task.wait(); - if(notify_wait) + if (notify_wait) tpool::tpool_wait_end(); /* There should be no outstanding tasks as long @@ -1318,12 +1343,33 @@ static void trx_purge_wait_for_workers_to_complete() ut_ad(srv_get_task_queue_length() == 0); } +/** Update end_view at the end of a purge batch. */ +TRANSACTIONAL_INLINE void purge_sys_t::clone_end_view() +{ + /* This is only invoked only by the purge coordinator, + which is the only thread that can modify our inputs head, tail, view. + Therefore, we only need to protect end_view from concurrent reads. */ + + /* Limit the end_view similar to what trx_purge_truncate_history() does. */ + const trx_id_t trx_no= head.trx_no ? head.trx_no : tail.trx_no; +#ifdef SUX_LOCK_GENERIC + end_latch.wr_lock(); +#else + transactional_lock_guard<srw_spin_lock_low> g(end_latch); +#endif + end_view= view; + end_view.clamp_low_limit_id(trx_no); +#ifdef SUX_LOCK_GENERIC + end_latch.wr_unlock(); +#endif +} + /** Run a purge batch. @param n_tasks number of purge tasks to submit to the queue @param truncate whether to truncate the history at the end of the batch @return number of undo log pages handled in the batch */ -ulint trx_purge(ulint n_tasks, bool truncate) +TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, bool truncate) { que_thr_t* thr = NULL; ulint n_pages_handled; @@ -1357,6 +1403,8 @@ ulint trx_purge(ulint n_tasks, bool truncate) trx_purge_wait_for_workers_to_complete(); + purge_sys.clone_end_view(); + if (truncate) { trx_purge_truncate_history(); } diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index 0ced0c91c12..e70516a2d2d 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -2076,51 +2076,49 @@ trx_undo_get_undo_rec_low( return undo_rec; } -/** Copy an undo record to heap. -@param[in] roll_ptr roll pointer to record -@param[in,out] heap memory heap where copied -@param[in] trx_id id of the trx that generated - the roll pointer: it points to an - undo log of this transaction -@param[in] name table name -@param[out] undo_rec own: copy of the record -@retval true if the undo log has been -truncated and we cannot fetch the old version -@retval false if the undo log record is available -NOTE: the caller must have latches on the clustered index page. */ -static MY_ATTRIBUTE((warn_unused_result)) -bool -trx_undo_get_undo_rec( - roll_ptr_t roll_ptr, - mem_heap_t* heap, - trx_id_t trx_id, - const table_name_t& name, - trx_undo_rec_t** undo_rec) +/** Copy an undo record to heap, to check if a secondary index record +can be safely purged. 
+@param trx_id DB_TRX_ID corresponding to roll_ptr +@param name table name +@param roll_ptr DB_ROLL_PTR pointing to the undo log record +@param heap memory heap for allocation +@return copy of the record +@retval nullptr if the version is visible to purge_sys.view */ +static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id, + const table_name_t &name, + roll_ptr_t roll_ptr, + mem_heap_t* heap) { - purge_sys.latch.rd_lock(SRW_LOCK_CALL); - - bool missing_history = purge_sys.changes_visible(trx_id, name); - if (!missing_history) { - *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); - missing_history = !*undo_rec; - } - - purge_sys.latch.rd_unlock(); - - return missing_history; + { + purge_sys_t::view_guard check; + if (!check.view().changes_visible(trx_id)) + return trx_undo_get_undo_rec_low(roll_ptr, heap); + } + return nullptr; } -#ifdef UNIV_DEBUG -#define ATTRIB_USED_ONLY_IN_DEBUG -#else /* UNIV_DEBUG */ -#define ATTRIB_USED_ONLY_IN_DEBUG MY_ATTRIBUTE((unused)) -#endif /* UNIV_DEBUG */ +/** Copy an undo record to heap. +@param trx_id DB_TRX_ID corresponding to roll_ptr +@param name table name +@param roll_ptr DB_ROLL_PTR pointing to the undo log record +@param heap memory heap for allocation +@return copy of the record +@retval nullptr if the undo log is not available */ +static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id, + const table_name_t &name, + roll_ptr_t roll_ptr, + mem_heap_t *heap) +{ + { + purge_sys_t::end_view_guard check; + if (!check.view().changes_visible(trx_id)) + return trx_undo_get_undo_rec_low(roll_ptr, heap); + } + return nullptr; +} /** Build a previous version of a clustered index record. The caller must hold a latch on the index page of the clustered index record. -@param index_rec clustered index record in the index tree -@param index_mtr mtr which contains the latch to index_rec page - and purge_view @param rec version of a clustered index record @param index clustered index @param offsets rec_get_offsets(rec, index) @@ -2141,14 +2139,13 @@ must hold a latch on the index page of the clustered index record. 
And if we read "after image" of undo log @param undo_block undo log block which was cached during online dml apply or nullptr -@retval true if previous version was built, or if it was an insert -or the table has been rebuilt -@retval false if the previous version is earlier than purge_view, -or being purged, which means that it may have been removed */ -bool +@return error code +@retval DB_SUCCESS if previous version was successfully built, +or if it was an insert or the undo record refers to the table before rebuild +@retval DB_MISSING_HISTORY if the history is missing */ +TRANSACTIONAL_TARGET +dberr_t trx_undo_prev_version_build( - const rec_t *index_rec ATTRIB_USED_ONLY_IN_DEBUG, - mtr_t *index_mtr ATTRIB_USED_ONLY_IN_DEBUG, const rec_t *rec, dict_index_t *index, rec_offs *offsets, @@ -2158,7 +2155,6 @@ trx_undo_prev_version_build( dtuple_t **vrow, ulint v_status) { - trx_undo_rec_t* undo_rec = NULL; dtuple_t* entry; trx_id_t rec_trx_id; ulint type; @@ -2173,11 +2169,7 @@ trx_undo_prev_version_build( byte* buf; ut_ad(!index->table->is_temporary()); - ut_ad(index_mtr->memo_contains_page_flagged(index_rec, - MTR_MEMO_PAGE_S_FIX - | MTR_MEMO_PAGE_X_FIX)); ut_ad(rec_offs_validate(rec, index, offsets)); - ut_a(index->is_primary()); roll_ptr = row_get_rec_roll_ptr(rec, index, offsets); @@ -2185,27 +2177,20 @@ trx_undo_prev_version_build( if (trx_undo_roll_ptr_is_insert(roll_ptr)) { /* The record rec is the first inserted version */ - return(true); + return DB_SUCCESS; } rec_trx_id = row_get_rec_trx_id(rec, index, offsets); ut_ad(!index->table->skip_alter_undo); - if (trx_undo_get_undo_rec( - roll_ptr, heap, rec_trx_id, index->table->name, - &undo_rec)) { - if (v_status & TRX_UNDO_PREV_IN_PURGE) { - /* We are fetching the record being purged */ - undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap); - if (!undo_rec) { - return false; - } - } else { - /* The undo record may already have been purged, - during purge or semi-consistent read. */ - return(false); - } + trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY + ? trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name, + roll_ptr, heap) + : trx_undo_get_undo_rec(rec_trx_id, index->table->name, + roll_ptr, heap); + if (!undo_rec) { + return DB_MISSING_HISTORY; } const byte *ptr = @@ -2216,7 +2201,7 @@ trx_undo_prev_version_build( /* The table should have been rebuilt, but purge has not yet removed the undo log records for the now-dropped old table (table_id). */ - return(true); + return DB_SUCCESS; } ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr, @@ -2264,24 +2249,9 @@ trx_undo_prev_version_build( delete-marked record by trx_id, no transactions need to access the BLOB. */ - /* the row_upd_changes_disowned_external(update) call could be - omitted, but the synchronization on purge_sys.latch is likely - more expensive. */ - - if ((update->info_bits & REC_INFO_DELETED_FLAG) - && row_upd_changes_disowned_external(update)) { - purge_sys.latch.rd_lock(SRW_LOCK_CALL); - - bool missing_extern = purge_sys.changes_visible( - trx_id, index->table->name); - - purge_sys.latch.rd_unlock(); - - if (missing_extern) { - /* treat as a fresh insert, not to - cause assertion error at the caller. */ - return(true); - } + if (update->info_bits & REC_INFO_DELETED_FLAG + && purge_sys.is_purgeable(trx_id)) { + return DB_SUCCESS; } /* We have to set the appropriate extern storage bits in the @@ -2296,8 +2266,8 @@ trx_undo_prev_version_build( following call is safe. 
*/ if (!row_upd_index_replace_new_col_vals(entry, *index, update, heap)) { - ut_a(v_status & TRX_UNDO_PREV_IN_PURGE); - return false; + return (v_status & TRX_UNDO_PREV_IN_PURGE) + ? DB_MISSING_HISTORY : DB_CORRUPTION; } /* Get number of externally stored columns in updated record */ @@ -2394,7 +2364,7 @@ trx_undo_prev_version_build( v_status & TRX_UNDO_PREV_IN_PURGE); } - return(true); + return DB_SUCCESS; } /** Read virtual column value from undo log diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 1113c72fcbb..ee27e8f51c1 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -44,40 +44,6 @@ Created 3/26/1996 Heikki Tuuri /** The transaction system */ trx_sys_t trx_sys; -/** Check whether transaction id is valid. -@param[in] id transaction id to check -@param[in] name table name */ -void -ReadViewBase::check_trx_id_sanity( - trx_id_t id, - const table_name_t& name) -{ - if (id >= trx_sys.get_max_trx_id()) { - - ib::warn() << "A transaction id" - << " in a record of table " - << name - << " is newer than the" - << " system-wide maximum."; - ut_ad(0); - THD *thd = current_thd; - if (thd != NULL) { - char table_name[MAX_FULL_NAME_LEN + 1]; - - innobase_format_name( - table_name, sizeof(table_name), - name.m_name); - - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_SIGNAL_WARN, - "InnoDB: Transaction id" - " in a record of table" - " %s is newer than system-wide" - " maximum.", table_name); - } - } -} - #ifdef UNIV_DEBUG /* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */ uint trx_rseg_n_slots_debug = 0; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 900039d1408..56bdac29694 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -399,7 +399,8 @@ void trx_t::free() autoinc_locks= NULL; } - MEM_NOACCESS(&n_ref, sizeof n_ref); + MEM_NOACCESS(&skip_lock_inheritance_and_n_ref, + sizeof skip_lock_inheritance_and_n_ref); /* do not poison mutex */ MEM_NOACCESS(&id, sizeof id); MEM_NOACCESS(&state, sizeof state); @@ -485,6 +486,7 @@ TRANSACTIONAL_INLINE inline void trx_t::commit_state() /** Release any explicit locks of a committing transaction. 
*/ inline void trx_t::release_locks() { + DEBUG_SYNC_C("trx_t_release_locks_enter"); DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY); DBUG_ASSERT(!is_referenced()); @@ -498,6 +500,7 @@ inline void trx_t::release_locks() } lock.table_locks.clear(); + reset_skip_lock_inheritance(); id= 0; while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables)) { @@ -785,7 +788,7 @@ corrupted: ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id(); } - purge_sys.clone_oldest_view(); + purge_sys.clone_oldest_view<true>(); return DB_SUCCESS; } @@ -1131,8 +1134,8 @@ trx_finalize_for_fts( trx->fts_trx = NULL; } - -extern "C" void thd_decrement_pending_ops(MYSQL_THD); +extern "C" MYSQL_THD thd_increment_pending_ops(MYSQL_THD); +extern "C" void thd_decrement_pending_ops(MYSQL_THD); #include "../log/log0sync.h" @@ -1155,7 +1158,7 @@ static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx) completion_callback cb, *callback= nullptr; if (trx->state != TRX_STATE_PREPARED && !log_sys.is_pmem() && - (cb.m_param= innodb_thd_increment_pending_ops(trx->mysql_thd))) + (cb.m_param= thd_increment_pending_ops(trx->mysql_thd))) { cb.m_callback= (void (*)(void *)) thd_decrement_pending_ops; callback= &cb; @@ -2167,6 +2170,7 @@ trx_set_rw_mode( ut_ad(trx->rsegs.m_redo.rseg != 0); trx_sys.register_rw(trx); + ut_ad(trx->id); /* So that we can see our own changes. */ if (trx->read_view.is_open()) { diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt index 13d8035bdc8..f55d78f0162 100644 --- a/storage/maria/CMakeLists.txt +++ b/storage/maria/CMakeLists.txt @@ -132,7 +132,7 @@ SET(CPACK_RPM_s3-engine_PACKAGE_DESCRIPTION "The S3 storage engine allows one to IF(TARGET s3) MYSQL_ADD_EXECUTABLE(aria_s3_copy aria_s3_copy.cc ${S3_SOURCES} COMPONENT s3-engine) - TARGET_LINK_LIBRARIES(aria_s3_copy aria myisam mysys mysys_ssl curl z) + TARGET_LINK_LIBRARIES(aria_s3_copy aria myisam mysys mysys_ssl ${CURL_LIBRARIES} ${ZLIB_LIBRARY}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/libmarias3) ADD_DEFINITIONS(-DWITH_S3_STORAGE_ENGINE) ENDIF() |
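
The CHECK TABLE loop in storage/innobase/row/row0sel.cc above compares each index record with the previous entry (prev_entry) to detect records in the wrong order and, for unique indexes, duplicate keys, while skipping entries that contain NULL. The following is a small illustrative sketch of that comparison only; the Entry/Field types, check_adjacent() and Problem enum are hypothetical stand-ins for dtuple_t, rec_t and cmp_dtuple_rec_with_match(), not InnoDB's API.

// Sketch of the per-record ordering/duplicate check (hypothetical types).
#include <cstddef>
#include <optional>
#include <vector>

using Field= std::optional<int>;      // std::nullopt models SQL NULL
using Entry= std::vector<Field>;

enum class Problem { NONE, WRONG_ORDER, DUPLICATE_KEY };

Problem check_adjacent(const Entry &prev, const Entry &cur,
                       std::size_t n_user_fields, bool unique_index)
{
  // Compare field by field, counting how many leading fields are equal
  // (the role played by cmp_dtuple_rec_with_match() and matched_fields).
  std::size_t matched= 0;
  int cmp= 0;
  for (std::size_t i= 0; i < prev.size() && cmp == 0; i++)
  {
    if (prev[i] == cur[i]) { matched++; continue; }
    cmp= prev[i] < cur[i] ? -1 : 1;
  }

  if (cmp > 0)
    return Problem::WRONG_ORDER;      // records are not in ascending order

  if (cmp == 0 && unique_index && matched >= n_user_fields)
  {
    // NULL values in unique indexes are not considered duplicates.
    for (std::size_t i= 0; i < n_user_fields; i++)
      if (!prev[i])
        return Problem::NONE;
    return Problem::DUPLICATE_KEY;
  }
  return Problem::NONE;
}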
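
The storage/innobase/row/row0vers.cc and trx0rec.cc hunks above change trx_undo_prev_version_build() from a bool to a dberr_t return and make row_vers_build_for_consistent_read() treat a DB_TRX_ID that is at or above the system-wide maximum as corruption. Below is a compact, stand-alone sketch of that version-walk loop under those rules; Version, Snapshot and Err are hypothetical types standing in for InnoDB's records, read view and error codes.

// Sketch of the consistent-read version walk (hypothetical types).
#include <cstdint>
#include <memory>

enum class Err { SUCCESS, MISSING_HISTORY, CORRUPTION };

struct Version
{
  std::uint64_t trx_id;            // DB_TRX_ID that wrote this version
  std::shared_ptr<Version> prev;   // older version, if the undo log still has it
};

struct Snapshot
{
  std::uint64_t low_limit;         // changes with trx_id >= this are not visible
  bool changes_visible(std::uint64_t trx_id) const { return trx_id < low_limit; }
};

// Walk from the newest version (assumed not visible to the snapshot, as in
// the InnoDB precondition) towards older ones until a visible version is
// found.  Missing history and impossible trx_ids are reported through the
// return code instead of a bare bool, mirroring the patched functions.
Err build_for_consistent_read(std::shared_ptr<Version> version,
                              const Snapshot &snap,
                              std::uint64_t max_trx_id,
                              std::shared_ptr<Version> &out)
{
  for (;;)
  {
    std::shared_ptr<Version> prev= version->prev;  // trx_undo_prev_version_build()
    if (!prev)
      return Err::MISSING_HISTORY;                 // history already purged
    if (snap.changes_visible(prev->trx_id))
    {
      out= prev;                                   // this version is in our snapshot
      return Err::SUCCESS;
    }
    if (prev->trx_id >= snap.low_limit && prev->trx_id >= max_trx_id)
      return Err::CORRUPTION;                      // DB_TRX_ID exceeds the maximum
    version= prev;                                 // keep walking
  }
}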
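
The purge_sys_t::is_purgeable() hunk in storage/innobase/trx/trx0purge.cc above replaces explicit purge_sys.latch handling in callers with a single query that first attempts a hardware-transactional (elided) read of the purge view and only falls back to taking the read latch when that fails. The following minimal, stand-alone sketch shows that elision pattern, assuming an x86 CPU with RTM and a build compiled with -mrtm; SnapshotView, PurgeStateSketch and the write_locked flag are hypothetical stand-ins, not InnoDB's classes.

// Minimal sketch of the elided-read pattern (hypothetical types).
// Build with: g++ -std=c++17 -mrtm sketch.cc
#include <immintrin.h>   // _xbegin/_xend/_xabort (RTM intrinsics)
#include <atomic>
#include <shared_mutex>

struct SnapshotView
{
  unsigned long long low_limit= 100;
  // Simplified stand-in for ReadView::changes_visible(): changes of
  // transactions below the limit are visible, i.e. already purgeable.
  bool changes_visible(unsigned long long trx_id) const
  { return trx_id < low_limit; }
};

struct PurgeStateSketch
{
  mutable std::shared_mutex latch;          // stand-in for purge_sys.latch
  std::atomic<bool> write_locked{false};    // stand-in for latch.is_write_locked()
  SnapshotView view;

  bool is_purgeable(unsigned long long trx_id) const
  {
    // Fast path: read the view inside a hardware transaction without
    // acquiring the latch.  Reading write_locked puts it in the read set,
    // so a concurrent writer flipping it aborts this transaction.
    if (_xbegin() == _XBEGIN_STARTED)
    {
      if (!write_locked.load(std::memory_order_relaxed))
      {
        bool purgeable= view.changes_visible(trx_id);
        _xend();
        return purgeable;
      }
      _xabort(0);                           // writer active: fall back
    }
    // Fallback (transaction aborted, or no RTM): conventional shared latch.
    std::shared_lock<std::shared_mutex> g(latch);
    return view.changes_visible(trx_id);
  }
};

The fallback path is still required: the hardware transaction can abort for many reasons (contention, capacity, unsupported CPU), so the latch-based branch must produce the same answer on its own.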
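
In the storage/innobase/trx/trx0rec.cc hunk, trx_undo_get_rec_if_purgeable() and trx_undo_get_undo_rec() differ only in which snapshot they consult (purge_sys.view versus the new end_view), and both read it through a guard object whose lifetime bounds the latch. Here is a rough sketch of that RAII guard idiom with hypothetical names (ViewGuard, Snapshot) rather than the real purge_sys_t::view_guard / end_view_guard classes.

// Sketch of the "view guard" idiom (hypothetical names, not InnoDB's API).
#include <shared_mutex>

struct Snapshot
{
  unsigned long long low_limit= 0;
  bool changes_visible(unsigned long long trx_id) const
  { return trx_id < low_limit; }
};

class ViewGuard
{
  std::shared_mutex &latch_;
  const Snapshot &view_;
public:
  // The constructor takes the shared latch and the destructor releases it,
  // so the snapshot can only ever be read while the latch is held.
  ViewGuard(std::shared_mutex &latch, const Snapshot &view)
    : latch_(latch), view_(view) { latch_.lock_shared(); }
  ~ViewGuard() { latch_.unlock_shared(); }
  ViewGuard(const ViewGuard&)= delete;
  ViewGuard &operator=(const ViewGuard&)= delete;
  const Snapshot &view() const { return view_; }
};

// Usage mirroring trx_undo_get_undo_rec(): the undo log record is only worth
// fetching if the change is not yet visible to the guarded snapshot, that is,
// if its history cannot have been purged yet.
bool undo_record_still_needed(std::shared_mutex &latch, const Snapshot &snap,
                              unsigned long long trx_id)
{
  ViewGuard check(latch, snap);
  return !check.view().changes_visible(trx_id);
}

Scoping the guard to a small block, as the patched helpers do, keeps the latch hold time to the single visibility check.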